220 | 745M | } Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi8ELb0EEEmPKh _ZN5doris11UnpackValueILi0ELi7ELb0EEEmPKh Line | Count | Source | 175 | 2.36k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.36k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 0 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 0 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 0 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 0 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 0 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 0 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | |
| 185 | 0 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 0 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 0 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 0 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 0 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 0 | constexpr bool READ_32_BITS = | 202 | 0 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | |
| 204 | 0 | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 0 | } |
_ZN5doris11UnpackValueILi0ELi6ELb0EEEmPKh Line | Count | Source | 175 | 2.36k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.36k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 0 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 0 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 0 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 0 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 0 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 0 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | |
| 185 | 0 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 0 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 0 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 0 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 0 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 0 | constexpr bool READ_32_BITS = | 202 | 0 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | |
| 204 | 0 | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 0 | } |
_ZN5doris11UnpackValueILi0ELi5ELb0EEEmPKh Line | Count | Source | 175 | 2.36k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.36k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 0 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 0 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 0 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 0 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 0 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 0 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | |
| 185 | 0 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 0 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 0 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 0 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 0 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 0 | constexpr bool READ_32_BITS = | 202 | 0 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | |
| 204 | 0 | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 0 | } |
_ZN5doris11UnpackValueILi0ELi4ELb0EEEmPKh Line | Count | Source | 175 | 2.36k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.36k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 0 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 0 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 0 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 0 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 0 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 0 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | |
| 185 | 0 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 0 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 0 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 0 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 0 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 0 | constexpr bool READ_32_BITS = | 202 | 0 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | |
| 204 | 0 | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 0 | } |
_ZN5doris11UnpackValueILi0ELi3ELb0EEEmPKh Line | Count | Source | 175 | 2.36k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.36k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 0 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 0 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 0 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 0 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 0 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 0 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | |
| 185 | 0 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 0 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 0 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 0 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 0 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 0 | constexpr bool READ_32_BITS = | 202 | 0 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | |
| 204 | 0 | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 0 | } |
_ZN5doris11UnpackValueILi0ELi2ELb0EEEmPKh Line | Count | Source | 175 | 2.36k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.36k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 0 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 0 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 0 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 0 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 0 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 0 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | |
| 185 | 0 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 0 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 0 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 0 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 0 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 0 | constexpr bool READ_32_BITS = | 202 | 0 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | |
| 204 | 0 | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 0 | } |
_ZN5doris11UnpackValueILi0ELi1ELb0EEEmPKh Line | Count | Source | 175 | 2.36k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.36k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 0 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 0 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 0 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 0 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 0 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 0 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | |
| 185 | 0 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 0 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 0 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 0 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 0 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 0 | constexpr bool READ_32_BITS = | 202 | 0 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | |
| 204 | 0 | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 0 | } |
_ZN5doris11UnpackValueILi0ELi0ELb0EEEmPKh Line | Count | Source | 175 | 2.36k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.36k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 0 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 0 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 0 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 0 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 0 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 0 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | |
| 185 | 0 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 0 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 0 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 0 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 0 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 0 | constexpr bool READ_32_BITS = | 202 | 0 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | |
| 204 | 0 | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 0 | } |
_ZN5doris11UnpackValueILi1ELi0ELb1EEEmPKh Line | Count | Source | 175 | 434k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 434k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 434k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 434k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 434k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 434k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 434k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 434k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 434k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 434k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 434k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 434k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 434k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 434k | constexpr bool READ_32_BITS = | 202 | 434k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 434k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 142 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 142 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 142 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 142 | return word & mask; | 220 | 434k | } |
_ZN5doris11UnpackValueILi1ELi1ELb1EEEmPKh Line | Count | Source | 175 | 434k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 434k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 434k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 434k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 434k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 434k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 434k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 434k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 434k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 434k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 434k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 434k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 434k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 434k | constexpr bool READ_32_BITS = | 202 | 434k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 434k | if (READ_32_BITS) { | 205 | 434k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 434k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 434k | return word & mask; | 208 | 434k | } | 209 | | | 210 | 64 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 64 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 64 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 64 | return word & mask; | 220 | 434k | } |
_ZN5doris11UnpackValueILi1ELi2ELb1EEEmPKh Line | Count | Source | 175 | 434k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 434k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 434k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 434k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 434k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 434k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 434k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 434k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 434k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 434k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 434k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 434k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 434k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 434k | constexpr bool READ_32_BITS = | 202 | 434k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 434k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 26 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 26 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 26 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 26 | return word & mask; | 220 | 434k | } |
_ZN5doris11UnpackValueILi1ELi3ELb1EEEmPKh Line | Count | Source | 175 | 434k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 434k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 434k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 434k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 434k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 434k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 434k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 434k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 434k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 434k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 434k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 434k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 434k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 434k | constexpr bool READ_32_BITS = | 202 | 434k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 434k | if (READ_32_BITS) { | 205 | 434k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 434k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 434k | return word & mask; | 208 | 434k | } | 209 | | | 210 | 12 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 12 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 12 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 12 | return word & mask; | 220 | 434k | } |
_ZN5doris11UnpackValueILi1ELi4ELb1EEEmPKh Line | Count | Source | 175 | 434k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 434k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 434k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 434k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 434k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 434k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 434k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 434k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 434k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 434k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 434k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 434k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 434k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 434k | constexpr bool READ_32_BITS = | 202 | 434k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 434k | if (READ_32_BITS) { | 205 | 434k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 434k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 434k | return word & mask; | 208 | 434k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 434k | } |
_ZN5doris11UnpackValueILi1ELi5ELb1EEEmPKh Line | Count | Source | 175 | 434k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 434k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 434k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 434k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 434k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 434k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 434k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 434k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 434k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 434k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 434k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 434k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 434k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 434k | constexpr bool READ_32_BITS = | 202 | 434k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 434k | if (READ_32_BITS) { | 205 | 434k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 434k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 434k | return word & mask; | 208 | 434k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 434k | } |
_ZN5doris11UnpackValueILi1ELi6ELb1EEEmPKh Line | Count | Source | 175 | 434k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 434k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 434k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 434k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 434k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 434k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 434k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 434k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 434k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 434k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 434k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 434k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 434k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 434k | constexpr bool READ_32_BITS = | 202 | 434k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 434k | if (READ_32_BITS) { | 205 | 434k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 434k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 434k | return word & mask; | 208 | 434k | } | 209 | | | 210 | 6 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 6 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 6 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 6 | return word & mask; | 220 | 434k | } |
_ZN5doris11UnpackValueILi1ELi7ELb1EEEmPKh Line | Count | Source | 175 | 434k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 434k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 434k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 434k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 434k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 434k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 434k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 434k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 434k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 434k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 434k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 434k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 434k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 434k | constexpr bool READ_32_BITS = | 202 | 434k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 434k | if (READ_32_BITS) { | 205 | 434k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 434k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 434k | return word & mask; | 208 | 434k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 434k | } |
_ZN5doris11UnpackValueILi1ELi8ELb1EEEmPKh Line | Count | Source | 175 | 434k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 434k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 434k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 434k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 434k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 434k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 434k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 434k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 434k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 434k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 434k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 434k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 434k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 434k | constexpr bool READ_32_BITS = | 202 | 434k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 434k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 180 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 180 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 180 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 180 | return word & mask; | 220 | 434k | } |
_ZN5doris11UnpackValueILi1ELi9ELb1EEEmPKh Line | Count | Source | 175 | 434k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 434k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 434k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 434k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 434k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 434k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 434k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 434k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 434k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 434k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 434k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 434k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 434k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 434k | constexpr bool READ_32_BITS = | 202 | 434k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 434k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 240 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 240 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 240 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 240 | return word & mask; | 220 | 434k | } |
_ZN5doris11UnpackValueILi1ELi10ELb1EEEmPKh Line | Count | Source | 175 | 434k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 434k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 434k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 434k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 434k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 434k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 434k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 434k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 434k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 434k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 434k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 434k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 434k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 434k | constexpr bool READ_32_BITS = | 202 | 434k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 434k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 276 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 276 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 276 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 276 | return word & mask; | 220 | 434k | } |
_ZN5doris11UnpackValueILi1ELi11ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 170 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 170 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 170 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 170 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi12ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 176 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 176 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 176 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 176 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi13ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 114 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 114 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 114 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 114 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi14ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 98 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 98 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 98 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 98 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi15ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 68 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 68 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 68 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 68 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi16ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 82 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi17ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 108 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 108 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 108 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 108 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi18ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 132 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 132 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 132 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 132 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi19ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 82 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi20ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 118 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 118 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 118 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 118 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi21ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 100 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 100 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 100 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 100 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi22ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 86 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 86 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 86 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 86 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi23ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 58 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 58 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 58 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 58 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi24ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 26 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 26 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 26 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 26 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi25ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi26ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi27ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 8 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 8 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 8 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 8 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi28ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi29ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 16 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 16 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 16 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 16 | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi30ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 433k | } |
_ZN5doris11UnpackValueILi1ELi31ELb1EEEmPKh Line | Count | Source | 175 | 433k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 433k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 433k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 433k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 433k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 433k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 433k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 433k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 433k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 433k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 433k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 433k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 433k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 433k | constexpr bool READ_32_BITS = | 202 | 433k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 433k | if (READ_32_BITS) { | 205 | 433k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 433k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 433k | return word & mask; | 208 | 433k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 433k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi1ELi23ELb0EEEmPKh Line | Count | Source | 175 | 199k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 199k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 199k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 199k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 199k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 199k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 199k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 199k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 199k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 199k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 199k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 199k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 199k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 199k | constexpr bool READ_32_BITS = | 202 | 199k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 199k | if (READ_32_BITS) { | 205 | 199k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 199k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 199k | return word & mask; | 208 | 199k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 199k | } |
_ZN5doris11UnpackValueILi1ELi22ELb0EEEmPKh Line | Count | Source | 175 | 199k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 199k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 199k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 199k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 199k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 199k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 199k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 199k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 199k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 199k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 199k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 199k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 199k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 199k | constexpr bool READ_32_BITS = | 202 | 199k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 199k | if (READ_32_BITS) { | 205 | 199k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 199k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 199k | return word & mask; | 208 | 199k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 199k | } |
_ZN5doris11UnpackValueILi1ELi21ELb0EEEmPKh Line | Count | Source | 175 | 199k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 199k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 199k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 199k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 199k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 199k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 199k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 199k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 199k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 199k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 199k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 199k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 199k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 199k | constexpr bool READ_32_BITS = | 202 | 199k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 199k | if (READ_32_BITS) { | 205 | 199k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 199k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 199k | return word & mask; | 208 | 199k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 199k | } |
_ZN5doris11UnpackValueILi1ELi20ELb0EEEmPKh Line | Count | Source | 175 | 199k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 199k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 199k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 199k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 199k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 199k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 199k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 199k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 199k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 199k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 199k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 199k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 199k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 199k | constexpr bool READ_32_BITS = | 202 | 199k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 199k | if (READ_32_BITS) { | 205 | 199k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 199k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 199k | return word & mask; | 208 | 199k | } | 209 | | | 210 | 10 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 10 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 10 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 10 | return word & mask; | 220 | 199k | } |
_ZN5doris11UnpackValueILi1ELi19ELb0EEEmPKh Line | Count | Source | 175 | 199k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 199k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 199k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 199k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 199k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 199k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 199k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 199k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 199k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 199k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 199k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 199k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 199k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 199k | constexpr bool READ_32_BITS = | 202 | 199k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 199k | if (READ_32_BITS) { | 205 | 199k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 199k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 199k | return word & mask; | 208 | 199k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 199k | } |
_ZN5doris11UnpackValueILi1ELi18ELb0EEEmPKh Line | Count | Source | 175 | 199k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 199k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 199k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 199k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 199k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 199k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 199k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 199k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 199k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 199k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 199k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 199k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 199k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 199k | constexpr bool READ_32_BITS = | 202 | 199k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 199k | if (READ_32_BITS) { | 205 | 199k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 199k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 199k | return word & mask; | 208 | 199k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 199k | } |
_ZN5doris11UnpackValueILi1ELi17ELb0EEEmPKh Line | Count | Source | 175 | 199k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 199k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 199k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 199k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 199k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 199k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 199k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 199k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 199k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 199k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 199k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 199k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 199k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 199k | constexpr bool READ_32_BITS = | 202 | 199k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 199k | if (READ_32_BITS) { | 205 | 199k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 199k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 199k | return word & mask; | 208 | 199k | } | 209 | | | 210 | 6 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 6 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 6 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 6 | return word & mask; | 220 | 199k | } |
_ZN5doris11UnpackValueILi1ELi16ELb0EEEmPKh Line | Count | Source | 175 | 199k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 199k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 199k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 199k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 199k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 199k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 199k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 199k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 199k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 199k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 199k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 199k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 199k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 199k | constexpr bool READ_32_BITS = | 202 | 199k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 199k | if (READ_32_BITS) { | 205 | 199k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 199k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 199k | return word & mask; | 208 | 199k | } | 209 | | | 210 | 6 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 6 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 6 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 6 | return word & mask; | 220 | 199k | } |
_ZN5doris11UnpackValueILi1ELi15ELb0EEEmPKh Line | Count | Source | 175 | 493k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 493k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 493k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 493k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 493k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 493k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 493k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 493k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 493k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 493k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 493k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 493k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 493k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 493k | constexpr bool READ_32_BITS = | 202 | 493k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 493k | if (READ_32_BITS) { | 205 | 493k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 493k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 493k | return word & mask; | 208 | 493k | } | 209 | | | 210 | 8 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 8 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 8 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 8 | return word & mask; | 220 | 493k | } |
_ZN5doris11UnpackValueILi1ELi14ELb0EEEmPKh Line | Count | Source | 175 | 493k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 493k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 493k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 493k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 493k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 493k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 493k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 493k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 493k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 493k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 493k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 493k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 493k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 493k | constexpr bool READ_32_BITS = | 202 | 493k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 493k | if (READ_32_BITS) { | 205 | 493k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 493k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 493k | return word & mask; | 208 | 493k | } | 209 | | | 210 | 8 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 8 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 8 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 8 | return word & mask; | 220 | 493k | } |
_ZN5doris11UnpackValueILi1ELi13ELb0EEEmPKh Line | Count | Source | 175 | 493k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 493k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 493k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 493k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 493k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 493k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 493k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 493k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 493k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 493k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 493k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 493k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 493k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 493k | constexpr bool READ_32_BITS = | 202 | 493k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 493k | if (READ_32_BITS) { | 205 | 493k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 493k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 493k | return word & mask; | 208 | 493k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 493k | } |
_ZN5doris11UnpackValueILi1ELi12ELb0EEEmPKh Line | Count | Source | 175 | 493k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 493k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 493k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 493k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 493k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 493k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 493k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 493k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 493k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 493k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 493k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 493k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 493k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 493k | constexpr bool READ_32_BITS = | 202 | 493k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 493k | if (READ_32_BITS) { | 205 | 493k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 493k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 493k | return word & mask; | 208 | 493k | } | 209 | | | 210 | 50 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 50 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 50 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 50 | return word & mask; | 220 | 493k | } |
_ZN5doris11UnpackValueILi1ELi11ELb0EEEmPKh Line | Count | Source | 175 | 493k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 493k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 493k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 493k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 493k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 493k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 493k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 493k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 493k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 493k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 493k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 493k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 493k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 493k | constexpr bool READ_32_BITS = | 202 | 493k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 493k | if (READ_32_BITS) { | 205 | 493k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 493k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 493k | return word & mask; | 208 | 493k | } | 209 | | | 210 | 14 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 14 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 14 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 14 | return word & mask; | 220 | 493k | } |
_ZN5doris11UnpackValueILi1ELi10ELb0EEEmPKh Line | Count | Source | 175 | 493k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 493k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 493k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 493k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 493k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 493k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 493k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 493k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 493k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 493k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 493k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 493k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 493k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 493k | constexpr bool READ_32_BITS = | 202 | 493k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 493k | if (READ_32_BITS) { | 205 | 493k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 493k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 493k | return word & mask; | 208 | 493k | } | 209 | | | 210 | 6 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 6 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 6 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 6 | return word & mask; | 220 | 493k | } |
_ZN5doris11UnpackValueILi1ELi9ELb0EEEmPKh Line | Count | Source | 175 | 493k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 493k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 493k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 493k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 493k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 493k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 493k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 493k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 493k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 493k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 493k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 493k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 493k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 493k | constexpr bool READ_32_BITS = | 202 | 493k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 493k | if (READ_32_BITS) { | 205 | 493k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 493k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 493k | return word & mask; | 208 | 493k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 493k | } |
_ZN5doris11UnpackValueILi1ELi8ELb0EEEmPKh Line | Count | Source | 175 | 493k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 493k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 493k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 493k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 493k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 493k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 493k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 493k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 493k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 493k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 493k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 493k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 493k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 493k | constexpr bool READ_32_BITS = | 202 | 493k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 493k | if (READ_32_BITS) { | 205 | 493k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 493k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 493k | return word & mask; | 208 | 493k | } | 209 | | | 210 | 28 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 28 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 28 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 28 | return word & mask; | 220 | 493k | } |
_ZN5doris11UnpackValueILi1ELi7ELb0EEEmPKh Line | Count | Source | 175 | 968k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 968k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 968k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 968k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 968k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 968k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 968k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 968k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 968k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 968k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 968k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 968k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 968k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 968k | constexpr bool READ_32_BITS = | 202 | 968k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 968k | if (READ_32_BITS) { | 205 | 967k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 967k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 967k | return word & mask; | 208 | 967k | } | 209 | | | 210 | 146 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 146 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 146 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 146 | return word & mask; | 220 | 968k | } |
_ZN5doris11UnpackValueILi1ELi6ELb0EEEmPKh Line | Count | Source | 175 | 967k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 967k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 967k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 967k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 967k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 967k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 967k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 967k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 967k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 967k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 967k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 967k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 967k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 967k | constexpr bool READ_32_BITS = | 202 | 967k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 967k | if (READ_32_BITS) { | 205 | 967k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 967k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 967k | return word & mask; | 208 | 967k | } | 209 | | | 210 | 102 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 102 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 102 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 102 | return word & mask; | 220 | 967k | } |
_ZN5doris11UnpackValueILi1ELi5ELb0EEEmPKh Line | Count | Source | 175 | 968k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 968k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 968k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 968k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 968k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 968k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 968k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 968k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 968k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 968k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 968k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 968k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 968k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 968k | constexpr bool READ_32_BITS = | 202 | 968k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 968k | if (READ_32_BITS) { | 205 | 967k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 967k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 967k | return word & mask; | 208 | 967k | } | 209 | | | 210 | 78 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 78 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 78 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 78 | return word & mask; | 220 | 968k | } |
_ZN5doris11UnpackValueILi1ELi4ELb0EEEmPKh Line | Count | Source | 175 | 967k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 967k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 967k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 967k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 967k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 967k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 967k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 967k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 967k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 967k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 967k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 967k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 967k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 967k | constexpr bool READ_32_BITS = | 202 | 967k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 967k | if (READ_32_BITS) { | 205 | 967k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 967k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 967k | return word & mask; | 208 | 967k | } | 209 | | | 210 | 34 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 34 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 34 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 34 | return word & mask; | 220 | 967k | } |
_ZN5doris11UnpackValueILi1ELi3ELb0EEEmPKh Line | Count | Source | 175 | 967k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 967k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 967k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 967k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 967k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 967k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 967k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 967k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 967k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 967k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 967k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 967k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 967k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 967k | constexpr bool READ_32_BITS = | 202 | 967k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 967k | if (READ_32_BITS) { | 205 | 967k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 967k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 967k | return word & mask; | 208 | 967k | } | 209 | | | 210 | 28 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 28 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 28 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 28 | return word & mask; | 220 | 967k | } |
_ZN5doris11UnpackValueILi1ELi2ELb0EEEmPKh Line | Count | Source | 175 | 967k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 967k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 967k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 967k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 967k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 967k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 967k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 967k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 967k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 967k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 967k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 967k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 967k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 967k | constexpr bool READ_32_BITS = | 202 | 967k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 967k | if (READ_32_BITS) { | 205 | 967k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 967k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 967k | return word & mask; | 208 | 967k | } | 209 | | | 210 | 30 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 30 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 30 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 30 | return word & mask; | 220 | 967k | } |
_ZN5doris11UnpackValueILi1ELi1ELb0EEEmPKh Line | Count | Source | 175 | 967k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 967k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 967k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 967k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 967k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 967k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 967k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 967k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 967k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 967k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 967k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 967k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 967k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 967k | constexpr bool READ_32_BITS = | 202 | 967k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 967k | if (READ_32_BITS) { | 205 | 967k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 967k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 967k | return word & mask; | 208 | 967k | } | 209 | | | 210 | 26 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 26 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 26 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 26 | return word & mask; | 220 | 967k | } |
_ZN5doris11UnpackValueILi1ELi0ELb0EEEmPKh Line | Count | Source | 175 | 967k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 967k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 967k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 967k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 967k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 967k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 967k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 967k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 967k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 967k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 967k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 967k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 967k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 967k | constexpr bool READ_32_BITS = | 202 | 967k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 967k | if (READ_32_BITS) { | 205 | 967k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 967k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 967k | return word & mask; | 208 | 967k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 967k | } |
_ZN5doris11UnpackValueILi2ELi0ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 824 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 824 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 824 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 824 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi1ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 138 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 138 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 138 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 138 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi2ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 1.35k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.35k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.35k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.35k | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi3ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 1.38k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.38k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.38k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.38k | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi4ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 2.16k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.16k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.16k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.16k | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi5ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 1.87k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.87k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.87k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.87k | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi6ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 494 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 494 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 494 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 494 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi7ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi8ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 1.17k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.17k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.17k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.17k | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi9ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 1.36k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.36k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.36k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.36k | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi10ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 1.97k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.97k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.97k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.97k | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi11ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 1.70k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.70k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.70k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.70k | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi12ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 1.78k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.78k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.78k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.78k | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi13ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 1.30k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.30k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.30k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.30k | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi14ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 1.10k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.10k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.10k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.10k | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi15ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 1.20k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.20k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.20k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.20k | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi16ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi17ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi18ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 524 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 524 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 524 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 524 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi19ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 548 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 548 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 548 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 548 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi20ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 714 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 714 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 714 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 714 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi21ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 818 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 818 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 818 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 818 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi22ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 1.22k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.22k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.22k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.22k | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi23ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 106 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 106 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 106 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 106 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi24ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 52 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 52 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 52 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 52 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi25ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 48 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 48 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 48 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 48 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi26ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi27ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 782 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 782 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 782 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 782 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi28ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 774 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 774 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 774 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 774 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi29ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 358 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 358 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 358 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 358 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi30ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 664 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 664 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 664 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 664 | return word & mask; | 220 | 1.70M | } |
_ZN5doris11UnpackValueILi2ELi31ELb1EEEmPKh Line | Count | Source | 175 | 1.70M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.70M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.70M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.70M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.70M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.70M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.70M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.70M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.70M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.70M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.70M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.70M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.70M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.70M | constexpr bool READ_32_BITS = | 202 | 1.70M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.70M | if (READ_32_BITS) { | 205 | 1.70M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.70M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.70M | return word & mask; | 208 | 1.70M | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 1.70M | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi2ELi23ELb0EEEmPKh Line | Count | Source | 175 | 213k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 213k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 213k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 213k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 213k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 213k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 213k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 213k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 213k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 213k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 213k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 213k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 213k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 213k | constexpr bool READ_32_BITS = | 202 | 213k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 213k | if (READ_32_BITS) { | 205 | 213k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 213k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 213k | return word & mask; | 208 | 213k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 213k | } |
_ZN5doris11UnpackValueILi2ELi22ELb0EEEmPKh Line | Count | Source | 175 | 213k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 213k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 213k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 213k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 213k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 213k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 213k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 213k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 213k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 213k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 213k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 213k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 213k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 213k | constexpr bool READ_32_BITS = | 202 | 213k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 213k | if (READ_32_BITS) { | 205 | 213k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 213k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 213k | return word & mask; | 208 | 213k | } | 209 | | | 210 | 20 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 20 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 20 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 20 | return word & mask; | 220 | 213k | } |
_ZN5doris11UnpackValueILi2ELi21ELb0EEEmPKh Line | Count | Source | 175 | 213k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 213k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 213k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 213k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 213k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 213k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 213k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 213k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 213k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 213k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 213k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 213k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 213k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 213k | constexpr bool READ_32_BITS = | 202 | 213k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 213k | if (READ_32_BITS) { | 205 | 213k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 213k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 213k | return word & mask; | 208 | 213k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 213k | } |
_ZN5doris11UnpackValueILi2ELi20ELb0EEEmPKh Line | Count | Source | 175 | 213k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 213k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 213k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 213k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 213k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 213k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 213k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 213k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 213k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 213k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 213k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 213k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 213k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 213k | constexpr bool READ_32_BITS = | 202 | 213k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 213k | if (READ_32_BITS) { | 205 | 213k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 213k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 213k | return word & mask; | 208 | 213k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 213k | } |
_ZN5doris11UnpackValueILi2ELi19ELb0EEEmPKh Line | Count | Source | 175 | 213k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 213k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 213k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 213k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 213k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 213k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 213k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 213k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 213k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 213k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 213k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 213k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 213k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 213k | constexpr bool READ_32_BITS = | 202 | 213k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 213k | if (READ_32_BITS) { | 205 | 213k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 213k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 213k | return word & mask; | 208 | 213k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 213k | } |
_ZN5doris11UnpackValueILi2ELi18ELb0EEEmPKh Line | Count | Source | 175 | 213k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 213k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 213k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 213k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 213k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 213k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 213k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 213k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 213k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 213k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 213k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 213k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 213k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 213k | constexpr bool READ_32_BITS = | 202 | 213k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 213k | if (READ_32_BITS) { | 205 | 213k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 213k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 213k | return word & mask; | 208 | 213k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 213k | } |
_ZN5doris11UnpackValueILi2ELi17ELb0EEEmPKh Line | Count | Source | 175 | 213k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 213k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 213k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 213k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 213k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 213k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 213k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 213k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 213k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 213k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 213k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 213k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 213k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 213k | constexpr bool READ_32_BITS = | 202 | 213k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 213k | if (READ_32_BITS) { | 205 | 213k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 213k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 213k | return word & mask; | 208 | 213k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 213k | } |
_ZN5doris11UnpackValueILi2ELi16ELb0EEEmPKh Line | Count | Source | 175 | 213k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 213k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 213k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 213k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 213k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 213k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 213k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 213k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 213k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 213k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 213k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 213k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 213k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 213k | constexpr bool READ_32_BITS = | 202 | 213k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 213k | if (READ_32_BITS) { | 205 | 213k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 213k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 213k | return word & mask; | 208 | 213k | } | 209 | | | 210 | 6 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 6 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 6 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 6 | return word & mask; | 220 | 213k | } |
_ZN5doris11UnpackValueILi2ELi15ELb0EEEmPKh Line | Count | Source | 175 | 398k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 398k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 398k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 398k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 398k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 398k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 398k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 398k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 398k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 398k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 398k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 398k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 398k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 398k | constexpr bool READ_32_BITS = | 202 | 398k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 398k | if (READ_32_BITS) { | 205 | 398k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 398k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 398k | return word & mask; | 208 | 398k | } | 209 | | | 210 | 212 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 212 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 212 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 212 | return word & mask; | 220 | 398k | } |
_ZN5doris11UnpackValueILi2ELi14ELb0EEEmPKh Line | Count | Source | 175 | 398k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 398k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 398k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 398k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 398k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 398k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 398k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 398k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 398k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 398k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 398k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 398k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 398k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 398k | constexpr bool READ_32_BITS = | 202 | 398k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 398k | if (READ_32_BITS) { | 205 | 398k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 398k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 398k | return word & mask; | 208 | 398k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 398k | } |
_ZN5doris11UnpackValueILi2ELi13ELb0EEEmPKh Line | Count | Source | 175 | 398k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 398k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 398k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 398k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 398k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 398k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 398k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 398k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 398k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 398k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 398k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 398k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 398k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 398k | constexpr bool READ_32_BITS = | 202 | 398k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 398k | if (READ_32_BITS) { | 205 | 398k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 398k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 398k | return word & mask; | 208 | 398k | } | 209 | | | 210 | 44 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 44 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 44 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 44 | return word & mask; | 220 | 398k | } |
_ZN5doris11UnpackValueILi2ELi12ELb0EEEmPKh Line | Count | Source | 175 | 398k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 398k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 398k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 398k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 398k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 398k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 398k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 398k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 398k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 398k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 398k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 398k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 398k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 398k | constexpr bool READ_32_BITS = | 202 | 398k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 398k | if (READ_32_BITS) { | 205 | 398k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 398k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 398k | return word & mask; | 208 | 398k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 398k | } |
_ZN5doris11UnpackValueILi2ELi11ELb0EEEmPKh Line | Count | Source | 175 | 398k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 398k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 398k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 398k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 398k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 398k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 398k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 398k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 398k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 398k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 398k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 398k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 398k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 398k | constexpr bool READ_32_BITS = | 202 | 398k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 398k | if (READ_32_BITS) { | 205 | 398k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 398k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 398k | return word & mask; | 208 | 398k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 398k | } |
_ZN5doris11UnpackValueILi2ELi10ELb0EEEmPKh Line | Count | Source | 175 | 398k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 398k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 398k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 398k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 398k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 398k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 398k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 398k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 398k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 398k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 398k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 398k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 398k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 398k | constexpr bool READ_32_BITS = | 202 | 398k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 398k | if (READ_32_BITS) { | 205 | 398k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 398k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 398k | return word & mask; | 208 | 398k | } | 209 | | | 210 | 12 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 12 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 12 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 12 | return word & mask; | 220 | 398k | } |
_ZN5doris11UnpackValueILi2ELi9ELb0EEEmPKh Line | Count | Source | 175 | 398k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 398k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 398k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 398k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 398k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 398k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 398k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 398k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 398k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 398k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 398k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 398k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 398k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 398k | constexpr bool READ_32_BITS = | 202 | 398k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 398k | if (READ_32_BITS) { | 205 | 398k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 398k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 398k | return word & mask; | 208 | 398k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 398k | } |
_ZN5doris11UnpackValueILi2ELi8ELb0EEEmPKh Line | Count | Source | 175 | 398k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 398k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 398k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 398k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 398k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 398k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 398k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 398k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 398k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 398k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 398k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 398k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 398k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 398k | constexpr bool READ_32_BITS = | 202 | 398k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 398k | if (READ_32_BITS) { | 205 | 398k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 398k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 398k | return word & mask; | 208 | 398k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 398k | } |
_ZN5doris11UnpackValueILi2ELi7ELb0EEEmPKh Line | Count | Source | 175 | 647k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 647k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 647k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 647k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 647k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 647k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 647k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 647k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 647k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 647k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 647k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 647k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 647k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 647k | constexpr bool READ_32_BITS = | 202 | 647k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 647k | if (READ_32_BITS) { | 205 | 647k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 647k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 647k | return word & mask; | 208 | 647k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 647k | } |
_ZN5doris11UnpackValueILi2ELi6ELb0EEEmPKh Line | Count | Source | 175 | 647k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 647k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 647k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 647k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 647k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 647k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 647k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 647k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 647k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 647k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 647k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 647k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 647k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 647k | constexpr bool READ_32_BITS = | 202 | 647k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 647k | if (READ_32_BITS) { | 205 | 647k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 647k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 647k | return word & mask; | 208 | 647k | } | 209 | | | 210 | 34 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 34 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 34 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 34 | return word & mask; | 220 | 647k | } |
_ZN5doris11UnpackValueILi2ELi5ELb0EEEmPKh Line | Count | Source | 175 | 647k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 647k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 647k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 647k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 647k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 647k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 647k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 647k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 647k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 647k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 647k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 647k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 647k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 647k | constexpr bool READ_32_BITS = | 202 | 647k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 647k | if (READ_32_BITS) { | 205 | 647k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 647k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 647k | return word & mask; | 208 | 647k | } | 209 | | | 210 | 18 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18 | return word & mask; | 220 | 647k | } |
_ZN5doris11UnpackValueILi2ELi4ELb0EEEmPKh Line | Count | Source | 175 | 647k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 647k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 647k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 647k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 647k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 647k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 647k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 647k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 647k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 647k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 647k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 647k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 647k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 647k | constexpr bool READ_32_BITS = | 202 | 647k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 647k | if (READ_32_BITS) { | 205 | 647k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 647k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 647k | return word & mask; | 208 | 647k | } | 209 | | | 210 | 32 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 32 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 32 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 32 | return word & mask; | 220 | 647k | } |
_ZN5doris11UnpackValueILi2ELi3ELb0EEEmPKh Line | Count | Source | 175 | 647k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 647k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 647k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 647k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 647k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 647k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 647k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 647k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 647k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 647k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 647k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 647k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 647k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 647k | constexpr bool READ_32_BITS = | 202 | 647k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 647k | if (READ_32_BITS) { | 205 | 647k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 647k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 647k | return word & mask; | 208 | 647k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 647k | } |
_ZN5doris11UnpackValueILi2ELi2ELb0EEEmPKh Line | Count | Source | 175 | 647k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 647k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 647k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 647k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 647k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 647k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 647k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 647k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 647k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 647k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 647k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 647k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 647k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 647k | constexpr bool READ_32_BITS = | 202 | 647k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 647k | if (READ_32_BITS) { | 205 | 647k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 647k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 647k | return word & mask; | 208 | 647k | } | 209 | | | 210 | 22 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 22 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 22 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 22 | return word & mask; | 220 | 647k | } |
_ZN5doris11UnpackValueILi2ELi1ELb0EEEmPKh Line | Count | Source | 175 | 647k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 647k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 647k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 647k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 647k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 647k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 647k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 647k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 647k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 647k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 647k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 647k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 647k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 647k | constexpr bool READ_32_BITS = | 202 | 647k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 647k | if (READ_32_BITS) { | 205 | 647k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 647k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 647k | return word & mask; | 208 | 647k | } | 209 | | | 210 | 28 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 28 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 28 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 28 | return word & mask; | 220 | 647k | } |
_ZN5doris11UnpackValueILi2ELi0ELb0EEEmPKh Line | Count | Source | 175 | 647k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 647k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 647k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 647k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 647k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 647k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 647k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 647k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 647k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 647k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 647k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 647k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 647k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 647k | constexpr bool READ_32_BITS = | 202 | 647k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 647k | if (READ_32_BITS) { | 205 | 647k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 647k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 647k | return word & mask; | 208 | 647k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 647k | } |
_ZN5doris11UnpackValueILi3ELi0ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi1ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi2ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi3ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi4ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi5ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi6ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi7ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi8ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi9ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi10ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi11ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi12ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi13ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi14ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi15ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi16ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi17ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi18ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi19ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi20ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi21ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.43M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.43M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.43M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.43M | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi22ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 1.43M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 18.4E | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.43M | return word & mask; | 208 | 1.43M | } | 209 | | | 210 | 24 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 24 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 24 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 24 | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi23ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 1.43M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.43M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.43M | return word & mask; | 208 | 1.43M | } | 209 | | | 210 | 148 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 148 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 148 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 148 | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi24ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 1.43M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.43M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.43M | return word & mask; | 208 | 1.43M | } | 209 | | | 210 | 1.13k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.13k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.13k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.13k | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi25ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 1.43M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.43M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.43M | return word & mask; | 208 | 1.43M | } | 209 | | | 210 | 62 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 62 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 62 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 62 | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi26ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 1.43M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.43M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.43M | return word & mask; | 208 | 1.43M | } | 209 | | | 210 | 138 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 138 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 138 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 138 | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi27ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 1.43M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.43M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.43M | return word & mask; | 208 | 1.43M | } | 209 | | | 210 | 1.89k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.89k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.89k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.89k | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi28ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 1.43M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.43M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.43M | return word & mask; | 208 | 1.43M | } | 209 | | | 210 | 78 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 78 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 78 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 78 | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi29ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 1.43M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.43M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.43M | return word & mask; | 208 | 1.43M | } | 209 | | | 210 | 108 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 108 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 108 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 108 | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi30ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 1.43M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 18.4E | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.43M | return word & mask; | 208 | 1.43M | } | 209 | | | 210 | 1.68k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.68k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.68k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.68k | return word & mask; | 220 | 1.43M | } |
_ZN5doris11UnpackValueILi3ELi31ELb1EEEmPKh Line | Count | Source | 175 | 1.43M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.43M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.43M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.43M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.43M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.43M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.43M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.43M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.43M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.43M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.43M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.43M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.43M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.43M | constexpr bool READ_32_BITS = | 202 | 1.43M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.43M | if (READ_32_BITS) { | 205 | 1.43M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.43M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.43M | return word & mask; | 208 | 1.43M | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 1.43M | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi3ELi23ELb0EEEmPKh Line | Count | Source | 175 | 96.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 96.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 96.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 96.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 96.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 96.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 96.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 96.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 96.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 96.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 96.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 96.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 96.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 96.9k | constexpr bool READ_32_BITS = | 202 | 96.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 96.9k | if (READ_32_BITS) { | 205 | 96.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 96.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 96.9k | return word & mask; | 208 | 96.9k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 96.9k | } |
_ZN5doris11UnpackValueILi3ELi22ELb0EEEmPKh Line | Count | Source | 175 | 96.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 96.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 96.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 96.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 96.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 96.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 96.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 96.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 96.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 96.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 96.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 96.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 96.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 96.9k | constexpr bool READ_32_BITS = | 202 | 96.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 96.9k | if (READ_32_BITS) { | 205 | 96.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 96.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 96.9k | return word & mask; | 208 | 96.9k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 96.9k | } |
_ZN5doris11UnpackValueILi3ELi21ELb0EEEmPKh Line | Count | Source | 175 | 96.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 96.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 96.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 96.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 96.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 96.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 96.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 96.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 96.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 96.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 96.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 96.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 96.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 96.9k | constexpr bool READ_32_BITS = | 202 | 96.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 96.9k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 96.9k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 96.9k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 96.9k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 96.9k | return word & mask; | 220 | 96.9k | } |
_ZN5doris11UnpackValueILi3ELi20ELb0EEEmPKh Line | Count | Source | 175 | 96.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 96.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 96.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 96.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 96.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 96.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 96.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 96.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 96.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 96.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 96.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 96.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 96.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 96.9k | constexpr bool READ_32_BITS = | 202 | 96.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 96.9k | if (READ_32_BITS) { | 205 | 96.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 96.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 96.9k | return word & mask; | 208 | 96.9k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 96.9k | } |
_ZN5doris11UnpackValueILi3ELi19ELb0EEEmPKh Line | Count | Source | 175 | 96.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 96.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 96.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 96.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 96.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 96.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 96.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 96.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 96.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 96.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 96.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 96.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 96.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 96.9k | constexpr bool READ_32_BITS = | 202 | 96.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 96.9k | if (READ_32_BITS) { | 205 | 96.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 96.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 96.9k | return word & mask; | 208 | 96.9k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 96.9k | } |
_ZN5doris11UnpackValueILi3ELi18ELb0EEEmPKh Line | Count | Source | 175 | 96.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 96.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 96.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 96.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 96.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 96.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 96.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 96.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 96.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 96.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 96.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 96.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 96.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 96.9k | constexpr bool READ_32_BITS = | 202 | 96.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 96.9k | if (READ_32_BITS) { | 205 | 96.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 96.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 96.9k | return word & mask; | 208 | 96.9k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 96.9k | } |
_ZN5doris11UnpackValueILi3ELi17ELb0EEEmPKh Line | Count | Source | 175 | 96.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 96.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 96.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 96.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 96.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 96.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 96.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 96.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 96.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 96.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 96.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 96.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 96.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 96.9k | constexpr bool READ_32_BITS = | 202 | 96.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 96.9k | if (READ_32_BITS) { | 205 | 96.8k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 96.8k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 96.8k | return word & mask; | 208 | 96.8k | } | 209 | | | 210 | 48 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 48 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 48 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 48 | return word & mask; | 220 | 96.9k | } |
_ZN5doris11UnpackValueILi3ELi16ELb0EEEmPKh Line | Count | Source | 175 | 96.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 96.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 96.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 96.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 96.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 96.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 96.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 96.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 96.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 96.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 96.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 96.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 96.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 96.9k | constexpr bool READ_32_BITS = | 202 | 96.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 96.9k | if (READ_32_BITS) { | 205 | 96.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 96.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 96.9k | return word & mask; | 208 | 96.9k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 96.9k | } |
_ZN5doris11UnpackValueILi3ELi15ELb0EEEmPKh Line | Count | Source | 175 | 100k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 100k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 100k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 100k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 100k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 100k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 100k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 100k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 100k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 100k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 100k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 100k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 100k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 100k | constexpr bool READ_32_BITS = | 202 | 100k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 100k | if (READ_32_BITS) { | 205 | 100k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 100k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 100k | return word & mask; | 208 | 100k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 100k | } |
_ZN5doris11UnpackValueILi3ELi14ELb0EEEmPKh Line | Count | Source | 175 | 100k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 100k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 100k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 100k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 100k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 100k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 100k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 100k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 100k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 100k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 100k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 100k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 100k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 100k | constexpr bool READ_32_BITS = | 202 | 100k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 100k | if (READ_32_BITS) { | 205 | 100k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 100k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 100k | return word & mask; | 208 | 100k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 100k | } |
_ZN5doris11UnpackValueILi3ELi13ELb0EEEmPKh Line | Count | Source | 175 | 100k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 100k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 100k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 100k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 100k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 100k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 100k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 100k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 100k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 100k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 100k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 100k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 100k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 100k | constexpr bool READ_32_BITS = | 202 | 100k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 100k | if (READ_32_BITS) { | 205 | 100k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 100k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 100k | return word & mask; | 208 | 100k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 100k | } |
_ZN5doris11UnpackValueILi3ELi12ELb0EEEmPKh Line | Count | Source | 175 | 100k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 100k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 100k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 100k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 100k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 100k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 100k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 100k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 100k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 100k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 100k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 100k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 100k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 100k | constexpr bool READ_32_BITS = | 202 | 100k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 100k | if (READ_32_BITS) { | 205 | 100k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 100k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 100k | return word & mask; | 208 | 100k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 100k | } |
_ZN5doris11UnpackValueILi3ELi11ELb0EEEmPKh Line | Count | Source | 175 | 100k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 100k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 100k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 100k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 100k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 100k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 100k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 100k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 100k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 100k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 100k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 100k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 100k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 100k | constexpr bool READ_32_BITS = | 202 | 100k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 100k | if (READ_32_BITS) { | 205 | 100k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 100k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 100k | return word & mask; | 208 | 100k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 100k | } |
_ZN5doris11UnpackValueILi3ELi10ELb0EEEmPKh Line | Count | Source | 175 | 100k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 100k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 100k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 100k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 100k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 100k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 100k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 100k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 100k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 100k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 100k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 100k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 100k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 100k | constexpr bool READ_32_BITS = | 202 | 100k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 100k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 100k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 100k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 100k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 100k | return word & mask; | 220 | 100k | } |
_ZN5doris11UnpackValueILi3ELi9ELb0EEEmPKh Line | Count | Source | 175 | 100k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 100k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 100k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 100k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 100k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 100k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 100k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 100k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 100k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 100k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 100k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 100k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 100k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 100k | constexpr bool READ_32_BITS = | 202 | 100k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 100k | if (READ_32_BITS) { | 205 | 100k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 100k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 100k | return word & mask; | 208 | 100k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 100k | } |
_ZN5doris11UnpackValueILi3ELi8ELb0EEEmPKh Line | Count | Source | 175 | 100k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 100k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 100k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 100k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 100k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 100k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 100k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 100k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 100k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 100k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 100k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 100k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 100k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 100k | constexpr bool READ_32_BITS = | 202 | 100k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 100k | if (READ_32_BITS) { | 205 | 100k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 100k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 100k | return word & mask; | 208 | 100k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 100k | } |
_ZN5doris11UnpackValueILi3ELi7ELb0EEEmPKh Line | Count | Source | 175 | 113k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 113k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 113k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 113k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 113k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 113k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 113k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 113k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 113k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 113k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 113k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 113k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 113k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 113k | constexpr bool READ_32_BITS = | 202 | 113k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 113k | if (READ_32_BITS) { | 205 | 113k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 113k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 113k | return word & mask; | 208 | 113k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 113k | } |
_ZN5doris11UnpackValueILi3ELi6ELb0EEEmPKh Line | Count | Source | 175 | 113k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 113k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 113k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 113k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 113k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 113k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 113k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 113k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 113k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 113k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 113k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 113k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 113k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 113k | constexpr bool READ_32_BITS = | 202 | 113k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 113k | if (READ_32_BITS) { | 205 | 113k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 113k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 113k | return word & mask; | 208 | 113k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 113k | } |
_ZN5doris11UnpackValueILi3ELi5ELb0EEEmPKh Line | Count | Source | 175 | 113k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 113k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 113k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 113k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 113k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 113k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 113k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 113k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 113k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 113k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 113k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 113k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 113k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 113k | constexpr bool READ_32_BITS = | 202 | 113k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 113k | if (READ_32_BITS) { | 205 | 113k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 113k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 113k | return word & mask; | 208 | 113k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 113k | } |
_ZN5doris11UnpackValueILi3ELi4ELb0EEEmPKh Line | Count | Source | 175 | 113k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 113k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 113k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 113k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 113k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 113k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 113k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 113k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 113k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 113k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 113k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 113k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 113k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 113k | constexpr bool READ_32_BITS = | 202 | 113k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 113k | if (READ_32_BITS) { | 205 | 113k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 113k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 113k | return word & mask; | 208 | 113k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 113k | } |
_ZN5doris11UnpackValueILi3ELi3ELb0EEEmPKh Line | Count | Source | 175 | 113k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 113k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 113k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 113k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 113k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 113k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 113k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 113k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 113k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 113k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 113k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 113k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 113k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 113k | constexpr bool READ_32_BITS = | 202 | 113k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 113k | if (READ_32_BITS) { | 205 | 113k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 113k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 113k | return word & mask; | 208 | 113k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 113k | } |
_ZN5doris11UnpackValueILi3ELi2ELb0EEEmPKh Line | Count | Source | 175 | 113k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 113k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 113k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 113k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 113k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 113k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 113k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 113k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 113k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 113k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 113k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 113k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 113k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 113k | constexpr bool READ_32_BITS = | 202 | 113k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 113k | if (READ_32_BITS) { | 205 | 113k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 113k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 113k | return word & mask; | 208 | 113k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 113k | } |
_ZN5doris11UnpackValueILi3ELi1ELb0EEEmPKh Line | Count | Source | 175 | 113k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 113k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 113k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 113k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 113k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 113k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 113k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 113k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 113k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 113k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 113k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 113k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 113k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 113k | constexpr bool READ_32_BITS = | 202 | 113k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 113k | if (READ_32_BITS) { | 205 | 113k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 113k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 113k | return word & mask; | 208 | 113k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 113k | } |
_ZN5doris11UnpackValueILi3ELi0ELb0EEEmPKh Line | Count | Source | 175 | 113k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 113k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 113k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 113k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 113k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 113k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 113k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 113k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 113k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 113k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 113k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 113k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 113k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 113k | constexpr bool READ_32_BITS = | 202 | 113k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 113k | if (READ_32_BITS) { | 205 | 113k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 18.4E | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 113k | return word & mask; | 208 | 113k | } | 209 | | | 210 | 78 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 78 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 78 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 78 | return word & mask; | 220 | 113k | } |
_ZN5doris11UnpackValueILi4ELi0ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.99M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.99M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.99M | return word & mask; | 208 | 4.99M | } | 209 | | | 210 | 672 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 672 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 672 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 672 | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi1ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.99M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.99M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.99M | return word & mask; | 208 | 4.99M | } | 209 | | | 210 | 1.25k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.25k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.25k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.25k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi2ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 1.95k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.95k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.95k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.95k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi3ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.99M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.99M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.99M | return word & mask; | 208 | 4.99M | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi4ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.99M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.99M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.99M | return word & mask; | 208 | 4.99M | } | 209 | | | 210 | 968 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 968 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 968 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 968 | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi5ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.99M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.99M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.99M | return word & mask; | 208 | 4.99M | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi6ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.99M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.99M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.99M | return word & mask; | 208 | 4.99M | } | 209 | | | 210 | 334 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 334 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 334 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 334 | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi7ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.99M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.99M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.99M | return word & mask; | 208 | 4.99M | } | 209 | | | 210 | 804 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 804 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 804 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 804 | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi8ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.99M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.99M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.99M | return word & mask; | 208 | 4.99M | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi9ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.99M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.99M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.99M | return word & mask; | 208 | 4.99M | } | 209 | | | 210 | 3.86k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.86k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.86k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.86k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi10ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 4.54k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4.54k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4.54k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4.54k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi11ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 6.70k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 6.70k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 6.70k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 6.70k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi12ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 4.17k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4.17k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4.17k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4.17k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi13ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4.00k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4.00k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4.00k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi14ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 3.15k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.15k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.15k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.15k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi15ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 3.11k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.11k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.11k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.11k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi16ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.99M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.99M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.99M | return word & mask; | 208 | 4.99M | } | 209 | | | 210 | 804 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 804 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 804 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 804 | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi17ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.99M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.99M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.99M | return word & mask; | 208 | 4.99M | } | 209 | | | 210 | 1.24k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.24k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.24k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.24k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi18ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 3.58k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.58k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.58k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.58k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi19ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 4.27k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4.27k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4.27k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4.27k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi20ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 4.40k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4.40k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4.40k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4.40k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi21ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 2.95k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.95k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.95k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.95k | return word & mask; | 220 | 4.99M | } |
_ZN5doris11UnpackValueILi4ELi22ELb1EEEmPKh Line | Count | Source | 175 | 4.98M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.98M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.98M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.98M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.98M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.98M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.98M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.98M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.98M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.98M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.98M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.98M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.98M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.98M | constexpr bool READ_32_BITS = | 202 | 4.98M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.98M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 2.96k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.96k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.96k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.96k | return word & mask; | 220 | 4.98M | } |
_ZN5doris11UnpackValueILi4ELi23ELb1EEEmPKh Line | Count | Source | 175 | 4.98M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.98M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.98M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.98M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.98M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.98M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.98M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.98M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.98M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.98M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.98M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.98M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.98M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.98M | constexpr bool READ_32_BITS = | 202 | 4.98M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.98M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 1.35k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.35k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.35k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.35k | return word & mask; | 220 | 4.98M | } |
_ZN5doris11UnpackValueILi4ELi24ELb1EEEmPKh Line | Count | Source | 175 | 4.98M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.98M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.98M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.98M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.98M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.98M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.98M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.98M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.98M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.98M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.98M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.98M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.98M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.98M | constexpr bool READ_32_BITS = | 202 | 4.98M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.98M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 432 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 432 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 432 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 432 | return word & mask; | 220 | 4.98M | } |
_ZN5doris11UnpackValueILi4ELi25ELb1EEEmPKh Line | Count | Source | 175 | 4.98M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.98M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.98M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.98M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.98M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.98M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.98M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.98M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.98M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.98M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.98M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.98M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.98M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.98M | constexpr bool READ_32_BITS = | 202 | 4.98M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.98M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 1.29k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.29k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.29k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.29k | return word & mask; | 220 | 4.98M | } |
_ZN5doris11UnpackValueILi4ELi26ELb1EEEmPKh Line | Count | Source | 175 | 4.98M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.98M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.98M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.98M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.98M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.98M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.98M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.98M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.98M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.98M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.98M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.98M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.98M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.98M | constexpr bool READ_32_BITS = | 202 | 4.98M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.98M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 1.24k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.24k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.24k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.24k | return word & mask; | 220 | 4.98M | } |
_ZN5doris11UnpackValueILi4ELi27ELb1EEEmPKh Line | Count | Source | 175 | 4.98M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.98M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.98M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.98M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.98M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.98M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.98M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.98M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.98M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.98M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.98M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.98M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.98M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.98M | constexpr bool READ_32_BITS = | 202 | 4.98M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.98M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 134 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 134 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 134 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 134 | return word & mask; | 220 | 4.98M | } |
_ZN5doris11UnpackValueILi4ELi28ELb1EEEmPKh Line | Count | Source | 175 | 4.98M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.98M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.98M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.98M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.98M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.98M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.98M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.98M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.98M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.98M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.98M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.98M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.98M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.98M | constexpr bool READ_32_BITS = | 202 | 4.98M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.98M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 4.98M | } |
_ZN5doris11UnpackValueILi4ELi29ELb1EEEmPKh Line | Count | Source | 175 | 4.98M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.98M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.98M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.98M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.98M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.98M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.98M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.98M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.98M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.98M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.98M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.98M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.98M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.98M | constexpr bool READ_32_BITS = | 202 | 4.98M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.98M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 776 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 776 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 776 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 776 | return word & mask; | 220 | 4.98M | } |
_ZN5doris11UnpackValueILi4ELi30ELb1EEEmPKh Line | Count | Source | 175 | 4.98M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.98M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.98M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.98M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.98M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.98M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.98M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.98M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.98M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.98M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.98M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.98M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.98M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.98M | constexpr bool READ_32_BITS = | 202 | 4.98M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.98M | if (READ_32_BITS) { | 205 | 4.98M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.98M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.98M | return word & mask; | 208 | 4.98M | } | 209 | | | 210 | 492 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 492 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 492 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 492 | return word & mask; | 220 | 4.98M | } |
_ZN5doris11UnpackValueILi4ELi31ELb1EEEmPKh Line | Count | Source | 175 | 4.99M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 4.99M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 4.99M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 4.99M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 4.99M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 4.99M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 4.99M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 4.99M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 4.99M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 4.99M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 4.99M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 4.99M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 4.99M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 4.99M | constexpr bool READ_32_BITS = | 202 | 4.99M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 4.99M | if (READ_32_BITS) { | 205 | 4.99M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 4.99M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 4.99M | return word & mask; | 208 | 4.99M | } | 209 | | | 210 | 74 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 74 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 74 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 74 | return word & mask; | 220 | 4.99M | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi4ELi23ELb0EEEmPKh Line | Count | Source | 175 | 370k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 370k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 370k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 370k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 370k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 370k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 370k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 370k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 370k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 370k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 370k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 370k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 370k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 370k | constexpr bool READ_32_BITS = | 202 | 370k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 370k | if (READ_32_BITS) { | 205 | 370k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 370k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 370k | return word & mask; | 208 | 370k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 370k | } |
_ZN5doris11UnpackValueILi4ELi22ELb0EEEmPKh Line | Count | Source | 175 | 370k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 370k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 370k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 370k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 370k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 370k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 370k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 370k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 370k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 370k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 370k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 370k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 370k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 370k | constexpr bool READ_32_BITS = | 202 | 370k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 370k | if (READ_32_BITS) { | 205 | 370k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 370k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 370k | return word & mask; | 208 | 370k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 370k | } |
_ZN5doris11UnpackValueILi4ELi21ELb0EEEmPKh Line | Count | Source | 175 | 370k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 370k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 370k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 370k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 370k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 370k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 370k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 370k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 370k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 370k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 370k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 370k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 370k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 370k | constexpr bool READ_32_BITS = | 202 | 370k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 370k | if (READ_32_BITS) { | 205 | 370k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 370k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 370k | return word & mask; | 208 | 370k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 370k | } |
_ZN5doris11UnpackValueILi4ELi20ELb0EEEmPKh Line | Count | Source | 175 | 370k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 370k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 370k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 370k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 370k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 370k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 370k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 370k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 370k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 370k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 370k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 370k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 370k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 370k | constexpr bool READ_32_BITS = | 202 | 370k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 370k | if (READ_32_BITS) { | 205 | 370k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 370k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 370k | return word & mask; | 208 | 370k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 370k | } |
_ZN5doris11UnpackValueILi4ELi19ELb0EEEmPKh Line | Count | Source | 175 | 370k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 370k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 370k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 370k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 370k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 370k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 370k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 370k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 370k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 370k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 370k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 370k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 370k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 370k | constexpr bool READ_32_BITS = | 202 | 370k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 370k | if (READ_32_BITS) { | 205 | 370k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 370k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 370k | return word & mask; | 208 | 370k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 370k | } |
_ZN5doris11UnpackValueILi4ELi18ELb0EEEmPKh Line | Count | Source | 175 | 370k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 370k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 370k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 370k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 370k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 370k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 370k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 370k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 370k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 370k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 370k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 370k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 370k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 370k | constexpr bool READ_32_BITS = | 202 | 370k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 370k | if (READ_32_BITS) { | 205 | 370k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 370k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 370k | return word & mask; | 208 | 370k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 370k | } |
_ZN5doris11UnpackValueILi4ELi17ELb0EEEmPKh Line | Count | Source | 175 | 370k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 370k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 370k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 370k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 370k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 370k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 370k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 370k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 370k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 370k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 370k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 370k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 370k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 370k | constexpr bool READ_32_BITS = | 202 | 370k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 370k | if (READ_32_BITS) { | 205 | 370k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 370k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 370k | return word & mask; | 208 | 370k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 370k | } |
_ZN5doris11UnpackValueILi4ELi16ELb0EEEmPKh Line | Count | Source | 175 | 370k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 370k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 370k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 370k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 370k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 370k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 370k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 370k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 370k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 370k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 370k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 370k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 370k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 370k | constexpr bool READ_32_BITS = | 202 | 370k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 370k | if (READ_32_BITS) { | 205 | 370k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 370k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 370k | return word & mask; | 208 | 370k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 370k | } |
_ZN5doris11UnpackValueILi4ELi15ELb0EEEmPKh Line | Count | Source | 175 | 400k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 400k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 400k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 400k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 400k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 400k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 400k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 400k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 400k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 400k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 400k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 400k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 400k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 400k | constexpr bool READ_32_BITS = | 202 | 400k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 400k | if (READ_32_BITS) { | 205 | 400k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 400k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 400k | return word & mask; | 208 | 400k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 400k | } |
_ZN5doris11UnpackValueILi4ELi14ELb0EEEmPKh Line | Count | Source | 175 | 400k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 400k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 400k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 400k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 400k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 400k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 400k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 400k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 400k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 400k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 400k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 400k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 400k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 400k | constexpr bool READ_32_BITS = | 202 | 400k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 400k | if (READ_32_BITS) { | 205 | 400k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 400k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 400k | return word & mask; | 208 | 400k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 400k | } |
_ZN5doris11UnpackValueILi4ELi13ELb0EEEmPKh Line | Count | Source | 175 | 400k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 400k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 400k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 400k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 400k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 400k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 400k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 400k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 400k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 400k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 400k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 400k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 400k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 400k | constexpr bool READ_32_BITS = | 202 | 400k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 400k | if (READ_32_BITS) { | 205 | 400k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 400k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 400k | return word & mask; | 208 | 400k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 400k | } |
_ZN5doris11UnpackValueILi4ELi12ELb0EEEmPKh Line | Count | Source | 175 | 400k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 400k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 400k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 400k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 400k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 400k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 400k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 400k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 400k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 400k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 400k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 400k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 400k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 400k | constexpr bool READ_32_BITS = | 202 | 400k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 400k | if (READ_32_BITS) { | 205 | 400k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 400k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 400k | return word & mask; | 208 | 400k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 400k | } |
_ZN5doris11UnpackValueILi4ELi11ELb0EEEmPKh Line | Count | Source | 175 | 400k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 400k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 400k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 400k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 400k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 400k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 400k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 400k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 400k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 400k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 400k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 400k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 400k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 400k | constexpr bool READ_32_BITS = | 202 | 400k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 400k | if (READ_32_BITS) { | 205 | 400k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 400k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 400k | return word & mask; | 208 | 400k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 400k | } |
_ZN5doris11UnpackValueILi4ELi10ELb0EEEmPKh Line | Count | Source | 175 | 400k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 400k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 400k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 400k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 400k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 400k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 400k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 400k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 400k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 400k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 400k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 400k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 400k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 400k | constexpr bool READ_32_BITS = | 202 | 400k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 400k | if (READ_32_BITS) { | 205 | 400k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 400k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 400k | return word & mask; | 208 | 400k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 400k | } |
_ZN5doris11UnpackValueILi4ELi9ELb0EEEmPKh Line | Count | Source | 175 | 400k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 400k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 400k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 400k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 400k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 400k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 400k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 400k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 400k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 400k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 400k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 400k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 400k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 400k | constexpr bool READ_32_BITS = | 202 | 400k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 400k | if (READ_32_BITS) { | 205 | 400k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 400k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 400k | return word & mask; | 208 | 400k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 400k | } |
_ZN5doris11UnpackValueILi4ELi8ELb0EEEmPKh Line | Count | Source | 175 | 400k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 400k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 400k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 400k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 400k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 400k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 400k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 400k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 400k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 400k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 400k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 400k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 400k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 400k | constexpr bool READ_32_BITS = | 202 | 400k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 400k | if (READ_32_BITS) { | 205 | 400k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 400k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 400k | return word & mask; | 208 | 400k | } | 209 | | | 210 | 34 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 34 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 34 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 34 | return word & mask; | 220 | 400k | } |
_ZN5doris11UnpackValueILi4ELi7ELb0EEEmPKh Line | Count | Source | 175 | 401k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 401k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 401k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 401k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 401k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 401k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 401k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 401k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 401k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 401k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 401k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 401k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 401k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 401k | constexpr bool READ_32_BITS = | 202 | 401k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 401k | if (READ_32_BITS) { | 205 | 401k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 401k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 401k | return word & mask; | 208 | 401k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 401k | } |
_ZN5doris11UnpackValueILi4ELi6ELb0EEEmPKh Line | Count | Source | 175 | 401k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 401k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 401k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 401k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 401k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 401k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 401k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 401k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 401k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 401k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 401k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 401k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 401k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 401k | constexpr bool READ_32_BITS = | 202 | 401k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 401k | if (READ_32_BITS) { | 205 | 401k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 401k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 401k | return word & mask; | 208 | 401k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 401k | } |
_ZN5doris11UnpackValueILi4ELi5ELb0EEEmPKh Line | Count | Source | 175 | 401k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 401k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 401k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 401k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 401k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 401k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 401k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 401k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 401k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 401k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 401k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 401k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 401k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 401k | constexpr bool READ_32_BITS = | 202 | 401k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 401k | if (READ_32_BITS) { | 205 | 401k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 401k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 401k | return word & mask; | 208 | 401k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 401k | } |
_ZN5doris11UnpackValueILi4ELi4ELb0EEEmPKh Line | Count | Source | 175 | 401k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 401k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 401k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 401k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 401k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 401k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 401k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 401k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 401k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 401k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 401k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 401k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 401k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 401k | constexpr bool READ_32_BITS = | 202 | 401k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 401k | if (READ_32_BITS) { | 205 | 401k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 401k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 401k | return word & mask; | 208 | 401k | } | 209 | | | 210 | 8 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 8 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 8 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 8 | return word & mask; | 220 | 401k | } |
_ZN5doris11UnpackValueILi4ELi3ELb0EEEmPKh Line | Count | Source | 175 | 401k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 401k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 401k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 401k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 401k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 401k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 401k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 401k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 401k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 401k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 401k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 401k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 401k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 401k | constexpr bool READ_32_BITS = | 202 | 401k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 401k | if (READ_32_BITS) { | 205 | 401k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 401k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 401k | return word & mask; | 208 | 401k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 401k | } |
_ZN5doris11UnpackValueILi4ELi2ELb0EEEmPKh Line | Count | Source | 175 | 401k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 401k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 401k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 401k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 401k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 401k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 401k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 401k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 401k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 401k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 401k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 401k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 401k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 401k | constexpr bool READ_32_BITS = | 202 | 401k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 401k | if (READ_32_BITS) { | 205 | 401k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 401k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 401k | return word & mask; | 208 | 401k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 401k | } |
_ZN5doris11UnpackValueILi4ELi1ELb0EEEmPKh Line | Count | Source | 175 | 401k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 401k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 401k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 401k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 401k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 401k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 401k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 401k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 401k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 401k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 401k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 401k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 401k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 401k | constexpr bool READ_32_BITS = | 202 | 401k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 401k | if (READ_32_BITS) { | 205 | 401k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 401k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 401k | return word & mask; | 208 | 401k | } | 209 | | | 210 | 8 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 8 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 8 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 8 | return word & mask; | 220 | 401k | } |
_ZN5doris11UnpackValueILi4ELi0ELb0EEEmPKh Line | Count | Source | 175 | 401k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 401k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 401k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 401k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 401k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 401k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 401k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 401k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 401k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 401k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 401k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 401k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 401k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 401k | constexpr bool READ_32_BITS = | 202 | 401k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 401k | if (READ_32_BITS) { | 205 | 401k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 401k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 401k | return word & mask; | 208 | 401k | } | 209 | | | 210 | 12 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 12 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 12 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 12 | return word & mask; | 220 | 401k | } |
_ZN5doris11UnpackValueILi5ELi0ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi1ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi2ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi3ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi4ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi5ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi6ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi7ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi8ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi9ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi10ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi11ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi12ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi13ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi14ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi15ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi16ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi17ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi18ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi19ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi20ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi21ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi22ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi23ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi24ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi25ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.1k | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi26ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 11.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 11.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 11.1k | return word & mask; | 208 | 11.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi27ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 11.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 11.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 11.1k | return word & mask; | 208 | 11.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi28ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 11.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 11.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 11.1k | return word & mask; | 208 | 11.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi29ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 11.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 11.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 11.1k | return word & mask; | 208 | 11.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi30ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 11.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 11.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 11.1k | return word & mask; | 208 | 11.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 11.1k | } |
_ZN5doris11UnpackValueILi5ELi31ELb1EEEmPKh Line | Count | Source | 175 | 11.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.1k | constexpr bool READ_32_BITS = | 202 | 11.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.1k | if (READ_32_BITS) { | 205 | 11.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 11.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 11.1k | return word & mask; | 208 | 11.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 11.1k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi5ELi23ELb0EEEmPKh Line | Count | Source | 175 | 716 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 716 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 716 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 716 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 716 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 716 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 716 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 716 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 716 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 716 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 716 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 716 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 716 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 716 | constexpr bool READ_32_BITS = | 202 | 716 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 716 | if (READ_32_BITS) { | 205 | 716 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 716 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 716 | return word & mask; | 208 | 716 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 716 | } |
_ZN5doris11UnpackValueILi5ELi22ELb0EEEmPKh Line | Count | Source | 175 | 716 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 716 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 716 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 716 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 716 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 716 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 716 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 716 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 716 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 716 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 716 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 716 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 716 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 716 | constexpr bool READ_32_BITS = | 202 | 716 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 716 | if (READ_32_BITS) { | 205 | 716 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 716 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 716 | return word & mask; | 208 | 716 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 716 | } |
_ZN5doris11UnpackValueILi5ELi21ELb0EEEmPKh Line | Count | Source | 175 | 716 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 716 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 716 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 716 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 716 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 716 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 716 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 716 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 716 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 716 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 716 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 716 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 716 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 716 | constexpr bool READ_32_BITS = | 202 | 716 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 716 | if (READ_32_BITS) { | 205 | 716 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 716 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 716 | return word & mask; | 208 | 716 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 716 | } |
_ZN5doris11UnpackValueILi5ELi20ELb0EEEmPKh Line | Count | Source | 175 | 716 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 716 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 716 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 716 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 716 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 716 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 716 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 716 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 716 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 716 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 716 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 716 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 716 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 716 | constexpr bool READ_32_BITS = | 202 | 716 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 716 | if (READ_32_BITS) { | 205 | 716 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 716 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 716 | return word & mask; | 208 | 716 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 716 | } |
_ZN5doris11UnpackValueILi5ELi19ELb0EEEmPKh Line | Count | Source | 175 | 716 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 716 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 716 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 716 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 716 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 716 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 716 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 716 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 716 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 716 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 716 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 716 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 716 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 716 | constexpr bool READ_32_BITS = | 202 | 716 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 716 | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 716 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 716 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 716 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 716 | return word & mask; | 220 | 716 | } |
_ZN5doris11UnpackValueILi5ELi18ELb0EEEmPKh Line | Count | Source | 175 | 716 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 716 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 716 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 716 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 716 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 716 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 716 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 716 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 716 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 716 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 716 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 716 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 716 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 716 | constexpr bool READ_32_BITS = | 202 | 716 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 716 | if (READ_32_BITS) { | 205 | 716 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 716 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 716 | return word & mask; | 208 | 716 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 716 | } |
_ZN5doris11UnpackValueILi5ELi17ELb0EEEmPKh Line | Count | Source | 175 | 716 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 716 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 716 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 716 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 716 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 716 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 716 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 716 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 716 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 716 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 716 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 716 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 716 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 716 | constexpr bool READ_32_BITS = | 202 | 716 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 716 | if (READ_32_BITS) { | 205 | 716 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 716 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 716 | return word & mask; | 208 | 716 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 716 | } |
_ZN5doris11UnpackValueILi5ELi16ELb0EEEmPKh Line | Count | Source | 175 | 716 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 716 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 716 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 716 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 716 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 716 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 716 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 716 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 716 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 716 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 716 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 716 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 716 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 716 | constexpr bool READ_32_BITS = | 202 | 716 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 716 | if (READ_32_BITS) { | 205 | 716 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 716 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 716 | return word & mask; | 208 | 716 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 716 | } |
_ZN5doris11UnpackValueILi5ELi15ELb0EEEmPKh Line | Count | Source | 175 | 848 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 848 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 848 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 848 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 848 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 848 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 848 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 848 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 848 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 848 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 848 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 848 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 848 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 848 | constexpr bool READ_32_BITS = | 202 | 848 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 848 | if (READ_32_BITS) { | 205 | 848 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 848 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 848 | return word & mask; | 208 | 848 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 848 | } |
_ZN5doris11UnpackValueILi5ELi14ELb0EEEmPKh Line | Count | Source | 175 | 848 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 848 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 848 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 848 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 848 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 848 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 848 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 848 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 848 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 848 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 848 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 848 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 848 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 848 | constexpr bool READ_32_BITS = | 202 | 848 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 848 | if (READ_32_BITS) { | 205 | 848 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 848 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 848 | return word & mask; | 208 | 848 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 848 | } |
_ZN5doris11UnpackValueILi5ELi13ELb0EEEmPKh Line | Count | Source | 175 | 848 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 848 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 848 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 848 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 848 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 848 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 848 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 848 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 848 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 848 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 848 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 848 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 848 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 848 | constexpr bool READ_32_BITS = | 202 | 848 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 848 | if (READ_32_BITS) { | 205 | 848 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 848 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 848 | return word & mask; | 208 | 848 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 848 | } |
_ZN5doris11UnpackValueILi5ELi12ELb0EEEmPKh Line | Count | Source | 175 | 848 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 848 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 848 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 848 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 848 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 848 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 848 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 848 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 848 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 848 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 848 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 848 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 848 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 848 | constexpr bool READ_32_BITS = | 202 | 848 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 848 | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 848 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 848 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 848 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 848 | return word & mask; | 220 | 848 | } |
_ZN5doris11UnpackValueILi5ELi11ELb0EEEmPKh Line | Count | Source | 175 | 848 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 848 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 848 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 848 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 848 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 848 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 848 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 848 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 848 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 848 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 848 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 848 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 848 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 848 | constexpr bool READ_32_BITS = | 202 | 848 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 848 | if (READ_32_BITS) { | 205 | 848 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 848 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 848 | return word & mask; | 208 | 848 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 848 | } |
_ZN5doris11UnpackValueILi5ELi10ELb0EEEmPKh Line | Count | Source | 175 | 848 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 848 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 848 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 848 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 848 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 848 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 848 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 848 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 848 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 848 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 848 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 848 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 848 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 848 | constexpr bool READ_32_BITS = | 202 | 848 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 848 | if (READ_32_BITS) { | 205 | 848 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 848 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 848 | return word & mask; | 208 | 848 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 848 | } |
_ZN5doris11UnpackValueILi5ELi9ELb0EEEmPKh Line | Count | Source | 175 | 848 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 848 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 848 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 848 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 848 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 848 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 848 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 848 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 848 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 848 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 848 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 848 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 848 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 848 | constexpr bool READ_32_BITS = | 202 | 848 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 848 | if (READ_32_BITS) { | 205 | 848 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 848 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 848 | return word & mask; | 208 | 848 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 848 | } |
_ZN5doris11UnpackValueILi5ELi8ELb0EEEmPKh Line | Count | Source | 175 | 848 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 848 | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 848 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 848 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 848 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 848 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 848 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 848 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 848 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 848 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 848 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 848 | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 848 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 848 | constexpr bool READ_32_BITS = | 202 | 848 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 848 | if (READ_32_BITS) { | 205 | 848 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 848 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 848 | return word & mask; | 208 | 848 | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 848 | } |
_ZN5doris11UnpackValueILi5ELi7ELb0EEEmPKh Line | Count | Source | 175 | 1.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.76k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.76k | constexpr bool READ_32_BITS = | 202 | 1.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.76k | if (READ_32_BITS) { | 205 | 1.76k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.76k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.76k | return word & mask; | 208 | 1.76k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.76k | } |
_ZN5doris11UnpackValueILi5ELi6ELb0EEEmPKh Line | Count | Source | 175 | 1.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.76k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.76k | constexpr bool READ_32_BITS = | 202 | 1.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.76k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.76k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.76k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.76k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.76k | return word & mask; | 220 | 1.76k | } |
_ZN5doris11UnpackValueILi5ELi5ELb0EEEmPKh Line | Count | Source | 175 | 1.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.76k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.76k | constexpr bool READ_32_BITS = | 202 | 1.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.76k | if (READ_32_BITS) { | 205 | 1.76k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.76k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.76k | return word & mask; | 208 | 1.76k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.76k | } |
_ZN5doris11UnpackValueILi5ELi4ELb0EEEmPKh Line | Count | Source | 175 | 1.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.76k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.76k | constexpr bool READ_32_BITS = | 202 | 1.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.76k | if (READ_32_BITS) { | 205 | 1.76k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.76k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.76k | return word & mask; | 208 | 1.76k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.76k | } |
_ZN5doris11UnpackValueILi5ELi3ELb0EEEmPKh Line | Count | Source | 175 | 1.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.76k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.76k | constexpr bool READ_32_BITS = | 202 | 1.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.76k | if (READ_32_BITS) { | 205 | 1.76k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.76k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.76k | return word & mask; | 208 | 1.76k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.76k | } |
_ZN5doris11UnpackValueILi5ELi2ELb0EEEmPKh Line | Count | Source | 175 | 1.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.76k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.76k | constexpr bool READ_32_BITS = | 202 | 1.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.76k | if (READ_32_BITS) { | 205 | 1.76k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.76k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.76k | return word & mask; | 208 | 1.76k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.76k | } |
_ZN5doris11UnpackValueILi5ELi1ELb0EEEmPKh Line | Count | Source | 175 | 1.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.76k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.76k | constexpr bool READ_32_BITS = | 202 | 1.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.76k | if (READ_32_BITS) { | 205 | 1.76k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.76k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.76k | return word & mask; | 208 | 1.76k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.76k | } |
_ZN5doris11UnpackValueILi5ELi0ELb0EEEmPKh Line | Count | Source | 175 | 1.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.76k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.76k | constexpr bool READ_32_BITS = | 202 | 1.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.76k | if (READ_32_BITS) { | 205 | 1.76k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.76k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.76k | return word & mask; | 208 | 1.76k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.76k | } |
_ZN5doris11UnpackValueILi6ELi0ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi1ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi2ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi3ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi4ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi5ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi6ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi7ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi8ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi9ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi10ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi11ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi12ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi13ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi14ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi15ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi16ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi17ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi18ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi19ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi20ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi21ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi22ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi23ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi24ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi25ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi26ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.26M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.26M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.26M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.26M | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi27ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 3.26M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.26M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.26M | return word & mask; | 208 | 3.26M | } | 209 | | | 210 | 448 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 448 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 448 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 448 | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi28ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 3.26M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.26M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.26M | return word & mask; | 208 | 3.26M | } | 209 | | | 210 | 484 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 484 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 484 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 484 | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi29ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 3.26M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.26M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.26M | return word & mask; | 208 | 3.26M | } | 209 | | | 210 | 782 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 782 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 782 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 782 | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi30ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 3.26M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.26M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.26M | return word & mask; | 208 | 3.26M | } | 209 | | | 210 | 2.51k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.51k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.51k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.51k | return word & mask; | 220 | 3.26M | } |
_ZN5doris11UnpackValueILi6ELi31ELb1EEEmPKh Line | Count | Source | 175 | 3.26M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.26M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.26M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.26M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.26M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.26M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.26M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.26M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.26M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.26M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.26M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.26M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.26M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.26M | constexpr bool READ_32_BITS = | 202 | 3.26M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.26M | if (READ_32_BITS) { | 205 | 3.26M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 18.4E | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.26M | return word & mask; | 208 | 3.26M | } | 209 | | | 210 | 484 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 484 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 484 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 484 | return word & mask; | 220 | 3.26M | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi6ELi23ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 18.4E | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi22ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 220k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi21ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 220k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 220k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 220k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 220k | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi20ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 220k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi19ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 220k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi18ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 220k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi17ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 220k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi16ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 220k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 6 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 6 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 6 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 6 | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi15ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 220k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi14ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 220k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi13ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 220k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi12ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 220k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 18 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18 | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi11ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 220k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi10ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 220k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 220k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 220k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 220k | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi9ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 220k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi8ELb0EEEmPKh Line | Count | Source | 175 | 220k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 220k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 220k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 220k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 220k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 220k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 220k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 220k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 220k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 220k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 220k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 220k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 220k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 220k | constexpr bool READ_32_BITS = | 202 | 220k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 220k | if (READ_32_BITS) { | 205 | 220k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 220k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 220k | return word & mask; | 208 | 220k | } | 209 | | | 210 | 6 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 6 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 6 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 6 | return word & mask; | 220 | 220k | } |
_ZN5doris11UnpackValueILi6ELi7ELb0EEEmPKh Line | Count | Source | 175 | 222k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 222k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 222k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 222k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 222k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 222k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 222k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 222k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 222k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 222k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 222k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 222k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 222k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 222k | constexpr bool READ_32_BITS = | 202 | 222k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 222k | if (READ_32_BITS) { | 205 | 222k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 18.4E | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 222k | return word & mask; | 208 | 222k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 222k | } |
_ZN5doris11UnpackValueILi6ELi6ELb0EEEmPKh Line | Count | Source | 175 | 222k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 222k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 222k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 222k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 222k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 222k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 222k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 222k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 222k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 222k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 222k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 222k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 222k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 222k | constexpr bool READ_32_BITS = | 202 | 222k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 222k | if (READ_32_BITS) { | 205 | 222k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 222k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 222k | return word & mask; | 208 | 222k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 222k | } |
_ZN5doris11UnpackValueILi6ELi5ELb0EEEmPKh Line | Count | Source | 175 | 222k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 222k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 222k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 222k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 222k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 222k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 222k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 222k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 222k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 222k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 222k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 222k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 222k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 222k | constexpr bool READ_32_BITS = | 202 | 222k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 222k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 222k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 222k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 222k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 222k | return word & mask; | 220 | 222k | } |
_ZN5doris11UnpackValueILi6ELi4ELb0EEEmPKh Line | Count | Source | 175 | 222k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 222k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 222k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 222k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 222k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 222k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 222k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 222k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 222k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 222k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 222k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 222k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 222k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 222k | constexpr bool READ_32_BITS = | 202 | 222k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 222k | if (READ_32_BITS) { | 205 | 222k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 222k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 222k | return word & mask; | 208 | 222k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 222k | } |
_ZN5doris11UnpackValueILi6ELi3ELb0EEEmPKh Line | Count | Source | 175 | 222k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 222k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 222k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 222k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 222k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 222k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 222k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 222k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 222k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 222k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 222k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 222k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 222k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 222k | constexpr bool READ_32_BITS = | 202 | 222k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 222k | if (READ_32_BITS) { | 205 | 222k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 222k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 222k | return word & mask; | 208 | 222k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 222k | } |
_ZN5doris11UnpackValueILi6ELi2ELb0EEEmPKh Line | Count | Source | 175 | 222k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 222k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 222k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 222k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 222k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 222k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 222k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 222k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 222k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 222k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 222k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 222k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 222k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 222k | constexpr bool READ_32_BITS = | 202 | 222k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 222k | if (READ_32_BITS) { | 205 | 222k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 222k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 222k | return word & mask; | 208 | 222k | } | 209 | | | 210 | 6 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 6 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 6 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 6 | return word & mask; | 220 | 222k | } |
_ZN5doris11UnpackValueILi6ELi1ELb0EEEmPKh Line | Count | Source | 175 | 222k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 222k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 222k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 222k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 222k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 222k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 222k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 222k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 222k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 222k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 222k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 222k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 222k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 222k | constexpr bool READ_32_BITS = | 202 | 222k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 222k | if (READ_32_BITS) { | 205 | 222k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 222k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 222k | return word & mask; | 208 | 222k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 222k | } |
_ZN5doris11UnpackValueILi6ELi0ELb0EEEmPKh Line | Count | Source | 175 | 222k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 222k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 222k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 222k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 222k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 222k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 222k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 222k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 222k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 222k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 222k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 222k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 222k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 222k | constexpr bool READ_32_BITS = | 202 | 222k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 222k | if (READ_32_BITS) { | 205 | 222k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 222k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 222k | return word & mask; | 208 | 222k | } | 209 | | | 210 | 8 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 8 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 8 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 8 | return word & mask; | 220 | 222k | } |
_ZN5doris11UnpackValueILi7ELi0ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi1ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi2ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi3ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi4ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi5ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi6ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi7ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi8ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi9ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi10ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi11ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi12ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi13ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi14ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi15ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi16ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi17ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi18ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi19ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi20ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi21ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi22ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi23ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi24ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi25ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi26ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi27ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.3k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.3k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.3k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.3k | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi28ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 37.3k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 37.3k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 37.3k | return word & mask; | 208 | 37.3k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi29ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 37.3k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 37.3k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 37.3k | return word & mask; | 208 | 37.3k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi30ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 37.3k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 37.3k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 37.3k | return word & mask; | 208 | 37.3k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 37.3k | } |
_ZN5doris11UnpackValueILi7ELi31ELb1EEEmPKh Line | Count | Source | 175 | 37.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.3k | constexpr bool READ_32_BITS = | 202 | 37.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.3k | if (READ_32_BITS) { | 205 | 37.3k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 37.3k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 37.3k | return word & mask; | 208 | 37.3k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 37.3k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi7ELi23ELb0EEEmPKh Line | Count | Source | 175 | 1.67k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.67k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.67k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.67k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.67k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.67k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.67k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.67k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.67k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.67k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.67k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.67k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.67k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.67k | constexpr bool READ_32_BITS = | 202 | 1.67k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.67k | if (READ_32_BITS) { | 205 | 1.67k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.67k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.67k | return word & mask; | 208 | 1.67k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.67k | } |
_ZN5doris11UnpackValueILi7ELi22ELb0EEEmPKh Line | Count | Source | 175 | 1.67k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.67k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.67k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.67k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.67k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.67k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.67k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.67k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.67k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.67k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.67k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.67k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.67k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.67k | constexpr bool READ_32_BITS = | 202 | 1.67k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.67k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.67k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.67k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.67k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.67k | return word & mask; | 220 | 1.67k | } |
_ZN5doris11UnpackValueILi7ELi21ELb0EEEmPKh Line | Count | Source | 175 | 1.67k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.67k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.67k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.67k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.67k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.67k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.67k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.67k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.67k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.67k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.67k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.67k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.67k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.67k | constexpr bool READ_32_BITS = | 202 | 1.67k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.67k | if (READ_32_BITS) { | 205 | 1.67k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.67k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.67k | return word & mask; | 208 | 1.67k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.67k | } |
_ZN5doris11UnpackValueILi7ELi20ELb0EEEmPKh Line | Count | Source | 175 | 1.67k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.67k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.67k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.67k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.67k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.67k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.67k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.67k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.67k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.67k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.67k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.67k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.67k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.67k | constexpr bool READ_32_BITS = | 202 | 1.67k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.67k | if (READ_32_BITS) { | 205 | 1.67k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.67k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.67k | return word & mask; | 208 | 1.67k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.67k | } |
_ZN5doris11UnpackValueILi7ELi19ELb0EEEmPKh Line | Count | Source | 175 | 1.67k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.67k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.67k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.67k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.67k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.67k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.67k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.67k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.67k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.67k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.67k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.67k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.67k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.67k | constexpr bool READ_32_BITS = | 202 | 1.67k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.67k | if (READ_32_BITS) { | 205 | 1.67k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.67k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.67k | return word & mask; | 208 | 1.67k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.67k | } |
_ZN5doris11UnpackValueILi7ELi18ELb0EEEmPKh Line | Count | Source | 175 | 1.67k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.67k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.67k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.67k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.67k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.67k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.67k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.67k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.67k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.67k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.67k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.67k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.67k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.67k | constexpr bool READ_32_BITS = | 202 | 1.67k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.67k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.67k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.67k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.67k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.67k | return word & mask; | 220 | 1.67k | } |
_ZN5doris11UnpackValueILi7ELi17ELb0EEEmPKh Line | Count | Source | 175 | 1.67k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.67k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.67k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.67k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.67k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.67k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.67k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.67k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.67k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.67k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.67k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.67k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.67k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.67k | constexpr bool READ_32_BITS = | 202 | 1.67k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.67k | if (READ_32_BITS) { | 205 | 1.67k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.67k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.67k | return word & mask; | 208 | 1.67k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.67k | } |
_ZN5doris11UnpackValueILi7ELi16ELb0EEEmPKh Line | Count | Source | 175 | 1.67k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.67k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.67k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.67k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.67k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.67k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.67k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.67k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.67k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.67k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.67k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.67k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.67k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.67k | constexpr bool READ_32_BITS = | 202 | 1.67k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.67k | if (READ_32_BITS) { | 205 | 1.67k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.67k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.67k | return word & mask; | 208 | 1.67k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.67k | } |
_ZN5doris11UnpackValueILi7ELi15ELb0EEEmPKh Line | Count | Source | 175 | 2.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24k | constexpr bool READ_32_BITS = | 202 | 2.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24k | if (READ_32_BITS) { | 205 | 2.24k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.24k | return word & mask; | 208 | 2.24k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.24k | } |
_ZN5doris11UnpackValueILi7ELi14ELb0EEEmPKh Line | Count | Source | 175 | 2.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24k | constexpr bool READ_32_BITS = | 202 | 2.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24k | if (READ_32_BITS) { | 205 | 2.24k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.24k | return word & mask; | 208 | 2.24k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.24k | } |
_ZN5doris11UnpackValueILi7ELi13ELb0EEEmPKh Line | Count | Source | 175 | 2.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24k | constexpr bool READ_32_BITS = | 202 | 2.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24k | return word & mask; | 220 | 2.24k | } |
_ZN5doris11UnpackValueILi7ELi12ELb0EEEmPKh Line | Count | Source | 175 | 2.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24k | constexpr bool READ_32_BITS = | 202 | 2.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24k | if (READ_32_BITS) { | 205 | 2.24k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.24k | return word & mask; | 208 | 2.24k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.24k | } |
_ZN5doris11UnpackValueILi7ELi11ELb0EEEmPKh Line | Count | Source | 175 | 2.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24k | constexpr bool READ_32_BITS = | 202 | 2.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24k | if (READ_32_BITS) { | 205 | 2.24k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.24k | return word & mask; | 208 | 2.24k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.24k | } |
_ZN5doris11UnpackValueILi7ELi10ELb0EEEmPKh Line | Count | Source | 175 | 2.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24k | constexpr bool READ_32_BITS = | 202 | 2.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24k | if (READ_32_BITS) { | 205 | 2.24k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.24k | return word & mask; | 208 | 2.24k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.24k | } |
_ZN5doris11UnpackValueILi7ELi9ELb0EEEmPKh Line | Count | Source | 175 | 2.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24k | constexpr bool READ_32_BITS = | 202 | 2.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24k | return word & mask; | 220 | 2.24k | } |
_ZN5doris11UnpackValueILi7ELi8ELb0EEEmPKh Line | Count | Source | 175 | 2.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24k | constexpr bool READ_32_BITS = | 202 | 2.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24k | if (READ_32_BITS) { | 205 | 2.24k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.24k | return word & mask; | 208 | 2.24k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.24k | } |
_ZN5doris11UnpackValueILi7ELi7ELb0EEEmPKh Line | Count | Source | 175 | 7.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.25k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.25k | constexpr bool READ_32_BITS = | 202 | 7.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.25k | if (READ_32_BITS) { | 205 | 7.25k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.25k | return word & mask; | 208 | 7.25k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.25k | } |
_ZN5doris11UnpackValueILi7ELi6ELb0EEEmPKh Line | Count | Source | 175 | 7.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.25k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.25k | constexpr bool READ_32_BITS = | 202 | 7.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.25k | if (READ_32_BITS) { | 205 | 7.25k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.25k | return word & mask; | 208 | 7.25k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.25k | } |
_ZN5doris11UnpackValueILi7ELi5ELb0EEEmPKh Line | Count | Source | 175 | 7.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.25k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.25k | constexpr bool READ_32_BITS = | 202 | 7.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.25k | if (READ_32_BITS) { | 205 | 7.25k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.25k | return word & mask; | 208 | 7.25k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.25k | } |
_ZN5doris11UnpackValueILi7ELi4ELb0EEEmPKh Line | Count | Source | 175 | 7.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.25k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.25k | constexpr bool READ_32_BITS = | 202 | 7.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.25k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 7.25k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 7.25k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 7.25k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 7.25k | return word & mask; | 220 | 7.25k | } |
_ZN5doris11UnpackValueILi7ELi3ELb0EEEmPKh Line | Count | Source | 175 | 7.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.25k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.25k | constexpr bool READ_32_BITS = | 202 | 7.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.25k | if (READ_32_BITS) { | 205 | 7.25k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.25k | return word & mask; | 208 | 7.25k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.25k | } |
_ZN5doris11UnpackValueILi7ELi2ELb0EEEmPKh Line | Count | Source | 175 | 7.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.25k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.25k | constexpr bool READ_32_BITS = | 202 | 7.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.25k | if (READ_32_BITS) { | 205 | 7.25k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.25k | return word & mask; | 208 | 7.25k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.25k | } |
_ZN5doris11UnpackValueILi7ELi1ELb0EEEmPKh Line | Count | Source | 175 | 7.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.25k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.25k | constexpr bool READ_32_BITS = | 202 | 7.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.25k | if (READ_32_BITS) { | 205 | 7.25k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.25k | return word & mask; | 208 | 7.25k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.25k | } |
_ZN5doris11UnpackValueILi7ELi0ELb0EEEmPKh Line | Count | Source | 175 | 7.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.25k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.25k | constexpr bool READ_32_BITS = | 202 | 7.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.25k | if (READ_32_BITS) { | 205 | 7.25k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.25k | return word & mask; | 208 | 7.25k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.25k | } |
_ZN5doris11UnpackValueILi8ELi0ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi1ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi2ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi3ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi4ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi5ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi6ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi7ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi8ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi9ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi10ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi11ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi12ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi13ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi14ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi15ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi16ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi17ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi18ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi19ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi20ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi21ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi22ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi23ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi24ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi25ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi26ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi27ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi28ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi29ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi30ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
_ZN5doris11UnpackValueILi8ELi31ELb1EEEmPKh Line | Count | Source | 175 | 20.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 20.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 20.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 20.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 20.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 20.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 20.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 20.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 20.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 20.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 20.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 20.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 20.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 20.4k | constexpr bool READ_32_BITS = | 202 | 20.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 20.4k | if (READ_32_BITS) { | 205 | 20.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 20.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 20.4k | return word & mask; | 208 | 20.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 20.4k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi8ELi23ELb0EEEmPKh Line | Count | Source | 175 | 1.54k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.54k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.54k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.54k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.54k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.54k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.54k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.54k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.54k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.54k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.54k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.54k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.54k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.54k | constexpr bool READ_32_BITS = | 202 | 1.54k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.54k | if (READ_32_BITS) { | 205 | 1.54k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.54k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.54k | return word & mask; | 208 | 1.54k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.54k | } |
_ZN5doris11UnpackValueILi8ELi22ELb0EEEmPKh Line | Count | Source | 175 | 1.54k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.54k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.54k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.54k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.54k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.54k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.54k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.54k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.54k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.54k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.54k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.54k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.54k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.54k | constexpr bool READ_32_BITS = | 202 | 1.54k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.54k | if (READ_32_BITS) { | 205 | 1.54k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.54k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.54k | return word & mask; | 208 | 1.54k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.54k | } |
_ZN5doris11UnpackValueILi8ELi21ELb0EEEmPKh Line | Count | Source | 175 | 1.54k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.54k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.54k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.54k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.54k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.54k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.54k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.54k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.54k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.54k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.54k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.54k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.54k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.54k | constexpr bool READ_32_BITS = | 202 | 1.54k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.54k | if (READ_32_BITS) { | 205 | 1.54k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.54k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.54k | return word & mask; | 208 | 1.54k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.54k | } |
_ZN5doris11UnpackValueILi8ELi20ELb0EEEmPKh Line | Count | Source | 175 | 1.54k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.54k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.54k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.54k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.54k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.54k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.54k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.54k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.54k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.54k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.54k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.54k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.54k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.54k | constexpr bool READ_32_BITS = | 202 | 1.54k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.54k | if (READ_32_BITS) { | 205 | 1.54k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.54k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.54k | return word & mask; | 208 | 1.54k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.54k | } |
_ZN5doris11UnpackValueILi8ELi19ELb0EEEmPKh Line | Count | Source | 175 | 1.54k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.54k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.54k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.54k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.54k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.54k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.54k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.54k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.54k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.54k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.54k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.54k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.54k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.54k | constexpr bool READ_32_BITS = | 202 | 1.54k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.54k | if (READ_32_BITS) { | 205 | 1.54k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.54k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.54k | return word & mask; | 208 | 1.54k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.54k | } |
_ZN5doris11UnpackValueILi8ELi18ELb0EEEmPKh Line | Count | Source | 175 | 1.54k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.54k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.54k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.54k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.54k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.54k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.54k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.54k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.54k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.54k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.54k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.54k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.54k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.54k | constexpr bool READ_32_BITS = | 202 | 1.54k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.54k | if (READ_32_BITS) { | 205 | 1.54k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.54k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.54k | return word & mask; | 208 | 1.54k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.54k | } |
_ZN5doris11UnpackValueILi8ELi17ELb0EEEmPKh Line | Count | Source | 175 | 1.54k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.54k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.54k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.54k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.54k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.54k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.54k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.54k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.54k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.54k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.54k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.54k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.54k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.54k | constexpr bool READ_32_BITS = | 202 | 1.54k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.54k | if (READ_32_BITS) { | 205 | 1.54k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.54k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.54k | return word & mask; | 208 | 1.54k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.54k | } |
_ZN5doris11UnpackValueILi8ELi16ELb0EEEmPKh Line | Count | Source | 175 | 1.54k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.54k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.54k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.54k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.54k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.54k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.54k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.54k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.54k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.54k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.54k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.54k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.54k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.54k | constexpr bool READ_32_BITS = | 202 | 1.54k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.54k | if (READ_32_BITS) { | 205 | 1.54k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.54k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.54k | return word & mask; | 208 | 1.54k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 1.54k | } |
_ZN5doris11UnpackValueILi8ELi15ELb0EEEmPKh Line | Count | Source | 175 | 2.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.18k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.18k | constexpr bool READ_32_BITS = | 202 | 2.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.18k | if (READ_32_BITS) { | 205 | 2.18k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.18k | return word & mask; | 208 | 2.18k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.18k | } |
_ZN5doris11UnpackValueILi8ELi14ELb0EEEmPKh Line | Count | Source | 175 | 2.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.18k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.18k | constexpr bool READ_32_BITS = | 202 | 2.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.18k | if (READ_32_BITS) { | 205 | 2.18k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.18k | return word & mask; | 208 | 2.18k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.18k | } |
_ZN5doris11UnpackValueILi8ELi13ELb0EEEmPKh Line | Count | Source | 175 | 2.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.18k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.18k | constexpr bool READ_32_BITS = | 202 | 2.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.18k | if (READ_32_BITS) { | 205 | 2.18k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.18k | return word & mask; | 208 | 2.18k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.18k | } |
_ZN5doris11UnpackValueILi8ELi12ELb0EEEmPKh Line | Count | Source | 175 | 2.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.18k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.18k | constexpr bool READ_32_BITS = | 202 | 2.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.18k | if (READ_32_BITS) { | 205 | 2.18k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.18k | return word & mask; | 208 | 2.18k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.18k | } |
_ZN5doris11UnpackValueILi8ELi11ELb0EEEmPKh Line | Count | Source | 175 | 2.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.18k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.18k | constexpr bool READ_32_BITS = | 202 | 2.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.18k | if (READ_32_BITS) { | 205 | 2.18k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.18k | return word & mask; | 208 | 2.18k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.18k | } |
_ZN5doris11UnpackValueILi8ELi10ELb0EEEmPKh Line | Count | Source | 175 | 2.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.18k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.18k | constexpr bool READ_32_BITS = | 202 | 2.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.18k | if (READ_32_BITS) { | 205 | 2.18k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.18k | return word & mask; | 208 | 2.18k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.18k | } |
_ZN5doris11UnpackValueILi8ELi9ELb0EEEmPKh Line | Count | Source | 175 | 2.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.18k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.18k | constexpr bool READ_32_BITS = | 202 | 2.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.18k | if (READ_32_BITS) { | 205 | 2.18k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.18k | return word & mask; | 208 | 2.18k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.18k | } |
_ZN5doris11UnpackValueILi8ELi8ELb0EEEmPKh Line | Count | Source | 175 | 2.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.18k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.18k | constexpr bool READ_32_BITS = | 202 | 2.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.18k | if (READ_32_BITS) { | 205 | 2.18k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.18k | return word & mask; | 208 | 2.18k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.18k | } |
_ZN5doris11UnpackValueILi8ELi7ELb0EEEmPKh Line | Count | Source | 175 | 7.80k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.80k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.80k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.80k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.80k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.80k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.80k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.80k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.80k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.80k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.80k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.80k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.80k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.80k | constexpr bool READ_32_BITS = | 202 | 7.80k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.80k | if (READ_32_BITS) { | 205 | 7.80k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.80k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.80k | return word & mask; | 208 | 7.80k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.80k | } |
_ZN5doris11UnpackValueILi8ELi6ELb0EEEmPKh Line | Count | Source | 175 | 7.80k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.80k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.80k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.80k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.80k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.80k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.80k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.80k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.80k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.80k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.80k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.80k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.80k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.80k | constexpr bool READ_32_BITS = | 202 | 7.80k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.80k | if (READ_32_BITS) { | 205 | 7.80k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.80k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.80k | return word & mask; | 208 | 7.80k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.80k | } |
_ZN5doris11UnpackValueILi8ELi5ELb0EEEmPKh Line | Count | Source | 175 | 7.80k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.80k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.80k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.80k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.80k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.80k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.80k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.80k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.80k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.80k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.80k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.80k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.80k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.80k | constexpr bool READ_32_BITS = | 202 | 7.80k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.80k | if (READ_32_BITS) { | 205 | 7.80k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.80k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.80k | return word & mask; | 208 | 7.80k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.80k | } |
_ZN5doris11UnpackValueILi8ELi4ELb0EEEmPKh Line | Count | Source | 175 | 7.80k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.80k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.80k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.80k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.80k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.80k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.80k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.80k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.80k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.80k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.80k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.80k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.80k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.80k | constexpr bool READ_32_BITS = | 202 | 7.80k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.80k | if (READ_32_BITS) { | 205 | 7.80k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.80k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.80k | return word & mask; | 208 | 7.80k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.80k | } |
_ZN5doris11UnpackValueILi8ELi3ELb0EEEmPKh Line | Count | Source | 175 | 7.80k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.80k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.80k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.80k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.80k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.80k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.80k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.80k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.80k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.80k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.80k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.80k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.80k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.80k | constexpr bool READ_32_BITS = | 202 | 7.80k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.80k | if (READ_32_BITS) { | 205 | 7.80k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.80k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.80k | return word & mask; | 208 | 7.80k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.80k | } |
_ZN5doris11UnpackValueILi8ELi2ELb0EEEmPKh Line | Count | Source | 175 | 7.80k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.80k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.80k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.80k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.80k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.80k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.80k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.80k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.80k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.80k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.80k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.80k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.80k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.80k | constexpr bool READ_32_BITS = | 202 | 7.80k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.80k | if (READ_32_BITS) { | 205 | 7.80k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.80k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.80k | return word & mask; | 208 | 7.80k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.80k | } |
_ZN5doris11UnpackValueILi8ELi1ELb0EEEmPKh Line | Count | Source | 175 | 7.80k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.80k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.80k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.80k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.80k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.80k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.80k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.80k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.80k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.80k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.80k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.80k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.80k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.80k | constexpr bool READ_32_BITS = | 202 | 7.80k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.80k | if (READ_32_BITS) { | 205 | 7.80k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.80k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.80k | return word & mask; | 208 | 7.80k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.80k | } |
_ZN5doris11UnpackValueILi8ELi0ELb0EEEmPKh Line | Count | Source | 175 | 7.80k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 7.80k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 7.80k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 7.80k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 7.80k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 7.80k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 7.80k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 7.80k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 7.80k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 7.80k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 7.80k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 7.80k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 7.80k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 7.80k | constexpr bool READ_32_BITS = | 202 | 7.80k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 7.80k | if (READ_32_BITS) { | 205 | 7.80k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 7.80k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 7.80k | return word & mask; | 208 | 7.80k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 7.80k | } |
_ZN5doris11UnpackValueILi9ELi0ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi1ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi2ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi3ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi4ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi5ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi6ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi7ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi8ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi9ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi10ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi11ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi12ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi13ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi14ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi15ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi16ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi17ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi18ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi19ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi20ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi21ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi22ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi23ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi24ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi25ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi26ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi27ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi28ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 38.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 38.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 38.1k | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi29ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 38.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 38.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 38.1k | return word & mask; | 208 | 38.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi30ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 38.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 38.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 38.1k | return word & mask; | 208 | 38.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 38.1k | } |
_ZN5doris11UnpackValueILi9ELi31ELb1EEEmPKh Line | Count | Source | 175 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 38.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 38.1k | constexpr bool READ_32_BITS = | 202 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 38.1k | if (READ_32_BITS) { | 205 | 38.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 38.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 38.1k | return word & mask; | 208 | 38.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 38.1k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi9ELi23ELb0EEEmPKh Line | Count | Source | 175 | 2.89k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.89k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.89k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.89k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.89k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.89k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.89k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.89k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.89k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.89k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.89k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.89k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.89k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.89k | constexpr bool READ_32_BITS = | 202 | 2.89k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.89k | if (READ_32_BITS) { | 205 | 2.89k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.89k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.89k | return word & mask; | 208 | 2.89k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.89k | } |
_ZN5doris11UnpackValueILi9ELi22ELb0EEEmPKh Line | Count | Source | 175 | 2.89k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.89k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.89k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.89k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.89k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.89k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.89k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.89k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.89k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.89k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.89k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.89k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.89k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.89k | constexpr bool READ_32_BITS = | 202 | 2.89k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.89k | if (READ_32_BITS) { | 205 | 2.89k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.89k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.89k | return word & mask; | 208 | 2.89k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.89k | } |
_ZN5doris11UnpackValueILi9ELi21ELb0EEEmPKh Line | Count | Source | 175 | 2.89k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.89k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.89k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.89k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.89k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.89k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.89k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.89k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.89k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.89k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.89k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.89k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.89k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.89k | constexpr bool READ_32_BITS = | 202 | 2.89k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.89k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.89k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.89k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.89k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.89k | return word & mask; | 220 | 2.89k | } |
_ZN5doris11UnpackValueILi9ELi20ELb0EEEmPKh Line | Count | Source | 175 | 2.89k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.89k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.89k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.89k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.89k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.89k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.89k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.89k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.89k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.89k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.89k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.89k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.89k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.89k | constexpr bool READ_32_BITS = | 202 | 2.89k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.89k | if (READ_32_BITS) { | 205 | 2.89k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.89k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.89k | return word & mask; | 208 | 2.89k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.89k | } |
_ZN5doris11UnpackValueILi9ELi19ELb0EEEmPKh Line | Count | Source | 175 | 2.89k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.89k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.89k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.89k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.89k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.89k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.89k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.89k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.89k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.89k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.89k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.89k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.89k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.89k | constexpr bool READ_32_BITS = | 202 | 2.89k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.89k | if (READ_32_BITS) { | 205 | 2.89k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.89k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.89k | return word & mask; | 208 | 2.89k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.89k | } |
_ZN5doris11UnpackValueILi9ELi18ELb0EEEmPKh Line | Count | Source | 175 | 2.89k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.89k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.89k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.89k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.89k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.89k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.89k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.89k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.89k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.89k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.89k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.89k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.89k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.89k | constexpr bool READ_32_BITS = | 202 | 2.89k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.89k | if (READ_32_BITS) { | 205 | 2.89k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.89k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.89k | return word & mask; | 208 | 2.89k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.89k | } |
_ZN5doris11UnpackValueILi9ELi17ELb0EEEmPKh Line | Count | Source | 175 | 2.89k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.89k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.89k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.89k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.89k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.89k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.89k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.89k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.89k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.89k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.89k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.89k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.89k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.89k | constexpr bool READ_32_BITS = | 202 | 2.89k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.89k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.89k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.89k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.89k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.89k | return word & mask; | 220 | 2.89k | } |
_ZN5doris11UnpackValueILi9ELi16ELb0EEEmPKh Line | Count | Source | 175 | 2.89k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.89k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.89k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.89k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.89k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.89k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.89k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.89k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.89k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.89k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.89k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.89k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.89k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.89k | constexpr bool READ_32_BITS = | 202 | 2.89k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.89k | if (READ_32_BITS) { | 205 | 2.89k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.89k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.89k | return word & mask; | 208 | 2.89k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.89k | } |
_ZN5doris11UnpackValueILi9ELi15ELb0EEEmPKh Line | Count | Source | 175 | 3.97k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.97k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.97k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.97k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.97k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.97k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.97k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.97k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.97k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.97k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.97k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.97k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.97k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.97k | constexpr bool READ_32_BITS = | 202 | 3.97k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.97k | if (READ_32_BITS) { | 205 | 3.97k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.97k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.97k | return word & mask; | 208 | 3.97k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 3.97k | } |
_ZN5doris11UnpackValueILi9ELi14ELb0EEEmPKh Line | Count | Source | 175 | 3.97k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.97k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.97k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.97k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.97k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.97k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.97k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.97k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.97k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.97k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.97k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.97k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.97k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.97k | constexpr bool READ_32_BITS = | 202 | 3.97k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.97k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.97k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.97k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.97k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.97k | return word & mask; | 220 | 3.97k | } |
_ZN5doris11UnpackValueILi9ELi13ELb0EEEmPKh Line | Count | Source | 175 | 3.97k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.97k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.97k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.97k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.97k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.97k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.97k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.97k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.97k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.97k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.97k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.97k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.97k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.97k | constexpr bool READ_32_BITS = | 202 | 3.97k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.97k | if (READ_32_BITS) { | 205 | 3.97k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.97k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.97k | return word & mask; | 208 | 3.97k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 3.97k | } |
_ZN5doris11UnpackValueILi9ELi12ELb0EEEmPKh Line | Count | Source | 175 | 3.97k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.97k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.97k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.97k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.97k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.97k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.97k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.97k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.97k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.97k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.97k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.97k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.97k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.97k | constexpr bool READ_32_BITS = | 202 | 3.97k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.97k | if (READ_32_BITS) { | 205 | 3.97k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.97k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.97k | return word & mask; | 208 | 3.97k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 3.97k | } |
_ZN5doris11UnpackValueILi9ELi11ELb0EEEmPKh Line | Count | Source | 175 | 3.97k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.97k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.97k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.97k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.97k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.97k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.97k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.97k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.97k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.97k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.97k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.97k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.97k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.97k | constexpr bool READ_32_BITS = | 202 | 3.97k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.97k | if (READ_32_BITS) { | 205 | 3.97k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.97k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.97k | return word & mask; | 208 | 3.97k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 3.97k | } |
_ZN5doris11UnpackValueILi9ELi10ELb0EEEmPKh Line | Count | Source | 175 | 3.97k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.97k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.97k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.97k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.97k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.97k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.97k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.97k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.97k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.97k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.97k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.97k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.97k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.97k | constexpr bool READ_32_BITS = | 202 | 3.97k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.97k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.97k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.97k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.97k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.97k | return word & mask; | 220 | 3.97k | } |
_ZN5doris11UnpackValueILi9ELi9ELb0EEEmPKh Line | Count | Source | 175 | 3.97k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.97k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.97k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.97k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.97k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.97k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.97k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.97k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.97k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.97k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.97k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.97k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.97k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.97k | constexpr bool READ_32_BITS = | 202 | 3.97k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.97k | if (READ_32_BITS) { | 205 | 3.97k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.97k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.97k | return word & mask; | 208 | 3.97k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 3.97k | } |
_ZN5doris11UnpackValueILi9ELi8ELb0EEEmPKh Line | Count | Source | 175 | 3.97k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.97k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.97k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.97k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.97k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.97k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.97k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.97k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.97k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.97k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.97k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.97k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.97k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.97k | constexpr bool READ_32_BITS = | 202 | 3.97k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.97k | if (READ_32_BITS) { | 205 | 3.97k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.97k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.97k | return word & mask; | 208 | 3.97k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 3.97k | } |
_ZN5doris11UnpackValueILi9ELi7ELb0EEEmPKh Line | Count | Source | 175 | 13.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 13.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 13.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 13.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 13.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 13.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 13.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 13.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 13.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 13.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 13.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 13.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 13.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 13.1k | constexpr bool READ_32_BITS = | 202 | 13.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 13.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 13.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 13.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 13.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 13.1k | return word & mask; | 220 | 13.1k | } |
_ZN5doris11UnpackValueILi9ELi6ELb0EEEmPKh Line | Count | Source | 175 | 13.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 13.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 13.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 13.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 13.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 13.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 13.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 13.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 13.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 13.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 13.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 13.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 13.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 13.1k | constexpr bool READ_32_BITS = | 202 | 13.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 13.1k | if (READ_32_BITS) { | 205 | 13.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 13.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 13.1k | return word & mask; | 208 | 13.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 13.1k | } |
_ZN5doris11UnpackValueILi9ELi5ELb0EEEmPKh Line | Count | Source | 175 | 13.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 13.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 13.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 13.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 13.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 13.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 13.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 13.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 13.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 13.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 13.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 13.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 13.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 13.1k | constexpr bool READ_32_BITS = | 202 | 13.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 13.1k | if (READ_32_BITS) { | 205 | 13.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 13.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 13.1k | return word & mask; | 208 | 13.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 13.1k | } |
_ZN5doris11UnpackValueILi9ELi4ELb0EEEmPKh Line | Count | Source | 175 | 13.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 13.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 13.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 13.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 13.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 13.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 13.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 13.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 13.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 13.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 13.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 13.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 13.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 13.1k | constexpr bool READ_32_BITS = | 202 | 13.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 13.1k | if (READ_32_BITS) { | 205 | 13.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 13.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 13.1k | return word & mask; | 208 | 13.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 13.1k | } |
_ZN5doris11UnpackValueILi9ELi3ELb0EEEmPKh Line | Count | Source | 175 | 13.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 13.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 13.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 13.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 13.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 13.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 13.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 13.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 13.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 13.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 13.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 13.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 13.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 13.1k | constexpr bool READ_32_BITS = | 202 | 13.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 13.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 13.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 13.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 13.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 13.1k | return word & mask; | 220 | 13.1k | } |
_ZN5doris11UnpackValueILi9ELi2ELb0EEEmPKh Line | Count | Source | 175 | 13.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 13.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 13.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 13.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 13.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 13.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 13.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 13.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 13.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 13.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 13.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 13.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 13.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 13.1k | constexpr bool READ_32_BITS = | 202 | 13.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 13.1k | if (READ_32_BITS) { | 205 | 13.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 13.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 13.1k | return word & mask; | 208 | 13.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 13.1k | } |
_ZN5doris11UnpackValueILi9ELi1ELb0EEEmPKh Line | Count | Source | 175 | 13.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 13.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 13.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 13.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 13.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 13.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 13.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 13.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 13.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 13.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 13.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 13.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 13.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 13.1k | constexpr bool READ_32_BITS = | 202 | 13.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 13.1k | if (READ_32_BITS) { | 205 | 13.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 13.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 13.1k | return word & mask; | 208 | 13.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 13.1k | } |
_ZN5doris11UnpackValueILi9ELi0ELb0EEEmPKh Line | Count | Source | 175 | 13.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 13.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 13.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 13.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 13.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 13.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 13.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 13.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 13.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 13.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 13.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 13.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 13.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 13.1k | constexpr bool READ_32_BITS = | 202 | 13.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 13.1k | if (READ_32_BITS) { | 205 | 13.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 13.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 13.1k | return word & mask; | 208 | 13.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 13.1k | } |
_ZN5doris11UnpackValueILi10ELi0ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi1ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi2ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi3ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi4ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi5ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi6ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi7ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi8ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi9ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi10ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi11ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi12ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi13ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi14ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi15ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi16ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi17ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi18ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi19ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi20ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi21ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi22ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi23ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi24ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi25ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi26ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi27ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi28ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 36.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 36.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 36.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 36.4k | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi29ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 36.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 36.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 36.4k | return word & mask; | 208 | 36.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi30ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 36.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 36.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 36.4k | return word & mask; | 208 | 36.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 36.4k | } |
_ZN5doris11UnpackValueILi10ELi31ELb1EEEmPKh Line | Count | Source | 175 | 36.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 36.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 36.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 36.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 36.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 36.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 36.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 36.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 36.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 36.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 36.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 36.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 36.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 36.4k | constexpr bool READ_32_BITS = | 202 | 36.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 36.4k | if (READ_32_BITS) { | 205 | 36.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 36.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 36.4k | return word & mask; | 208 | 36.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 36.4k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi10ELi23ELb0EEEmPKh Line | Count | Source | 175 | 2.86k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.86k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.86k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.86k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.86k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.86k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.86k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.86k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.86k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.86k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.86k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.86k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.86k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.86k | constexpr bool READ_32_BITS = | 202 | 2.86k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.86k | if (READ_32_BITS) { | 205 | 2.86k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.86k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.86k | return word & mask; | 208 | 2.86k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.86k | } |
_ZN5doris11UnpackValueILi10ELi22ELb0EEEmPKh Line | Count | Source | 175 | 2.86k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.86k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.86k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.86k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.86k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.86k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.86k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.86k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.86k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.86k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.86k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.86k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.86k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.86k | constexpr bool READ_32_BITS = | 202 | 2.86k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.86k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.86k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.86k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.86k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.86k | return word & mask; | 220 | 2.86k | } |
_ZN5doris11UnpackValueILi10ELi21ELb0EEEmPKh Line | Count | Source | 175 | 2.86k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.86k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.86k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.86k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.86k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.86k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.86k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.86k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.86k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.86k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.86k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.86k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.86k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.86k | constexpr bool READ_32_BITS = | 202 | 2.86k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.86k | if (READ_32_BITS) { | 205 | 2.86k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.86k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.86k | return word & mask; | 208 | 2.86k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.86k | } |
_ZN5doris11UnpackValueILi10ELi20ELb0EEEmPKh Line | Count | Source | 175 | 2.86k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.86k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.86k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.86k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.86k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.86k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.86k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.86k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.86k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.86k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.86k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.86k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.86k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.86k | constexpr bool READ_32_BITS = | 202 | 2.86k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.86k | if (READ_32_BITS) { | 205 | 2.86k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.86k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.86k | return word & mask; | 208 | 2.86k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.86k | } |
_ZN5doris11UnpackValueILi10ELi19ELb0EEEmPKh Line | Count | Source | 175 | 2.86k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.86k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.86k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.86k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.86k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.86k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.86k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.86k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.86k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.86k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.86k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.86k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.86k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.86k | constexpr bool READ_32_BITS = | 202 | 2.86k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.86k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.86k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.86k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.86k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.86k | return word & mask; | 220 | 2.86k | } |
_ZN5doris11UnpackValueILi10ELi18ELb0EEEmPKh Line | Count | Source | 175 | 2.86k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.86k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.86k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.86k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.86k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.86k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.86k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.86k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.86k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.86k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.86k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.86k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.86k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.86k | constexpr bool READ_32_BITS = | 202 | 2.86k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.86k | if (READ_32_BITS) { | 205 | 2.86k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.86k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.86k | return word & mask; | 208 | 2.86k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.86k | } |
_ZN5doris11UnpackValueILi10ELi17ELb0EEEmPKh Line | Count | Source | 175 | 2.86k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.86k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.86k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.86k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.86k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.86k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.86k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.86k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.86k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.86k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.86k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.86k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.86k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.86k | constexpr bool READ_32_BITS = | 202 | 2.86k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.86k | if (READ_32_BITS) { | 205 | 2.86k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.86k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.86k | return word & mask; | 208 | 2.86k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.86k | } |
_ZN5doris11UnpackValueILi10ELi16ELb0EEEmPKh Line | Count | Source | 175 | 2.86k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.86k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.86k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.86k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.86k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.86k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.86k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.86k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.86k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.86k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.86k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.86k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.86k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.86k | constexpr bool READ_32_BITS = | 202 | 2.86k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.86k | if (READ_32_BITS) { | 205 | 2.86k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.86k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.86k | return word & mask; | 208 | 2.86k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 2.86k | } |
_ZN5doris11UnpackValueILi10ELi15ELb0EEEmPKh Line | Count | Source | 175 | 3.31k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.31k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.31k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.31k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.31k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.31k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.31k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.31k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.31k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.31k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.31k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.31k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.31k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.31k | constexpr bool READ_32_BITS = | 202 | 3.31k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.31k | if (READ_32_BITS) { | 205 | 3.31k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.31k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.31k | return word & mask; | 208 | 3.31k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 3.31k | } |
_ZN5doris11UnpackValueILi10ELi14ELb0EEEmPKh Line | Count | Source | 175 | 3.31k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.31k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.31k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.31k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.31k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.31k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.31k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.31k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.31k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.31k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.31k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.31k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.31k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.31k | constexpr bool READ_32_BITS = | 202 | 3.31k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.31k | if (READ_32_BITS) { | 205 | 3.31k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.31k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.31k | return word & mask; | 208 | 3.31k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 3.31k | } |
_ZN5doris11UnpackValueILi10ELi13ELb0EEEmPKh Line | Count | Source | 175 | 3.31k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.31k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.31k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.31k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.31k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.31k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.31k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.31k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.31k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.31k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.31k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.31k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.31k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.31k | constexpr bool READ_32_BITS = | 202 | 3.31k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.31k | if (READ_32_BITS) { | 205 | 3.31k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.31k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.31k | return word & mask; | 208 | 3.31k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 3.31k | } |
_ZN5doris11UnpackValueILi10ELi12ELb0EEEmPKh Line | Count | Source | 175 | 3.31k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.31k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.31k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.31k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.31k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.31k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.31k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.31k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.31k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.31k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.31k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.31k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.31k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.31k | constexpr bool READ_32_BITS = | 202 | 3.31k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.31k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.31k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.31k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.31k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.31k | return word & mask; | 220 | 3.31k | } |
_ZN5doris11UnpackValueILi10ELi11ELb0EEEmPKh Line | Count | Source | 175 | 3.31k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.31k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.31k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.31k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.31k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.31k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.31k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.31k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.31k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.31k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.31k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.31k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.31k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.31k | constexpr bool READ_32_BITS = | 202 | 3.31k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.31k | if (READ_32_BITS) { | 205 | 3.31k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.31k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.31k | return word & mask; | 208 | 3.31k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 3.31k | } |
_ZN5doris11UnpackValueILi10ELi10ELb0EEEmPKh Line | Count | Source | 175 | 3.31k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.31k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.31k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.31k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.31k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.31k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.31k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.31k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.31k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.31k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.31k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.31k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.31k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.31k | constexpr bool READ_32_BITS = | 202 | 3.31k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.31k | if (READ_32_BITS) { | 205 | 3.31k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.31k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.31k | return word & mask; | 208 | 3.31k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 3.31k | } |
_ZN5doris11UnpackValueILi10ELi9ELb0EEEmPKh Line | Count | Source | 175 | 3.31k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.31k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.31k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.31k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.31k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.31k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.31k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.31k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.31k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.31k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.31k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.31k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.31k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.31k | constexpr bool READ_32_BITS = | 202 | 3.31k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.31k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.31k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.31k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.31k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.31k | return word & mask; | 220 | 3.31k | } |
_ZN5doris11UnpackValueILi10ELi8ELb0EEEmPKh Line | Count | Source | 175 | 3.31k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.31k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.31k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.31k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.31k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.31k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.31k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.31k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.31k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.31k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.31k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.31k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.31k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.31k | constexpr bool READ_32_BITS = | 202 | 3.31k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.31k | if (READ_32_BITS) { | 205 | 3.31k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.31k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.31k | return word & mask; | 208 | 3.31k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 3.31k | } |
_ZN5doris11UnpackValueILi10ELi7ELb0EEEmPKh Line | Count | Source | 175 | 11.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.5k | constexpr bool READ_32_BITS = | 202 | 11.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.5k | if (READ_32_BITS) { | 205 | 11.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 11.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 11.5k | return word & mask; | 208 | 11.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 11.5k | } |
_ZN5doris11UnpackValueILi10ELi6ELb0EEEmPKh Line | Count | Source | 175 | 11.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.5k | constexpr bool READ_32_BITS = | 202 | 11.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.5k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.5k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.5k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.5k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.5k | return word & mask; | 220 | 11.5k | } |
_ZN5doris11UnpackValueILi10ELi5ELb0EEEmPKh Line | Count | Source | 175 | 11.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.5k | constexpr bool READ_32_BITS = | 202 | 11.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.5k | if (READ_32_BITS) { | 205 | 11.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 11.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 11.5k | return word & mask; | 208 | 11.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 11.5k | } |
_ZN5doris11UnpackValueILi10ELi4ELb0EEEmPKh Line | Count | Source | 175 | 11.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.5k | constexpr bool READ_32_BITS = | 202 | 11.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.5k | if (READ_32_BITS) { | 205 | 11.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 11.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 11.5k | return word & mask; | 208 | 11.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 11.5k | } |
_ZN5doris11UnpackValueILi10ELi3ELb0EEEmPKh Line | Count | Source | 175 | 11.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.5k | constexpr bool READ_32_BITS = | 202 | 11.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.5k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 11.5k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 11.5k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 11.5k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 11.5k | return word & mask; | 220 | 11.5k | } |
_ZN5doris11UnpackValueILi10ELi2ELb0EEEmPKh Line | Count | Source | 175 | 11.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.5k | constexpr bool READ_32_BITS = | 202 | 11.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.5k | if (READ_32_BITS) { | 205 | 11.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 11.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 11.5k | return word & mask; | 208 | 11.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 11.5k | } |
_ZN5doris11UnpackValueILi10ELi1ELb0EEEmPKh Line | Count | Source | 175 | 11.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.5k | constexpr bool READ_32_BITS = | 202 | 11.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.5k | if (READ_32_BITS) { | 205 | 11.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 11.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 11.5k | return word & mask; | 208 | 11.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 11.5k | } |
_ZN5doris11UnpackValueILi10ELi0ELb0EEEmPKh Line | Count | Source | 175 | 11.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 11.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 11.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 11.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 11.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 11.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 11.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 11.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 11.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 11.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 11.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 11.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 11.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 11.5k | constexpr bool READ_32_BITS = | 202 | 11.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 11.5k | if (READ_32_BITS) { | 205 | 11.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 11.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 11.5k | return word & mask; | 208 | 11.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 11.5k | } |
_ZN5doris11UnpackValueILi11ELi0ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi1ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi2ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi3ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi4ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi5ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi6ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi7ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi8ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi9ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi10ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi11ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi12ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi13ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi14ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi15ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi16ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi17ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi18ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi19ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi20ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi21ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi22ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi23ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi24ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi25ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi26ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi27ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi28ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi29ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 518k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 518k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 518k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 518k | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi30ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 518k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 518k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 518k | return word & mask; | 208 | 518k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 518k | } |
_ZN5doris11UnpackValueILi11ELi31ELb1EEEmPKh Line | Count | Source | 175 | 518k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 518k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 518k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 518k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 518k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 518k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 518k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 518k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 518k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 518k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 518k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 518k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 518k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 518k | constexpr bool READ_32_BITS = | 202 | 518k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 518k | if (READ_32_BITS) { | 205 | 518k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 518k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 518k | return word & mask; | 208 | 518k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 518k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi11ELi23ELb0EEEmPKh Line | Count | Source | 175 | 35.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.5k | constexpr bool READ_32_BITS = | 202 | 35.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.5k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 35.5k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 35.5k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 35.5k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 35.5k | return word & mask; | 220 | 35.5k | } |
_ZN5doris11UnpackValueILi11ELi22ELb0EEEmPKh Line | Count | Source | 175 | 35.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.5k | constexpr bool READ_32_BITS = | 202 | 35.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.5k | if (READ_32_BITS) { | 205 | 35.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 35.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 35.5k | return word & mask; | 208 | 35.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 35.5k | } |
_ZN5doris11UnpackValueILi11ELi21ELb0EEEmPKh Line | Count | Source | 175 | 35.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.5k | constexpr bool READ_32_BITS = | 202 | 35.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.5k | if (READ_32_BITS) { | 205 | 35.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 35.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 35.5k | return word & mask; | 208 | 35.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 35.5k | } |
_ZN5doris11UnpackValueILi11ELi20ELb0EEEmPKh Line | Count | Source | 175 | 35.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.5k | constexpr bool READ_32_BITS = | 202 | 35.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.5k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 35.5k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 35.5k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 35.5k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 35.5k | return word & mask; | 220 | 35.5k | } |
_ZN5doris11UnpackValueILi11ELi19ELb0EEEmPKh Line | Count | Source | 175 | 35.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.5k | constexpr bool READ_32_BITS = | 202 | 35.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.5k | if (READ_32_BITS) { | 205 | 35.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 35.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 35.5k | return word & mask; | 208 | 35.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 35.5k | } |
_ZN5doris11UnpackValueILi11ELi18ELb0EEEmPKh Line | Count | Source | 175 | 35.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.5k | constexpr bool READ_32_BITS = | 202 | 35.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.5k | if (READ_32_BITS) { | 205 | 35.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 35.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 35.5k | return word & mask; | 208 | 35.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 35.5k | } |
_ZN5doris11UnpackValueILi11ELi17ELb0EEEmPKh Line | Count | Source | 175 | 35.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.5k | constexpr bool READ_32_BITS = | 202 | 35.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.5k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 35.5k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 35.5k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 35.5k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 35.5k | return word & mask; | 220 | 35.5k | } |
_ZN5doris11UnpackValueILi11ELi16ELb0EEEmPKh Line | Count | Source | 175 | 35.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.5k | constexpr bool READ_32_BITS = | 202 | 35.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.5k | if (READ_32_BITS) { | 205 | 35.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 35.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 35.5k | return word & mask; | 208 | 35.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 35.5k | } |
_ZN5doris11UnpackValueILi11ELi15ELb0EEEmPKh Line | Count | Source | 175 | 35.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.7k | constexpr bool READ_32_BITS = | 202 | 35.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.7k | if (READ_32_BITS) { | 205 | 35.7k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 35.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 35.7k | return word & mask; | 208 | 35.7k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 35.7k | } |
_ZN5doris11UnpackValueILi11ELi14ELb0EEEmPKh Line | Count | Source | 175 | 35.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.7k | constexpr bool READ_32_BITS = | 202 | 35.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.7k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 35.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 35.7k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 35.7k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 35.7k | return word & mask; | 220 | 35.7k | } |
_ZN5doris11UnpackValueILi11ELi13ELb0EEEmPKh Line | Count | Source | 175 | 35.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.7k | constexpr bool READ_32_BITS = | 202 | 35.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.7k | if (READ_32_BITS) { | 205 | 35.7k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 35.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 35.7k | return word & mask; | 208 | 35.7k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 35.7k | } |
_ZN5doris11UnpackValueILi11ELi12ELb0EEEmPKh Line | Count | Source | 175 | 35.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.7k | constexpr bool READ_32_BITS = | 202 | 35.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.7k | if (READ_32_BITS) { | 205 | 35.7k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 35.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 35.7k | return word & mask; | 208 | 35.7k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 35.7k | } |
_ZN5doris11UnpackValueILi11ELi11ELb0EEEmPKh Line | Count | Source | 175 | 35.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.7k | constexpr bool READ_32_BITS = | 202 | 35.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.7k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 35.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 35.7k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 35.7k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 35.7k | return word & mask; | 220 | 35.7k | } |
_ZN5doris11UnpackValueILi11ELi10ELb0EEEmPKh Line | Count | Source | 175 | 35.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.7k | constexpr bool READ_32_BITS = | 202 | 35.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.7k | if (READ_32_BITS) { | 205 | 35.7k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 35.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 35.7k | return word & mask; | 208 | 35.7k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 35.7k | } |
_ZN5doris11UnpackValueILi11ELi9ELb0EEEmPKh Line | Count | Source | 175 | 35.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.7k | constexpr bool READ_32_BITS = | 202 | 35.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.7k | if (READ_32_BITS) { | 205 | 35.7k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 35.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 35.7k | return word & mask; | 208 | 35.7k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 35.7k | } |
_ZN5doris11UnpackValueILi11ELi8ELb0EEEmPKh Line | Count | Source | 175 | 35.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 35.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 35.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 35.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 35.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 35.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 35.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 35.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 35.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 35.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 35.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 35.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 35.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 35.7k | constexpr bool READ_32_BITS = | 202 | 35.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 35.7k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 35.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 35.7k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 35.7k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 35.7k | return word & mask; | 220 | 35.7k | } |
_ZN5doris11UnpackValueILi11ELi7ELb0EEEmPKh Line | Count | Source | 175 | 40.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 40.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 40.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 40.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 40.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 40.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 40.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 40.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 40.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 40.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 40.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 40.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 40.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 40.0k | constexpr bool READ_32_BITS = | 202 | 40.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 40.0k | if (READ_32_BITS) { | 205 | 40.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 40.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 40.0k | return word & mask; | 208 | 40.0k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 40.0k | } |
_ZN5doris11UnpackValueILi11ELi6ELb0EEEmPKh Line | Count | Source | 175 | 40.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 40.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 40.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 40.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 40.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 40.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 40.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 40.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 40.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 40.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 40.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 40.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 40.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 40.0k | constexpr bool READ_32_BITS = | 202 | 40.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 40.0k | if (READ_32_BITS) { | 205 | 40.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 40.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 40.0k | return word & mask; | 208 | 40.0k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 40.0k | } |
_ZN5doris11UnpackValueILi11ELi5ELb0EEEmPKh Line | Count | Source | 175 | 40.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 40.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 40.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 40.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 40.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 40.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 40.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 40.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 40.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 40.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 40.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 40.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 40.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 40.0k | constexpr bool READ_32_BITS = | 202 | 40.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 40.0k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 40.0k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 40.0k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 40.0k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 40.0k | return word & mask; | 220 | 40.0k | } |
_ZN5doris11UnpackValueILi11ELi4ELb0EEEmPKh Line | Count | Source | 175 | 40.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 40.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 40.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 40.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 40.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 40.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 40.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 40.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 40.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 40.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 40.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 40.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 40.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 40.0k | constexpr bool READ_32_BITS = | 202 | 40.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 40.0k | if (READ_32_BITS) { | 205 | 40.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 40.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 40.0k | return word & mask; | 208 | 40.0k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 40.0k | } |
_ZN5doris11UnpackValueILi11ELi3ELb0EEEmPKh Line | Count | Source | 175 | 40.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 40.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 40.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 40.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 40.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 40.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 40.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 40.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 40.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 40.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 40.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 40.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 40.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 40.0k | constexpr bool READ_32_BITS = | 202 | 40.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 40.0k | if (READ_32_BITS) { | 205 | 40.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 40.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 40.0k | return word & mask; | 208 | 40.0k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 40.0k | } |
_ZN5doris11UnpackValueILi11ELi2ELb0EEEmPKh Line | Count | Source | 175 | 40.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 40.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 40.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 40.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 40.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 40.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 40.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 40.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 40.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 40.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 40.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 40.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 40.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 40.0k | constexpr bool READ_32_BITS = | 202 | 40.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 40.0k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 40.0k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 40.0k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 40.0k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 40.0k | return word & mask; | 220 | 40.0k | } |
_ZN5doris11UnpackValueILi11ELi1ELb0EEEmPKh Line | Count | Source | 175 | 40.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 40.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 40.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 40.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 40.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 40.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 40.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 40.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 40.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 40.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 40.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 40.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 40.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 40.0k | constexpr bool READ_32_BITS = | 202 | 40.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 40.0k | if (READ_32_BITS) { | 205 | 40.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 40.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 40.0k | return word & mask; | 208 | 40.0k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 40.0k | } |
_ZN5doris11UnpackValueILi11ELi0ELb0EEEmPKh Line | Count | Source | 175 | 40.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 40.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 40.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 40.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 40.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 40.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 40.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 40.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 40.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 40.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 40.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 40.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 40.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 40.0k | constexpr bool READ_32_BITS = | 202 | 40.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 40.0k | if (READ_32_BITS) { | 205 | 40.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 40.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 40.0k | return word & mask; | 208 | 40.0k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 40.0k | } |
_ZN5doris11UnpackValueILi12ELi0ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi1ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi2ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi3ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi4ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi5ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi6ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi7ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi8ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi9ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi10ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi11ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi12ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi13ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi14ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi15ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi16ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi17ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi18ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi19ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi20ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi21ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi22ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi23ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi24ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi25ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi26ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi27ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi28ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi29ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 3.78M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.78M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.78M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.78M | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi30ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 3.78M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.78M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.78M | return word & mask; | 208 | 3.78M | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 3.78M | } |
_ZN5doris11UnpackValueILi12ELi31ELb1EEEmPKh Line | Count | Source | 175 | 3.78M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 3.78M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 3.78M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 3.78M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 3.78M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 3.78M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 3.78M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 3.78M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 3.78M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 3.78M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 3.78M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 3.78M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 3.78M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 3.78M | constexpr bool READ_32_BITS = | 202 | 3.78M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 3.78M | if (READ_32_BITS) { | 205 | 3.78M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 3.78M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 3.78M | return word & mask; | 208 | 3.78M | } | 209 | | | 210 | 34 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 34 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 34 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 34 | return word & mask; | 220 | 3.78M | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi12ELi23ELb0EEEmPKh Line | Count | Source | 175 | 256k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 256k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 256k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 256k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 256k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 256k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 256k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 256k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 256k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 256k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 256k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 256k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 256k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 256k | constexpr bool READ_32_BITS = | 202 | 256k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 256k | if (READ_32_BITS) { | 205 | 256k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 256k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 256k | return word & mask; | 208 | 256k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 256k | } |
_ZN5doris11UnpackValueILi12ELi22ELb0EEEmPKh Line | Count | Source | 175 | 256k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 256k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 256k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 256k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 256k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 256k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 256k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 256k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 256k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 256k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 256k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 256k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 256k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 256k | constexpr bool READ_32_BITS = | 202 | 256k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 256k | if (READ_32_BITS) { | 205 | 256k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 256k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 256k | return word & mask; | 208 | 256k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 256k | } |
_ZN5doris11UnpackValueILi12ELi21ELb0EEEmPKh Line | Count | Source | 175 | 256k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 256k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 256k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 256k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 256k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 256k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 256k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 256k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 256k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 256k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 256k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 256k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 256k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 256k | constexpr bool READ_32_BITS = | 202 | 256k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 256k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 256k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 256k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 256k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 256k | return word & mask; | 220 | 256k | } |
_ZN5doris11UnpackValueILi12ELi20ELb0EEEmPKh Line | Count | Source | 175 | 256k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 256k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 256k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 256k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 256k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 256k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 256k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 256k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 256k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 256k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 256k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 256k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 256k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 256k | constexpr bool READ_32_BITS = | 202 | 256k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 256k | if (READ_32_BITS) { | 205 | 256k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 256k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 256k | return word & mask; | 208 | 256k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 256k | } |
_ZN5doris11UnpackValueILi12ELi19ELb0EEEmPKh Line | Count | Source | 175 | 256k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 256k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 256k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 256k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 256k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 256k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 256k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 256k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 256k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 256k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 256k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 256k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 256k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 256k | constexpr bool READ_32_BITS = | 202 | 256k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 256k | if (READ_32_BITS) { | 205 | 256k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 256k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 256k | return word & mask; | 208 | 256k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 256k | } |
_ZN5doris11UnpackValueILi12ELi18ELb0EEEmPKh Line | Count | Source | 175 | 256k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 256k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 256k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 256k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 256k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 256k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 256k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 256k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 256k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 256k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 256k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 256k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 256k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 256k | constexpr bool READ_32_BITS = | 202 | 256k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 256k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 256k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 256k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 256k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 256k | return word & mask; | 220 | 256k | } |
_ZN5doris11UnpackValueILi12ELi17ELb0EEEmPKh Line | Count | Source | 175 | 256k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 256k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 256k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 256k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 256k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 256k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 256k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 256k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 256k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 256k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 256k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 256k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 256k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 256k | constexpr bool READ_32_BITS = | 202 | 256k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 256k | if (READ_32_BITS) { | 205 | 256k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 256k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 256k | return word & mask; | 208 | 256k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 256k | } |
_ZN5doris11UnpackValueILi12ELi16ELb0EEEmPKh Line | Count | Source | 175 | 256k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 256k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 256k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 256k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 256k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 256k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 256k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 256k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 256k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 256k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 256k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 256k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 256k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 256k | constexpr bool READ_32_BITS = | 202 | 256k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 256k | if (READ_32_BITS) { | 205 | 256k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 256k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 256k | return word & mask; | 208 | 256k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 256k | } |
_ZN5doris11UnpackValueILi12ELi15ELb0EEEmPKh Line | Count | Source | 175 | 257k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 257k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 257k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 257k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 257k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 257k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 257k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 257k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 257k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 257k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 257k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 257k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 257k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 257k | constexpr bool READ_32_BITS = | 202 | 257k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 257k | if (READ_32_BITS) { | 205 | 257k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 257k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 257k | return word & mask; | 208 | 257k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 257k | } |
_ZN5doris11UnpackValueILi12ELi14ELb0EEEmPKh Line | Count | Source | 175 | 257k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 257k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 257k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 257k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 257k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 257k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 257k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 257k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 257k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 257k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 257k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 257k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 257k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 257k | constexpr bool READ_32_BITS = | 202 | 257k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 257k | if (READ_32_BITS) { | 205 | 257k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 257k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 257k | return word & mask; | 208 | 257k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 257k | } |
_ZN5doris11UnpackValueILi12ELi13ELb0EEEmPKh Line | Count | Source | 175 | 257k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 257k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 257k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 257k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 257k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 257k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 257k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 257k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 257k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 257k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 257k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 257k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 257k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 257k | constexpr bool READ_32_BITS = | 202 | 257k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 257k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 257k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 257k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 257k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 257k | return word & mask; | 220 | 257k | } |
_ZN5doris11UnpackValueILi12ELi12ELb0EEEmPKh Line | Count | Source | 175 | 257k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 257k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 257k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 257k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 257k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 257k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 257k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 257k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 257k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 257k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 257k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 257k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 257k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 257k | constexpr bool READ_32_BITS = | 202 | 257k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 257k | if (READ_32_BITS) { | 205 | 257k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 257k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 257k | return word & mask; | 208 | 257k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 257k | } |
_ZN5doris11UnpackValueILi12ELi11ELb0EEEmPKh Line | Count | Source | 175 | 257k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 257k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 257k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 257k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 257k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 257k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 257k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 257k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 257k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 257k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 257k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 257k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 257k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 257k | constexpr bool READ_32_BITS = | 202 | 257k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 257k | if (READ_32_BITS) { | 205 | 257k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 257k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 257k | return word & mask; | 208 | 257k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 257k | } |
_ZN5doris11UnpackValueILi12ELi10ELb0EEEmPKh Line | Count | Source | 175 | 257k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 257k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 257k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 257k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 257k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 257k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 257k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 257k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 257k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 257k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 257k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 257k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 257k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 257k | constexpr bool READ_32_BITS = | 202 | 257k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 257k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 257k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 257k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 257k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 257k | return word & mask; | 220 | 257k | } |
_ZN5doris11UnpackValueILi12ELi9ELb0EEEmPKh Line | Count | Source | 175 | 257k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 257k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 257k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 257k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 257k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 257k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 257k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 257k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 257k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 257k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 257k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 257k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 257k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 257k | constexpr bool READ_32_BITS = | 202 | 257k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 257k | if (READ_32_BITS) { | 205 | 257k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 257k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 257k | return word & mask; | 208 | 257k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 257k | } |
_ZN5doris11UnpackValueILi12ELi8ELb0EEEmPKh Line | Count | Source | 175 | 257k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 257k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 257k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 257k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 257k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 257k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 257k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 257k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 257k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 257k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 257k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 257k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 257k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 257k | constexpr bool READ_32_BITS = | 202 | 257k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 257k | if (READ_32_BITS) { | 205 | 257k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 257k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 257k | return word & mask; | 208 | 257k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 257k | } |
_ZN5doris11UnpackValueILi12ELi7ELb0EEEmPKh Line | Count | Source | 175 | 266k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 266k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 266k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 266k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 266k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 266k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 266k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 266k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 266k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 266k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 266k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 266k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 266k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 266k | constexpr bool READ_32_BITS = | 202 | 266k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 266k | if (READ_32_BITS) { | 205 | 266k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 266k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 266k | return word & mask; | 208 | 266k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 266k | } |
_ZN5doris11UnpackValueILi12ELi6ELb0EEEmPKh Line | Count | Source | 175 | 266k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 266k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 266k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 266k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 266k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 266k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 266k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 266k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 266k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 266k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 266k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 266k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 266k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 266k | constexpr bool READ_32_BITS = | 202 | 266k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 266k | if (READ_32_BITS) { | 205 | 266k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 266k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 266k | return word & mask; | 208 | 266k | } | 209 | | | 210 | 6 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 6 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 6 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 6 | return word & mask; | 220 | 266k | } |
_ZN5doris11UnpackValueILi12ELi5ELb0EEEmPKh Line | Count | Source | 175 | 266k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 266k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 266k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 266k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 266k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 266k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 266k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 266k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 266k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 266k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 266k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 266k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 266k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 266k | constexpr bool READ_32_BITS = | 202 | 266k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 266k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 266k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 266k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 266k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 266k | return word & mask; | 220 | 266k | } |
_ZN5doris11UnpackValueILi12ELi4ELb0EEEmPKh Line | Count | Source | 175 | 266k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 266k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 266k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 266k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 266k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 266k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 266k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 266k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 266k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 266k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 266k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 266k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 266k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 266k | constexpr bool READ_32_BITS = | 202 | 266k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 266k | if (READ_32_BITS) { | 205 | 266k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 266k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 266k | return word & mask; | 208 | 266k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 266k | } |
_ZN5doris11UnpackValueILi12ELi3ELb0EEEmPKh Line | Count | Source | 175 | 266k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 266k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 266k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 266k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 266k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 266k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 266k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 266k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 266k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 266k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 266k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 266k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 266k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 266k | constexpr bool READ_32_BITS = | 202 | 266k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 266k | if (READ_32_BITS) { | 205 | 266k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 266k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 266k | return word & mask; | 208 | 266k | } | 209 | | | 210 | 10 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 10 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 10 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 10 | return word & mask; | 220 | 266k | } |
_ZN5doris11UnpackValueILi12ELi2ELb0EEEmPKh Line | Count | Source | 175 | 266k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 266k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 266k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 266k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 266k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 266k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 266k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 266k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 266k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 266k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 266k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 266k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 266k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 266k | constexpr bool READ_32_BITS = | 202 | 266k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 266k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 266k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 266k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 266k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 266k | return word & mask; | 220 | 266k | } |
_ZN5doris11UnpackValueILi12ELi1ELb0EEEmPKh Line | Count | Source | 175 | 266k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 266k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 266k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 266k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 266k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 266k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 266k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 266k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 266k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 266k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 266k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 266k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 266k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 266k | constexpr bool READ_32_BITS = | 202 | 266k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 266k | if (READ_32_BITS) { | 205 | 266k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 266k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 266k | return word & mask; | 208 | 266k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 266k | } |
_ZN5doris11UnpackValueILi12ELi0ELb0EEEmPKh Line | Count | Source | 175 | 266k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 266k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 266k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 266k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 266k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 266k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 266k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 266k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 266k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 266k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 266k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 266k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 266k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 266k | constexpr bool READ_32_BITS = | 202 | 266k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 266k | if (READ_32_BITS) { | 205 | 266k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 266k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 266k | return word & mask; | 208 | 266k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 266k | } |
_ZN5doris11UnpackValueILi13ELi0ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi1ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi2ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi3ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi4ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi5ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi6ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi7ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi8ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi9ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi10ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi11ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi12ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi13ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi14ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi15ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi16ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi17ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi18ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi19ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi20ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi21ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi22ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi23ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi24ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi25ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi26ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi27ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi28ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi29ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 350k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 350k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 350k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 350k | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi30ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 350k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 350k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 350k | return word & mask; | 208 | 350k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 350k | } |
_ZN5doris11UnpackValueILi13ELi31ELb1EEEmPKh Line | Count | Source | 175 | 350k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 350k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 350k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 350k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 350k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 350k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 350k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 350k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 350k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 350k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 350k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 350k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 350k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 350k | constexpr bool READ_32_BITS = | 202 | 350k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 350k | if (READ_32_BITS) { | 205 | 350k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 350k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 350k | return word & mask; | 208 | 350k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 350k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi13ELi23ELb0EEEmPKh Line | Count | Source | 175 | 27.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 27.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 27.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 27.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 27.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 27.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 27.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 27.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 27.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 27.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 27.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 27.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 27.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 27.4k | constexpr bool READ_32_BITS = | 202 | 27.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 27.4k | if (READ_32_BITS) { | 205 | 27.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 27.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 27.4k | return word & mask; | 208 | 27.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 27.4k | } |
_ZN5doris11UnpackValueILi13ELi22ELb0EEEmPKh Line | Count | Source | 175 | 27.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 27.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 27.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 27.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 27.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 27.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 27.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 27.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 27.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 27.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 27.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 27.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 27.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 27.4k | constexpr bool READ_32_BITS = | 202 | 27.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 27.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 27.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 27.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 27.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 27.4k | return word & mask; | 220 | 27.4k | } |
_ZN5doris11UnpackValueILi13ELi21ELb0EEEmPKh Line | Count | Source | 175 | 27.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 27.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 27.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 27.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 27.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 27.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 27.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 27.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 27.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 27.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 27.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 27.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 27.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 27.4k | constexpr bool READ_32_BITS = | 202 | 27.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 27.4k | if (READ_32_BITS) { | 205 | 27.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 27.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 27.4k | return word & mask; | 208 | 27.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 27.4k | } |
_ZN5doris11UnpackValueILi13ELi20ELb0EEEmPKh Line | Count | Source | 175 | 27.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 27.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 27.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 27.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 27.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 27.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 27.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 27.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 27.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 27.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 27.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 27.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 27.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 27.4k | constexpr bool READ_32_BITS = | 202 | 27.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 27.4k | if (READ_32_BITS) { | 205 | 27.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 27.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 27.4k | return word & mask; | 208 | 27.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 27.4k | } |
_ZN5doris11UnpackValueILi13ELi19ELb0EEEmPKh Line | Count | Source | 175 | 27.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 27.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 27.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 27.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 27.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 27.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 27.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 27.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 27.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 27.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 27.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 27.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 27.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 27.4k | constexpr bool READ_32_BITS = | 202 | 27.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 27.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 27.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 27.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 27.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 27.4k | return word & mask; | 220 | 27.4k | } |
_ZN5doris11UnpackValueILi13ELi18ELb0EEEmPKh Line | Count | Source | 175 | 27.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 27.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 27.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 27.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 27.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 27.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 27.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 27.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 27.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 27.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 27.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 27.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 27.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 27.4k | constexpr bool READ_32_BITS = | 202 | 27.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 27.4k | if (READ_32_BITS) { | 205 | 27.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 27.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 27.4k | return word & mask; | 208 | 27.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 27.4k | } |
_ZN5doris11UnpackValueILi13ELi17ELb0EEEmPKh Line | Count | Source | 175 | 27.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 27.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 27.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 27.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 27.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 27.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 27.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 27.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 27.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 27.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 27.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 27.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 27.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 27.4k | constexpr bool READ_32_BITS = | 202 | 27.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 27.4k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 27.4k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 27.4k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 27.4k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 27.4k | return word & mask; | 220 | 27.4k | } |
_ZN5doris11UnpackValueILi13ELi16ELb0EEEmPKh Line | Count | Source | 175 | 27.4k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 27.4k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 27.4k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 27.4k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 27.4k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 27.4k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 27.4k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 27.4k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 27.4k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 27.4k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 27.4k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 27.4k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 27.4k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 27.4k | constexpr bool READ_32_BITS = | 202 | 27.4k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 27.4k | if (READ_32_BITS) { | 205 | 27.4k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 27.4k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 27.4k | return word & mask; | 208 | 27.4k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 27.4k | } |
_ZN5doris11UnpackValueILi13ELi15ELb0EEEmPKh Line | Count | Source | 175 | 29.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.0k | constexpr bool READ_32_BITS = | 202 | 29.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.0k | if (READ_32_BITS) { | 205 | 29.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 29.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 29.0k | return word & mask; | 208 | 29.0k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 29.0k | } |
_ZN5doris11UnpackValueILi13ELi14ELb0EEEmPKh Line | Count | Source | 175 | 29.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.0k | constexpr bool READ_32_BITS = | 202 | 29.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.0k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 29.0k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 29.0k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 29.0k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 29.0k | return word & mask; | 220 | 29.0k | } |
_ZN5doris11UnpackValueILi13ELi13ELb0EEEmPKh Line | Count | Source | 175 | 29.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.0k | constexpr bool READ_32_BITS = | 202 | 29.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.0k | if (READ_32_BITS) { | 205 | 29.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 29.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 29.0k | return word & mask; | 208 | 29.0k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 29.0k | } |
_ZN5doris11UnpackValueILi13ELi12ELb0EEEmPKh Line | Count | Source | 175 | 29.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.0k | constexpr bool READ_32_BITS = | 202 | 29.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.0k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 29.0k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 29.0k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 29.0k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 29.0k | return word & mask; | 220 | 29.0k | } |
_ZN5doris11UnpackValueILi13ELi11ELb0EEEmPKh Line | Count | Source | 175 | 29.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.0k | constexpr bool READ_32_BITS = | 202 | 29.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.0k | if (READ_32_BITS) { | 205 | 29.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 29.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 29.0k | return word & mask; | 208 | 29.0k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 29.0k | } |
_ZN5doris11UnpackValueILi13ELi10ELb0EEEmPKh Line | Count | Source | 175 | 29.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.0k | constexpr bool READ_32_BITS = | 202 | 29.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.0k | if (READ_32_BITS) { | 205 | 29.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 29.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 29.0k | return word & mask; | 208 | 29.0k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 29.0k | } |
_ZN5doris11UnpackValueILi13ELi9ELb0EEEmPKh Line | Count | Source | 175 | 29.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.0k | constexpr bool READ_32_BITS = | 202 | 29.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.0k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 29.0k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 29.0k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 29.0k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 29.0k | return word & mask; | 220 | 29.0k | } |
_ZN5doris11UnpackValueILi13ELi8ELb0EEEmPKh Line | Count | Source | 175 | 29.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.0k | constexpr bool READ_32_BITS = | 202 | 29.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.0k | if (READ_32_BITS) { | 205 | 29.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 29.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 29.0k | return word & mask; | 208 | 29.0k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 29.0k | } |
_ZN5doris11UnpackValueILi13ELi7ELb0EEEmPKh Line | Count | Source | 175 | 42.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 42.2k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 42.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 42.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 42.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 42.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 42.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 42.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 42.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 42.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 42.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 42.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 42.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 42.2k | constexpr bool READ_32_BITS = | 202 | 42.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 42.2k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 42.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 42.2k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 42.2k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 42.2k | return word & mask; | 220 | 42.2k | } |
_ZN5doris11UnpackValueILi13ELi6ELb0EEEmPKh Line | Count | Source | 175 | 42.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 42.2k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 42.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 42.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 42.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 42.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 42.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 42.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 42.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 42.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 42.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 42.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 42.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 42.2k | constexpr bool READ_32_BITS = | 202 | 42.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 42.2k | if (READ_32_BITS) { | 205 | 42.2k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 42.2k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 42.2k | return word & mask; | 208 | 42.2k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 42.2k | } |
_ZN5doris11UnpackValueILi13ELi5ELb0EEEmPKh Line | Count | Source | 175 | 42.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 42.2k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 42.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 42.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 42.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 42.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 42.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 42.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 42.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 42.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 42.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 42.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 42.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 42.2k | constexpr bool READ_32_BITS = | 202 | 42.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 42.2k | if (READ_32_BITS) { | 205 | 42.2k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 42.2k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 42.2k | return word & mask; | 208 | 42.2k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 42.2k | } |
_ZN5doris11UnpackValueILi13ELi4ELb0EEEmPKh Line | Count | Source | 175 | 42.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 42.2k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 42.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 42.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 42.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 42.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 42.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 42.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 42.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 42.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 42.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 42.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 42.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 42.2k | constexpr bool READ_32_BITS = | 202 | 42.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 42.2k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 42.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 42.2k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 42.2k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 42.2k | return word & mask; | 220 | 42.2k | } |
_ZN5doris11UnpackValueILi13ELi3ELb0EEEmPKh Line | Count | Source | 175 | 42.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 42.2k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 42.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 42.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 42.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 42.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 42.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 42.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 42.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 42.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 42.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 42.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 42.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 42.2k | constexpr bool READ_32_BITS = | 202 | 42.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 42.2k | if (READ_32_BITS) { | 205 | 42.2k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 42.2k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 42.2k | return word & mask; | 208 | 42.2k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 42.2k | } |
_ZN5doris11UnpackValueILi13ELi2ELb0EEEmPKh Line | Count | Source | 175 | 42.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 42.2k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 42.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 42.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 42.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 42.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 42.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 42.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 42.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 42.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 42.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 42.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 42.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 42.2k | constexpr bool READ_32_BITS = | 202 | 42.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 42.2k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 42.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 42.2k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 42.2k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 42.2k | return word & mask; | 220 | 42.2k | } |
_ZN5doris11UnpackValueILi13ELi1ELb0EEEmPKh Line | Count | Source | 175 | 42.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 42.2k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 42.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 42.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 42.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 42.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 42.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 42.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 42.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 42.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 42.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 42.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 42.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 42.2k | constexpr bool READ_32_BITS = | 202 | 42.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 42.2k | if (READ_32_BITS) { | 205 | 42.2k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 42.2k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 42.2k | return word & mask; | 208 | 42.2k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 42.2k | } |
_ZN5doris11UnpackValueILi13ELi0ELb0EEEmPKh Line | Count | Source | 175 | 42.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 42.2k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 42.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 42.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 42.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 42.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 42.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 42.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 42.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 42.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 42.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 42.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 42.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 42.2k | constexpr bool READ_32_BITS = | 202 | 42.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 42.2k | if (READ_32_BITS) { | 205 | 42.2k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 42.2k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 42.2k | return word & mask; | 208 | 42.2k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 42.2k | } |
_ZN5doris11UnpackValueILi14ELi0ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi1ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi2ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi3ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi4ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi5ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi6ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi7ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi8ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi9ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi10ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi11ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi12ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi13ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi14ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi15ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi16ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi17ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.25M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.25M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.25M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.25M | return word & mask; | 220 | 2.25M | } |
_ZN5doris11UnpackValueILi14ELi18ELb1EEEmPKh Line | Count | Source | 175 | 2.24M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24M | constexpr bool READ_32_BITS = | 202 | 2.24M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24M | return word & mask; | 220 | 2.24M | } |
_ZN5doris11UnpackValueILi14ELi19ELb1EEEmPKh Line | Count | Source | 175 | 2.24M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24M | constexpr bool READ_32_BITS = | 202 | 2.24M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24M | return word & mask; | 220 | 2.24M | } |
_ZN5doris11UnpackValueILi14ELi20ELb1EEEmPKh Line | Count | Source | 175 | 2.24M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24M | constexpr bool READ_32_BITS = | 202 | 2.24M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24M | return word & mask; | 220 | 2.24M | } |
_ZN5doris11UnpackValueILi14ELi21ELb1EEEmPKh Line | Count | Source | 175 | 2.24M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24M | constexpr bool READ_32_BITS = | 202 | 2.24M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24M | return word & mask; | 220 | 2.24M | } |
_ZN5doris11UnpackValueILi14ELi22ELb1EEEmPKh Line | Count | Source | 175 | 2.24M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24M | constexpr bool READ_32_BITS = | 202 | 2.24M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24M | return word & mask; | 220 | 2.24M | } |
_ZN5doris11UnpackValueILi14ELi23ELb1EEEmPKh Line | Count | Source | 175 | 2.24M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24M | constexpr bool READ_32_BITS = | 202 | 2.24M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24M | return word & mask; | 220 | 2.24M | } |
_ZN5doris11UnpackValueILi14ELi24ELb1EEEmPKh Line | Count | Source | 175 | 2.24M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24M | constexpr bool READ_32_BITS = | 202 | 2.24M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24M | return word & mask; | 220 | 2.24M | } |
_ZN5doris11UnpackValueILi14ELi25ELb1EEEmPKh Line | Count | Source | 175 | 2.24M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24M | constexpr bool READ_32_BITS = | 202 | 2.24M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24M | return word & mask; | 220 | 2.24M | } |
_ZN5doris11UnpackValueILi14ELi26ELb1EEEmPKh Line | Count | Source | 175 | 2.24M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24M | constexpr bool READ_32_BITS = | 202 | 2.24M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24M | return word & mask; | 220 | 2.24M | } |
_ZN5doris11UnpackValueILi14ELi27ELb1EEEmPKh Line | Count | Source | 175 | 2.24M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24M | constexpr bool READ_32_BITS = | 202 | 2.24M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24M | return word & mask; | 220 | 2.24M | } |
_ZN5doris11UnpackValueILi14ELi28ELb1EEEmPKh Line | Count | Source | 175 | 2.24M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24M | constexpr bool READ_32_BITS = | 202 | 2.24M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24M | return word & mask; | 220 | 2.24M | } |
_ZN5doris11UnpackValueILi14ELi29ELb1EEEmPKh Line | Count | Source | 175 | 2.24M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24M | constexpr bool READ_32_BITS = | 202 | 2.24M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.24M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 2.24M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.24M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.24M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.24M | return word & mask; | 220 | 2.24M | } |
_ZN5doris11UnpackValueILi14ELi30ELb1EEEmPKh Line | Count | Source | 175 | 2.24M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.24M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.24M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.24M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.24M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.24M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.24M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.24M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.24M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.24M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.24M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.24M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.24M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.24M | constexpr bool READ_32_BITS = | 202 | 2.24M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 2.25M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.25M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.25M | return word & mask; | 208 | 2.25M | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 2.24M | } |
_ZN5doris11UnpackValueILi14ELi31ELb1EEEmPKh Line | Count | Source | 175 | 2.25M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 2.25M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 2.25M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 2.25M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 2.25M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 2.25M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 2.25M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 2.25M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 2.25M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 2.25M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 2.25M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 2.25M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 2.25M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 2.25M | constexpr bool READ_32_BITS = | 202 | 2.25M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 2.25M | if (READ_32_BITS) { | 205 | 2.25M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 2.25M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 2.25M | return word & mask; | 208 | 2.25M | } | 209 | | | 210 | 330 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 330 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 330 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 330 | return word & mask; | 220 | 2.25M | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi14ELi23ELb0EEEmPKh Line | Count | Source | 175 | 158k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 158k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 158k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 158k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 158k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 158k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 158k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 158k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 158k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 158k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 158k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 158k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 158k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 158k | constexpr bool READ_32_BITS = | 202 | 158k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 158k | if (READ_32_BITS) { | 205 | 158k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 158k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 158k | return word & mask; | 208 | 158k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 158k | } |
_ZN5doris11UnpackValueILi14ELi22ELb0EEEmPKh Line | Count | Source | 175 | 158k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 158k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 158k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 158k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 158k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 158k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 158k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 158k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 158k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 158k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 158k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 158k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 158k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 158k | constexpr bool READ_32_BITS = | 202 | 158k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 158k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 158k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 158k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 158k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 158k | return word & mask; | 220 | 158k | } |
_ZN5doris11UnpackValueILi14ELi21ELb0EEEmPKh Line | Count | Source | 175 | 158k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 158k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 158k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 158k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 158k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 158k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 158k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 158k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 158k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 158k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 158k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 158k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 158k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 158k | constexpr bool READ_32_BITS = | 202 | 158k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 158k | if (READ_32_BITS) { | 205 | 158k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 158k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 158k | return word & mask; | 208 | 158k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 158k | } |
_ZN5doris11UnpackValueILi14ELi20ELb0EEEmPKh Line | Count | Source | 175 | 158k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 158k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 158k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 158k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 158k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 158k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 158k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 158k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 158k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 158k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 158k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 158k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 158k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 158k | constexpr bool READ_32_BITS = | 202 | 158k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 158k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 158k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 158k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 158k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 158k | return word & mask; | 220 | 158k | } |
_ZN5doris11UnpackValueILi14ELi19ELb0EEEmPKh Line | Count | Source | 175 | 158k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 158k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 158k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 158k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 158k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 158k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 158k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 158k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 158k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 158k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 158k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 158k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 158k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 158k | constexpr bool READ_32_BITS = | 202 | 158k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 158k | if (READ_32_BITS) { | 205 | 158k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 158k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 158k | return word & mask; | 208 | 158k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 158k | } |
_ZN5doris11UnpackValueILi14ELi18ELb0EEEmPKh Line | Count | Source | 175 | 158k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 158k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 158k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 158k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 158k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 158k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 158k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 158k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 158k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 158k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 158k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 158k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 158k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 158k | constexpr bool READ_32_BITS = | 202 | 158k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 158k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 158k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 158k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 158k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 158k | return word & mask; | 220 | 158k | } |
_ZN5doris11UnpackValueILi14ELi17ELb0EEEmPKh Line | Count | Source | 175 | 158k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 158k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 158k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 158k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 158k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 158k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 158k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 158k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 158k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 158k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 158k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 158k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 158k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 158k | constexpr bool READ_32_BITS = | 202 | 158k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 158k | if (READ_32_BITS) { | 205 | 158k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 158k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 158k | return word & mask; | 208 | 158k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 158k | } |
_ZN5doris11UnpackValueILi14ELi16ELb0EEEmPKh Line | Count | Source | 175 | 158k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 158k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 158k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 158k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 158k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 158k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 158k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 158k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 158k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 158k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 158k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 158k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 158k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 158k | constexpr bool READ_32_BITS = | 202 | 158k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 158k | if (READ_32_BITS) { | 205 | 158k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 158k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 158k | return word & mask; | 208 | 158k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 158k | } |
_ZN5doris11UnpackValueILi14ELi15ELb0EEEmPKh Line | Count | Source | 175 | 159k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 159k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 159k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 159k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 159k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 159k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 159k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 159k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 159k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 159k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 159k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 159k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 159k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 159k | constexpr bool READ_32_BITS = | 202 | 159k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 159k | if (READ_32_BITS) { | 205 | 159k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 159k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 159k | return word & mask; | 208 | 159k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 159k | } |
_ZN5doris11UnpackValueILi14ELi14ELb0EEEmPKh Line | Count | Source | 175 | 159k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 159k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 159k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 159k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 159k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 159k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 159k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 159k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 159k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 159k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 159k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 159k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 159k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 159k | constexpr bool READ_32_BITS = | 202 | 159k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 159k | if (READ_32_BITS) { | 205 | 159k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 159k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 159k | return word & mask; | 208 | 159k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 159k | } |
_ZN5doris11UnpackValueILi14ELi13ELb0EEEmPKh Line | Count | Source | 175 | 159k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 159k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 159k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 159k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 159k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 159k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 159k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 159k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 159k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 159k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 159k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 159k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 159k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 159k | constexpr bool READ_32_BITS = | 202 | 159k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 159k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 159k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 159k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 159k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 159k | return word & mask; | 220 | 159k | } |
_ZN5doris11UnpackValueILi14ELi12ELb0EEEmPKh Line | Count | Source | 175 | 159k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 159k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 159k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 159k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 159k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 159k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 159k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 159k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 159k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 159k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 159k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 159k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 159k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 159k | constexpr bool READ_32_BITS = | 202 | 159k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 159k | if (READ_32_BITS) { | 205 | 159k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 159k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 159k | return word & mask; | 208 | 159k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 159k | } |
_ZN5doris11UnpackValueILi14ELi11ELb0EEEmPKh Line | Count | Source | 175 | 159k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 159k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 159k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 159k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 159k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 159k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 159k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 159k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 159k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 159k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 159k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 159k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 159k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 159k | constexpr bool READ_32_BITS = | 202 | 159k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 159k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 159k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 159k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 159k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 159k | return word & mask; | 220 | 159k | } |
_ZN5doris11UnpackValueILi14ELi10ELb0EEEmPKh Line | Count | Source | 175 | 159k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 159k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 159k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 159k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 159k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 159k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 159k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 159k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 159k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 159k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 159k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 159k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 159k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 159k | constexpr bool READ_32_BITS = | 202 | 159k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 159k | if (READ_32_BITS) { | 205 | 159k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 159k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 159k | return word & mask; | 208 | 159k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 159k | } |
_ZN5doris11UnpackValueILi14ELi9ELb0EEEmPKh Line | Count | Source | 175 | 159k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 159k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 159k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 159k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 159k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 159k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 159k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 159k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 159k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 159k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 159k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 159k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 159k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 159k | constexpr bool READ_32_BITS = | 202 | 159k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 159k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 159k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 159k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 159k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 159k | return word & mask; | 220 | 159k | } |
_ZN5doris11UnpackValueILi14ELi8ELb0EEEmPKh Line | Count | Source | 175 | 159k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 159k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 159k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 159k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 159k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 159k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 159k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 159k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 159k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 159k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 159k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 159k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 159k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 159k | constexpr bool READ_32_BITS = | 202 | 159k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 159k | if (READ_32_BITS) { | 205 | 159k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 159k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 159k | return word & mask; | 208 | 159k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 159k | } |
_ZN5doris11UnpackValueILi14ELi7ELb0EEEmPKh Line | Count | Source | 175 | 162k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 162k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 162k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 162k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 162k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 162k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 162k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 162k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 162k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 162k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 162k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 162k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 162k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 162k | constexpr bool READ_32_BITS = | 202 | 162k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 162k | if (READ_32_BITS) { | 205 | 162k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 162k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 162k | return word & mask; | 208 | 162k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 162k | } |
_ZN5doris11UnpackValueILi14ELi6ELb0EEEmPKh Line | Count | Source | 175 | 162k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 162k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 162k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 162k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 162k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 162k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 162k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 162k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 162k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 162k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 162k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 162k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 162k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 162k | constexpr bool READ_32_BITS = | 202 | 162k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 162k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 162k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 162k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 162k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 162k | return word & mask; | 220 | 162k | } |
_ZN5doris11UnpackValueILi14ELi5ELb0EEEmPKh Line | Count | Source | 175 | 162k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 162k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 162k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 162k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 162k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 162k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 162k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 162k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 162k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 162k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 162k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 162k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 162k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 162k | constexpr bool READ_32_BITS = | 202 | 162k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 162k | if (READ_32_BITS) { | 205 | 162k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 162k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 162k | return word & mask; | 208 | 162k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 162k | } |
_ZN5doris11UnpackValueILi14ELi4ELb0EEEmPKh Line | Count | Source | 175 | 162k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 162k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 162k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 162k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 162k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 162k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 162k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 162k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 162k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 162k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 162k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 162k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 162k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 162k | constexpr bool READ_32_BITS = | 202 | 162k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 162k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 162k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 162k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 162k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 162k | return word & mask; | 220 | 162k | } |
_ZN5doris11UnpackValueILi14ELi3ELb0EEEmPKh Line | Count | Source | 175 | 162k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 162k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 162k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 162k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 162k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 162k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 162k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 162k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 162k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 162k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 162k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 162k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 162k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 162k | constexpr bool READ_32_BITS = | 202 | 162k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 162k | if (READ_32_BITS) { | 205 | 162k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 162k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 162k | return word & mask; | 208 | 162k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 162k | } |
_ZN5doris11UnpackValueILi14ELi2ELb0EEEmPKh Line | Count | Source | 175 | 162k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 162k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 162k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 162k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 162k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 162k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 162k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 162k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 162k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 162k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 162k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 162k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 162k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 162k | constexpr bool READ_32_BITS = | 202 | 162k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 162k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 162k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 162k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 162k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 162k | return word & mask; | 220 | 162k | } |
_ZN5doris11UnpackValueILi14ELi1ELb0EEEmPKh Line | Count | Source | 175 | 162k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 162k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 162k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 162k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 162k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 162k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 162k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 162k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 162k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 162k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 162k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 162k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 162k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 162k | constexpr bool READ_32_BITS = | 202 | 162k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 162k | if (READ_32_BITS) { | 205 | 162k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 162k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 162k | return word & mask; | 208 | 162k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 162k | } |
_ZN5doris11UnpackValueILi14ELi0ELb0EEEmPKh Line | Count | Source | 175 | 162k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 162k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 162k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 162k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 162k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 162k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 162k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 162k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 162k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 162k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 162k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 162k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 162k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 162k | constexpr bool READ_32_BITS = | 202 | 162k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 162k | if (READ_32_BITS) { | 205 | 162k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 162k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 162k | return word & mask; | 208 | 162k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 162k | } |
_ZN5doris11UnpackValueILi15ELi0ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi1ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi2ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi3ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi4ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi5ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi6ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi7ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi8ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi9ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi10ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi11ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi12ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi13ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi14ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi15ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi16ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi17ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi18ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi19ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi20ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi21ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi22ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi23ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi24ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi25ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi26ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi27ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi28ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi29ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 369k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 369k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 369k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 369k | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi30ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 369k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 369k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 369k | return word & mask; | 208 | 369k | } | 209 | | | 210 | 8 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 8 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 8 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 8 | return word & mask; | 220 | 369k | } |
_ZN5doris11UnpackValueILi15ELi31ELb1EEEmPKh Line | Count | Source | 175 | 369k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 369k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 369k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 369k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 369k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 369k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 369k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 369k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 369k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 369k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 369k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 369k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 369k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 369k | constexpr bool READ_32_BITS = | 202 | 369k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 369k | if (READ_32_BITS) { | 205 | 369k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 369k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 369k | return word & mask; | 208 | 369k | } | 209 | | | 210 | 12 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 12 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 12 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 12 | return word & mask; | 220 | 369k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi15ELi23ELb0EEEmPKh Line | Count | Source | 175 | 29.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.5k | constexpr bool READ_32_BITS = | 202 | 29.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.5k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 29.5k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 29.5k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 29.5k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 29.5k | return word & mask; | 220 | 29.5k | } |
_ZN5doris11UnpackValueILi15ELi22ELb0EEEmPKh Line | Count | Source | 175 | 29.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.5k | constexpr bool READ_32_BITS = | 202 | 29.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.5k | if (READ_32_BITS) { | 205 | 29.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 29.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 29.5k | return word & mask; | 208 | 29.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 29.5k | } |
_ZN5doris11UnpackValueILi15ELi21ELb0EEEmPKh Line | Count | Source | 175 | 29.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.5k | constexpr bool READ_32_BITS = | 202 | 29.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.5k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 29.5k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 29.5k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 29.5k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 29.5k | return word & mask; | 220 | 29.5k | } |
_ZN5doris11UnpackValueILi15ELi20ELb0EEEmPKh Line | Count | Source | 175 | 29.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.5k | constexpr bool READ_32_BITS = | 202 | 29.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.5k | if (READ_32_BITS) { | 205 | 29.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 29.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 29.5k | return word & mask; | 208 | 29.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 29.5k | } |
_ZN5doris11UnpackValueILi15ELi19ELb0EEEmPKh Line | Count | Source | 175 | 29.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.5k | constexpr bool READ_32_BITS = | 202 | 29.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.5k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 29.5k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 29.5k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 29.5k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 29.5k | return word & mask; | 220 | 29.5k | } |
_ZN5doris11UnpackValueILi15ELi18ELb0EEEmPKh Line | Count | Source | 175 | 29.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.5k | constexpr bool READ_32_BITS = | 202 | 29.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.5k | if (READ_32_BITS) { | 205 | 29.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 29.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 29.5k | return word & mask; | 208 | 29.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 29.5k | } |
_ZN5doris11UnpackValueILi15ELi17ELb0EEEmPKh Line | Count | Source | 175 | 29.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.5k | constexpr bool READ_32_BITS = | 202 | 29.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.5k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 29.5k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 29.5k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 29.5k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 29.5k | return word & mask; | 220 | 29.5k | } |
_ZN5doris11UnpackValueILi15ELi16ELb0EEEmPKh Line | Count | Source | 175 | 29.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 29.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 29.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 29.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 29.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 29.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 29.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 29.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 29.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 29.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 29.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 29.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 29.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 29.5k | constexpr bool READ_32_BITS = | 202 | 29.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 29.5k | if (READ_32_BITS) { | 205 | 29.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 29.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 29.5k | return word & mask; | 208 | 29.5k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 29.5k | } |
_ZN5doris11UnpackValueILi15ELi15ELb0EEEmPKh Line | Count | Source | 175 | 33.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 33.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 33.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 33.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 33.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 33.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 33.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 33.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 33.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 33.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 33.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 33.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 33.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 33.1k | constexpr bool READ_32_BITS = | 202 | 33.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 33.1k | if (READ_32_BITS) { | 205 | 33.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 33.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 33.1k | return word & mask; | 208 | 33.1k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 33.1k | } |
_ZN5doris11UnpackValueILi15ELi14ELb0EEEmPKh Line | Count | Source | 175 | 33.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 33.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 33.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 33.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 33.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 33.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 33.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 33.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 33.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 33.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 33.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 33.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 33.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 33.1k | constexpr bool READ_32_BITS = | 202 | 33.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 33.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 33.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 33.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 33.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 33.1k | return word & mask; | 220 | 33.1k | } |
_ZN5doris11UnpackValueILi15ELi13ELb0EEEmPKh Line | Count | Source | 175 | 33.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 33.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 33.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 33.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 33.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 33.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 33.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 33.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 33.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 33.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 33.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 33.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 33.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 33.1k | constexpr bool READ_32_BITS = | 202 | 33.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 33.1k | if (READ_32_BITS) { | 205 | 33.1k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 33.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 33.1k | return word & mask; | 208 | 33.1k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 33.1k | } |
_ZN5doris11UnpackValueILi15ELi12ELb0EEEmPKh Line | Count | Source | 175 | 33.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 33.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 33.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 33.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 33.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 33.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 33.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 33.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 33.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 33.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 33.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 33.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 33.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 33.1k | constexpr bool READ_32_BITS = | 202 | 33.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 33.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 33.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 33.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 33.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 33.1k | return word & mask; | 220 | 33.1k | } |
_ZN5doris11UnpackValueILi15ELi11ELb0EEEmPKh Line | Count | Source | 175 | 33.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 33.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 33.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 33.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 33.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 33.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 33.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 33.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 33.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 33.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 33.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 33.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 33.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 33.0k | constexpr bool READ_32_BITS = | 202 | 33.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 33.0k | if (READ_32_BITS) { | 205 | 33.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 33.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 33.0k | return word & mask; | 208 | 33.0k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 33.0k | } |
_ZN5doris11UnpackValueILi15ELi10ELb0EEEmPKh Line | Count | Source | 175 | 33.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 33.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 33.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 33.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 33.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 33.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 33.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 33.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 33.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 33.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 33.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 33.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 33.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 33.1k | constexpr bool READ_32_BITS = | 202 | 33.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 33.1k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 33.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 33.1k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 33.1k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 33.1k | return word & mask; | 220 | 33.1k | } |
_ZN5doris11UnpackValueILi15ELi9ELb0EEEmPKh Line | Count | Source | 175 | 33.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 33.1k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 33.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 33.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 33.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 33.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 33.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 33.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 33.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 33.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 33.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 33.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 33.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 33.1k | constexpr bool READ_32_BITS = | 202 | 33.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 33.1k | if (READ_32_BITS) { | 205 | 33.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 33.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 33.0k | return word & mask; | 208 | 33.0k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 33.1k | } |
_ZN5doris11UnpackValueILi15ELi8ELb0EEEmPKh Line | Count | Source | 175 | 33.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 33.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 33.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 33.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 33.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 33.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 33.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 33.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 33.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 33.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 33.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 33.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 33.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 33.0k | constexpr bool READ_32_BITS = | 202 | 33.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 33.0k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 33.0k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 33.0k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 33.0k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 33.0k | return word & mask; | 220 | 33.0k | } |
_ZN5doris11UnpackValueILi15ELi7ELb0EEEmPKh Line | Count | Source | 175 | 37.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.6k | constexpr bool READ_32_BITS = | 202 | 37.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.6k | if (READ_32_BITS) { | 205 | 37.6k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 37.6k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 37.6k | return word & mask; | 208 | 37.6k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 37.6k | } |
_ZN5doris11UnpackValueILi15ELi6ELb0EEEmPKh Line | Count | Source | 175 | 37.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.6k | constexpr bool READ_32_BITS = | 202 | 37.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.6k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.6k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.6k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.6k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.6k | return word & mask; | 220 | 37.6k | } |
_ZN5doris11UnpackValueILi15ELi5ELb0EEEmPKh Line | Count | Source | 175 | 37.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.6k | constexpr bool READ_32_BITS = | 202 | 37.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.6k | if (READ_32_BITS) { | 205 | 37.6k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 37.6k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 37.6k | return word & mask; | 208 | 37.6k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 37.6k | } |
_ZN5doris11UnpackValueILi15ELi4ELb0EEEmPKh Line | Count | Source | 175 | 37.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.6k | constexpr bool READ_32_BITS = | 202 | 37.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.6k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.6k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.6k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.6k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.6k | return word & mask; | 220 | 37.6k | } |
_ZN5doris11UnpackValueILi15ELi3ELb0EEEmPKh Line | Count | Source | 175 | 37.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.6k | constexpr bool READ_32_BITS = | 202 | 37.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.6k | if (READ_32_BITS) { | 205 | 37.6k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 37.6k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 37.6k | return word & mask; | 208 | 37.6k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 37.6k | } |
_ZN5doris11UnpackValueILi15ELi2ELb0EEEmPKh Line | Count | Source | 175 | 37.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.6k | constexpr bool READ_32_BITS = | 202 | 37.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.6k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 37.6k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 37.6k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 37.6k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 37.6k | return word & mask; | 220 | 37.6k | } |
_ZN5doris11UnpackValueILi15ELi1ELb0EEEmPKh Line | Count | Source | 175 | 37.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.6k | constexpr bool READ_32_BITS = | 202 | 37.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.6k | if (READ_32_BITS) { | 205 | 37.6k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 37.6k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 37.6k | return word & mask; | 208 | 37.6k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 37.6k | } |
_ZN5doris11UnpackValueILi15ELi0ELb0EEEmPKh Line | Count | Source | 175 | 37.5k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 37.5k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 37.5k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 37.5k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 37.5k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 37.5k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 37.5k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 37.5k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 37.5k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 37.5k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 37.5k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 37.5k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 37.5k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 37.5k | constexpr bool READ_32_BITS = | 202 | 37.5k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 37.5k | if (READ_32_BITS) { | 205 | 37.5k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 37.5k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 37.5k | return word & mask; | 208 | 37.5k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 37.5k | } |
_ZN5doris11UnpackValueILi16ELi0ELb1EEEmPKh Line | Count | Source | 175 | 660k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 660k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 660k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 660k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 660k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 660k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 660k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 660k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 660k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 660k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 660k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 660k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 660k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 660k | constexpr bool READ_32_BITS = | 202 | 660k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 660k | if (READ_32_BITS) { | 205 | 660k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 660k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 660k | return word & mask; | 208 | 660k | } | 209 | | | 210 | 126 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 126 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 126 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 126 | return word & mask; | 220 | 660k | } |
_ZN5doris11UnpackValueILi16ELi1ELb1EEEmPKh Line | Count | Source | 175 | 660k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 660k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 660k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 660k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 660k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 660k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 660k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 660k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 660k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 660k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 660k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 660k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 660k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 660k | constexpr bool READ_32_BITS = | 202 | 660k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 660k | if (READ_32_BITS) { | 205 | 660k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 660k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 660k | return word & mask; | 208 | 660k | } | 209 | | | 210 | 106 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 106 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 106 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 106 | return word & mask; | 220 | 660k | } |
_ZN5doris11UnpackValueILi16ELi2ELb1EEEmPKh Line | Count | Source | 175 | 660k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 660k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 660k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 660k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 660k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 660k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 660k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 660k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 660k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 660k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 660k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 660k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 660k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 660k | constexpr bool READ_32_BITS = | 202 | 660k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 660k | if (READ_32_BITS) { | 205 | 660k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 660k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 660k | return word & mask; | 208 | 660k | } | 209 | | | 210 | 398 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 398 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 398 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 398 | return word & mask; | 220 | 660k | } |
_ZN5doris11UnpackValueILi16ELi3ELb1EEEmPKh Line | Count | Source | 175 | 660k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 660k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 660k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 660k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 660k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 660k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 660k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 660k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 660k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 660k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 660k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 660k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 660k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 660k | constexpr bool READ_32_BITS = | 202 | 660k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 660k | if (READ_32_BITS) { | 205 | 660k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 660k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 660k | return word & mask; | 208 | 660k | } | 209 | | | 210 | 310 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 310 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 310 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 310 | return word & mask; | 220 | 660k | } |
_ZN5doris11UnpackValueILi16ELi4ELb1EEEmPKh Line | Count | Source | 175 | 660k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 660k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 660k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 660k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 660k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 660k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 660k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 660k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 660k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 660k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 660k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 660k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 660k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 660k | constexpr bool READ_32_BITS = | 202 | 660k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 660k | if (READ_32_BITS) { | 205 | 659k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 659k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 659k | return word & mask; | 208 | 659k | } | 209 | | | 210 | 174 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 174 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 174 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 174 | return word & mask; | 220 | 660k | } |
_ZN5doris11UnpackValueILi16ELi5ELb1EEEmPKh Line | Count | Source | 175 | 660k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 660k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 660k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 660k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 660k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 660k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 660k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 660k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 660k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 660k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 660k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 660k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 660k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 660k | constexpr bool READ_32_BITS = | 202 | 660k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 660k | if (READ_32_BITS) { | 205 | 659k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 659k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 659k | return word & mask; | 208 | 659k | } | 209 | | | 210 | 706 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 706 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 706 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 706 | return word & mask; | 220 | 660k | } |
_ZN5doris11UnpackValueILi16ELi6ELb1EEEmPKh Line | Count | Source | 175 | 660k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 660k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 660k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 660k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 660k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 660k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 660k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 660k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 660k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 660k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 660k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 660k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 660k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 660k | constexpr bool READ_32_BITS = | 202 | 660k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 669k | if (READ_32_BITS) { | 205 | 669k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 669k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 669k | return word & mask; | 208 | 669k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 660k | } |
_ZN5doris11UnpackValueILi16ELi7ELb1EEEmPKh Line | Count | Source | 175 | 673k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 673k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 673k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 673k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 673k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 673k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 673k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 673k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 673k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 673k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 673k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 673k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 673k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 673k | constexpr bool READ_32_BITS = | 202 | 673k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 673k | if (READ_32_BITS) { | 205 | 667k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 667k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 667k | return word & mask; | 208 | 667k | } | 209 | | | 210 | 5.60k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 5.60k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 5.60k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 5.60k | return word & mask; | 220 | 673k | } |
_ZN5doris11UnpackValueILi16ELi8ELb1EEEmPKh Line | Count | Source | 175 | 673k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 673k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 673k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 673k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 673k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 673k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 673k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 673k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 673k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 673k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 673k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 673k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 673k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 673k | constexpr bool READ_32_BITS = | 202 | 673k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 673k | if (READ_32_BITS) { | 205 | 671k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 671k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 671k | return word & mask; | 208 | 671k | } | 209 | | | 210 | 2.69k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.69k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.69k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.69k | return word & mask; | 220 | 673k | } |
_ZN5doris11UnpackValueILi16ELi9ELb1EEEmPKh Line | Count | Source | 175 | 672k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 672k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 672k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 672k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 672k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 672k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 672k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 672k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 672k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 672k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 672k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 672k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 672k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 672k | constexpr bool READ_32_BITS = | 202 | 672k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 672k | if (READ_32_BITS) { | 205 | 670k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 670k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 670k | return word & mask; | 208 | 670k | } | 209 | | | 210 | 2.52k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.52k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.52k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.52k | return word & mask; | 220 | 672k | } |
_ZN5doris11UnpackValueILi16ELi10ELb1EEEmPKh Line | Count | Source | 175 | 672k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 672k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 672k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 672k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 672k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 672k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 672k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 672k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 672k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 672k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 672k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 672k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 672k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 672k | constexpr bool READ_32_BITS = | 202 | 672k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 672k | if (READ_32_BITS) { | 205 | 669k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 669k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 669k | return word & mask; | 208 | 669k | } | 209 | | | 210 | 3.31k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.31k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.31k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.31k | return word & mask; | 220 | 672k | } |
_ZN5doris11UnpackValueILi16ELi11ELb1EEEmPKh Line | Count | Source | 175 | 672k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 672k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 672k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 672k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 672k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 672k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 672k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 672k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 672k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 672k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 672k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 672k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 672k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 672k | constexpr bool READ_32_BITS = | 202 | 672k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 672k | if (READ_32_BITS) { | 205 | 668k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 668k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 668k | return word & mask; | 208 | 668k | } | 209 | | | 210 | 3.42k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.42k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.42k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.42k | return word & mask; | 220 | 672k | } |
_ZN5doris11UnpackValueILi16ELi12ELb1EEEmPKh Line | Count | Source | 175 | 671k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 671k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 671k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 671k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 671k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 671k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 671k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 671k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 671k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 671k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 671k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 671k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 671k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 671k | constexpr bool READ_32_BITS = | 202 | 671k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 671k | if (READ_32_BITS) { | 205 | 667k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 667k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 667k | return word & mask; | 208 | 667k | } | 209 | | | 210 | 3.60k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.60k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.60k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.60k | return word & mask; | 220 | 671k | } |
_ZN5doris11UnpackValueILi16ELi13ELb1EEEmPKh Line | Count | Source | 175 | 671k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 671k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 671k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 671k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 671k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 671k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 671k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 671k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 671k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 671k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 671k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 671k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 671k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 671k | constexpr bool READ_32_BITS = | 202 | 671k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 671k | if (READ_32_BITS) { | 205 | 668k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 668k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 668k | return word & mask; | 208 | 668k | } | 209 | | | 210 | 3.03k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.03k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.03k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.03k | return word & mask; | 220 | 671k | } |
_ZN5doris11UnpackValueILi16ELi14ELb1EEEmPKh Line | Count | Source | 175 | 670k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 670k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 670k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 670k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 670k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 670k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 670k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 670k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 670k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 670k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 670k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 670k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 670k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 670k | constexpr bool READ_32_BITS = | 202 | 670k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 670k | if (READ_32_BITS) { | 205 | 667k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 667k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 667k | return word & mask; | 208 | 667k | } | 209 | | | 210 | 2.29k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.29k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.29k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.29k | return word & mask; | 220 | 670k | } |
_ZN5doris11UnpackValueILi16ELi15ELb1EEEmPKh Line | Count | Source | 175 | 670k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 670k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 670k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 670k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 670k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 670k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 670k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 670k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 670k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 670k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 670k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 670k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 670k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 670k | constexpr bool READ_32_BITS = | 202 | 670k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 670k | if (READ_32_BITS) { | 205 | 667k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 667k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 667k | return word & mask; | 208 | 667k | } | 209 | | | 210 | 2.27k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.27k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.27k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.27k | return word & mask; | 220 | 670k | } |
_ZN5doris11UnpackValueILi16ELi16ELb1EEEmPKh Line | Count | Source | 175 | 667k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 667k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 667k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 667k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 667k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 667k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 667k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 667k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 667k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 667k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 667k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 667k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 667k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 667k | constexpr bool READ_32_BITS = | 202 | 667k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 667k | if (READ_32_BITS) { | 205 | 667k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 667k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 667k | return word & mask; | 208 | 667k | } | 209 | | | 210 | 298 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 298 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 298 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 298 | return word & mask; | 220 | 667k | } |
_ZN5doris11UnpackValueILi16ELi17ELb1EEEmPKh Line | Count | Source | 175 | 667k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 667k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 667k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 667k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 667k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 667k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 667k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 667k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 667k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 667k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 667k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 667k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 667k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 667k | constexpr bool READ_32_BITS = | 202 | 667k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 667k | if (READ_32_BITS) { | 205 | 667k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 667k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 667k | return word & mask; | 208 | 667k | } | 209 | | | 210 | 72 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 72 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 72 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 72 | return word & mask; | 220 | 667k | } |
_ZN5doris11UnpackValueILi16ELi18ELb1EEEmPKh Line | Count | Source | 175 | 667k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 667k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 667k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 667k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 667k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 667k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 667k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 667k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 667k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 667k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 667k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 667k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 667k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 667k | constexpr bool READ_32_BITS = | 202 | 667k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 667k | if (READ_32_BITS) { | 205 | 666k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 666k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 666k | return word & mask; | 208 | 666k | } | 209 | | | 210 | 856 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 856 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 856 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 856 | return word & mask; | 220 | 667k | } |
_ZN5doris11UnpackValueILi16ELi19ELb1EEEmPKh Line | Count | Source | 175 | 667k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 667k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 667k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 667k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 667k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 667k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 667k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 667k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 667k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 667k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 667k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 667k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 667k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 667k | constexpr bool READ_32_BITS = | 202 | 667k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 667k | if (READ_32_BITS) { | 205 | 667k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 667k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 667k | return word & mask; | 208 | 667k | } | 209 | | | 210 | 698 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 698 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 698 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 698 | return word & mask; | 220 | 667k | } |
_ZN5doris11UnpackValueILi16ELi20ELb1EEEmPKh Line | Count | Source | 175 | 667k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 667k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 667k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 667k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 667k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 667k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 667k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 667k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 667k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 667k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 667k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 667k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 667k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 667k | constexpr bool READ_32_BITS = | 202 | 667k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 667k | if (READ_32_BITS) { | 205 | 666k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 666k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 666k | return word & mask; | 208 | 666k | } | 209 | | | 210 | 1.16k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.16k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.16k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.16k | return word & mask; | 220 | 667k | } |
_ZN5doris11UnpackValueILi16ELi21ELb1EEEmPKh Line | Count | Source | 175 | 667k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 667k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 667k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 667k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 667k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 667k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 667k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 667k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 667k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 667k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 667k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 667k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 667k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 667k | constexpr bool READ_32_BITS = | 202 | 667k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 667k | if (READ_32_BITS) { | 205 | 665k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 665k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 665k | return word & mask; | 208 | 665k | } | 209 | | | 210 | 1.54k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.54k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.54k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.54k | return word & mask; | 220 | 667k | } |
_ZN5doris11UnpackValueILi16ELi22ELb1EEEmPKh Line | Count | Source | 175 | 666k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 666k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 666k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 666k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 666k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 666k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 666k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 666k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 666k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 666k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 666k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 666k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 666k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 666k | constexpr bool READ_32_BITS = | 202 | 666k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 666k | if (READ_32_BITS) { | 205 | 665k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 665k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 665k | return word & mask; | 208 | 665k | } | 209 | | | 210 | 1.07k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.07k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.07k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.07k | return word & mask; | 220 | 666k | } |
_ZN5doris11UnpackValueILi16ELi23ELb1EEEmPKh Line | Count | Source | 175 | 665k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 665k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 665k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 665k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 665k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 665k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 665k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 665k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 665k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 665k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 665k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 665k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 665k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 665k | constexpr bool READ_32_BITS = | 202 | 665k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 665k | if (READ_32_BITS) { | 205 | 664k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 664k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 664k | return word & mask; | 208 | 664k | } | 209 | | | 210 | 1.03k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.03k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.03k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.03k | return word & mask; | 220 | 665k | } |
_ZN5doris11UnpackValueILi16ELi24ELb1EEEmPKh Line | Count | Source | 175 | 665k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 665k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 665k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 665k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 665k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 665k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 665k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 665k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 665k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 665k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 665k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 665k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 665k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 665k | constexpr bool READ_32_BITS = | 202 | 665k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 665k | if (READ_32_BITS) { | 205 | 662k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 662k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 662k | return word & mask; | 208 | 662k | } | 209 | | | 210 | 2.78k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.78k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.78k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.78k | return word & mask; | 220 | 665k | } |
_ZN5doris11UnpackValueILi16ELi25ELb1EEEmPKh Line | Count | Source | 175 | 664k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 664k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 664k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 664k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 664k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 664k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 664k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 664k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 664k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 664k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 664k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 664k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 664k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 664k | constexpr bool READ_32_BITS = | 202 | 664k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 664k | if (READ_32_BITS) { | 205 | 661k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 661k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 661k | return word & mask; | 208 | 661k | } | 209 | | | 210 | 2.84k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.84k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.84k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.84k | return word & mask; | 220 | 664k | } |
_ZN5doris11UnpackValueILi16ELi26ELb1EEEmPKh Line | Count | Source | 175 | 664k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 664k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 664k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 664k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 664k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 664k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 664k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 664k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 664k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 664k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 664k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 664k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 664k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 664k | constexpr bool READ_32_BITS = | 202 | 664k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 664k | if (READ_32_BITS) { | 205 | 661k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 661k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 661k | return word & mask; | 208 | 661k | } | 209 | | | 210 | 2.91k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.91k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.91k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.91k | return word & mask; | 220 | 664k | } |
_ZN5doris11UnpackValueILi16ELi27ELb1EEEmPKh Line | Count | Source | 175 | 663k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 663k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 663k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 663k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 663k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 663k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 663k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 663k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 663k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 663k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 663k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 663k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 663k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 663k | constexpr bool READ_32_BITS = | 202 | 663k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 663k | if (READ_32_BITS) { | 205 | 659k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 659k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 659k | return word & mask; | 208 | 659k | } | 209 | | | 210 | 3.71k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.71k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.71k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.71k | return word & mask; | 220 | 663k | } |
_ZN5doris11UnpackValueILi16ELi28ELb1EEEmPKh Line | Count | Source | 175 | 663k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 663k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 663k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 663k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 663k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 663k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 663k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 663k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 663k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 663k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 663k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 663k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 663k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 663k | constexpr bool READ_32_BITS = | 202 | 663k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 663k | if (READ_32_BITS) { | 205 | 659k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 659k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 659k | return word & mask; | 208 | 659k | } | 209 | | | 210 | 3.41k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 3.41k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 3.41k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 3.41k | return word & mask; | 220 | 663k | } |
_ZN5doris11UnpackValueILi16ELi29ELb1EEEmPKh Line | Count | Source | 175 | 662k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 662k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 662k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 662k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 662k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 662k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 662k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 662k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 662k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 662k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 662k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 662k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 662k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 662k | constexpr bool READ_32_BITS = | 202 | 662k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 662k | if (READ_32_BITS) { | 205 | 660k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 660k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 660k | return word & mask; | 208 | 660k | } | 209 | | | 210 | 1.97k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.97k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.97k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.97k | return word & mask; | 220 | 662k | } |
_ZN5doris11UnpackValueILi16ELi30ELb1EEEmPKh Line | Count | Source | 175 | 662k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 662k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 662k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 662k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 662k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 662k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 662k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 662k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 662k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 662k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 662k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 662k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 662k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 662k | constexpr bool READ_32_BITS = | 202 | 662k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 662k | if (READ_32_BITS) { | 205 | 660k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 660k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 660k | return word & mask; | 208 | 660k | } | 209 | | | 210 | 2.16k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2.16k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2.16k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2.16k | return word & mask; | 220 | 662k | } |
_ZN5doris11UnpackValueILi16ELi31ELb1EEEmPKh Line | Count | Source | 175 | 660k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 660k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 660k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 660k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 660k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 660k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 660k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 660k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 660k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 660k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 660k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 660k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 660k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 660k | constexpr bool READ_32_BITS = | 202 | 660k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 660k | if (READ_32_BITS) { | 205 | 660k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 660k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 660k | return word & mask; | 208 | 660k | } | 209 | | | 210 | 124 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 124 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 124 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 124 | return word & mask; | 220 | 660k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi16ELi23ELb0EEEmPKh Line | Count | Source | 175 | 46.8k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 46.8k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 46.8k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 46.8k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 46.8k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 46.8k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 46.8k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 46.8k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 46.8k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 46.8k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 46.8k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 46.8k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 46.8k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 46.8k | constexpr bool READ_32_BITS = | 202 | 46.8k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 46.8k | if (READ_32_BITS) { | 205 | 46.8k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 46.8k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 46.8k | return word & mask; | 208 | 46.8k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 46.8k | } |
_ZN5doris11UnpackValueILi16ELi22ELb0EEEmPKh Line | Count | Source | 175 | 46.8k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 46.8k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 46.8k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 46.8k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 46.8k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 46.8k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 46.8k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 46.8k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 46.8k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 46.8k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 46.8k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 46.8k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 46.8k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 46.8k | constexpr bool READ_32_BITS = | 202 | 46.8k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 46.8k | if (READ_32_BITS) { | 205 | 46.8k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 46.8k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 46.8k | return word & mask; | 208 | 46.8k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 46.8k | } |
_ZN5doris11UnpackValueILi16ELi21ELb0EEEmPKh Line | Count | Source | 175 | 46.8k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 46.8k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 46.8k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 46.8k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 46.8k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 46.8k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 46.8k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 46.8k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 46.8k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 46.8k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 46.8k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 46.8k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 46.8k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 46.8k | constexpr bool READ_32_BITS = | 202 | 46.8k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 46.8k | if (READ_32_BITS) { | 205 | 46.8k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 46.8k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 46.8k | return word & mask; | 208 | 46.8k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 46.8k | } |
_ZN5doris11UnpackValueILi16ELi20ELb0EEEmPKh Line | Count | Source | 175 | 46.8k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 46.8k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 46.8k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 46.8k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 46.8k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 46.8k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 46.8k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 46.8k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 46.8k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 46.8k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 46.8k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 46.8k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 46.8k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 46.8k | constexpr bool READ_32_BITS = | 202 | 46.8k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 46.8k | if (READ_32_BITS) { | 205 | 46.8k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 46.8k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 46.8k | return word & mask; | 208 | 46.8k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 46.8k | } |
_ZN5doris11UnpackValueILi16ELi19ELb0EEEmPKh Line | Count | Source | 175 | 46.8k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 46.8k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 46.8k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 46.8k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 46.8k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 46.8k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 46.8k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 46.8k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 46.8k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 46.8k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 46.8k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 46.8k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 46.8k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 46.8k | constexpr bool READ_32_BITS = | 202 | 46.8k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 46.8k | if (READ_32_BITS) { | 205 | 46.8k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 46.8k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 46.8k | return word & mask; | 208 | 46.8k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 46.8k | } |
_ZN5doris11UnpackValueILi16ELi18ELb0EEEmPKh Line | Count | Source | 175 | 46.8k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 46.8k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 46.8k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 46.8k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 46.8k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 46.8k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 46.8k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 46.8k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 46.8k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 46.8k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 46.8k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 46.8k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 46.8k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 46.8k | constexpr bool READ_32_BITS = | 202 | 46.8k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 46.8k | if (READ_32_BITS) { | 205 | 46.8k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 46.8k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 46.8k | return word & mask; | 208 | 46.8k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 46.8k | } |
_ZN5doris11UnpackValueILi16ELi17ELb0EEEmPKh Line | Count | Source | 175 | 46.8k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 46.8k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 46.8k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 46.8k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 46.8k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 46.8k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 46.8k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 46.8k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 46.8k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 46.8k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 46.8k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 46.8k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 46.8k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 46.8k | constexpr bool READ_32_BITS = | 202 | 46.8k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 46.8k | if (READ_32_BITS) { | 205 | 46.8k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 46.8k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 46.8k | return word & mask; | 208 | 46.8k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 46.8k | } |
_ZN5doris11UnpackValueILi16ELi16ELb0EEEmPKh Line | Count | Source | 175 | 46.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 46.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 46.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 46.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 46.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 46.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 46.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 46.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 46.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 46.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 46.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 46.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 46.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 46.9k | constexpr bool READ_32_BITS = | 202 | 46.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 46.9k | if (READ_32_BITS) { | 205 | 46.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 46.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 46.9k | return word & mask; | 208 | 46.9k | } | 209 | | | 210 | 8 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 8 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 8 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 8 | return word & mask; | 220 | 46.9k | } |
_ZN5doris11UnpackValueILi16ELi15ELb0EEEmPKh Line | Count | Source | 175 | 48.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 48.2k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 48.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 48.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 48.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 48.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 48.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 48.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 48.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 48.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 48.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 48.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 48.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 48.2k | constexpr bool READ_32_BITS = | 202 | 48.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 48.2k | if (READ_32_BITS) { | 205 | 48.2k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 48.2k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 48.2k | return word & mask; | 208 | 48.2k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 48.2k | } |
_ZN5doris11UnpackValueILi16ELi14ELb0EEEmPKh Line | Count | Source | 175 | 48.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 48.2k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 48.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 48.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 48.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 48.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 48.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 48.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 48.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 48.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 48.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 48.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 48.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 48.2k | constexpr bool READ_32_BITS = | 202 | 48.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 48.2k | if (READ_32_BITS) { | 205 | 48.2k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 48.2k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 48.2k | return word & mask; | 208 | 48.2k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 48.2k | } |
_ZN5doris11UnpackValueILi16ELi13ELb0EEEmPKh Line | Count | Source | 175 | 48.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 48.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 48.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 48.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 48.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 48.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 48.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 48.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 48.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 48.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 48.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 48.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 48.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 48.3k | constexpr bool READ_32_BITS = | 202 | 48.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 48.3k | if (READ_32_BITS) { | 205 | 48.3k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 48.3k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 48.3k | return word & mask; | 208 | 48.3k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 48.3k | } |
_ZN5doris11UnpackValueILi16ELi12ELb0EEEmPKh Line | Count | Source | 175 | 48.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 48.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 48.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 48.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 48.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 48.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 48.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 48.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 48.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 48.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 48.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 48.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 48.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 48.3k | constexpr bool READ_32_BITS = | 202 | 48.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 48.3k | if (READ_32_BITS) { | 205 | 48.3k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 48.3k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 48.3k | return word & mask; | 208 | 48.3k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 48.3k | } |
_ZN5doris11UnpackValueILi16ELi11ELb0EEEmPKh Line | Count | Source | 175 | 48.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 48.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 48.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 48.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 48.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 48.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 48.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 48.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 48.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 48.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 48.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 48.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 48.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 48.3k | constexpr bool READ_32_BITS = | 202 | 48.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 48.3k | if (READ_32_BITS) { | 205 | 48.3k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 48.3k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 48.3k | return word & mask; | 208 | 48.3k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 48.3k | } |
_ZN5doris11UnpackValueILi16ELi10ELb0EEEmPKh Line | Count | Source | 175 | 48.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 48.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 48.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 48.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 48.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 48.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 48.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 48.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 48.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 48.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 48.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 48.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 48.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 48.3k | constexpr bool READ_32_BITS = | 202 | 48.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 48.3k | if (READ_32_BITS) { | 205 | 48.3k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 48.3k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 48.3k | return word & mask; | 208 | 48.3k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 48.3k | } |
_ZN5doris11UnpackValueILi16ELi9ELb0EEEmPKh Line | Count | Source | 175 | 48.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 48.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 48.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 48.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 48.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 48.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 48.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 48.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 48.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 48.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 48.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 48.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 48.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 48.3k | constexpr bool READ_32_BITS = | 202 | 48.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 48.3k | if (READ_32_BITS) { | 205 | 48.3k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 18.4E | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 48.3k | return word & mask; | 208 | 48.3k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 48.3k | } |
_ZN5doris11UnpackValueILi16ELi8ELb0EEEmPKh Line | Count | Source | 175 | 48.3k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 48.3k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 48.3k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 48.3k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 48.3k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 48.3k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 48.3k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 48.3k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 48.3k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 48.3k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 48.3k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 48.3k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 48.3k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 48.3k | constexpr bool READ_32_BITS = | 202 | 48.3k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 48.3k | if (READ_32_BITS) { | 205 | 48.3k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 48.3k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 48.3k | return word & mask; | 208 | 48.3k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 48.3k | } |
_ZN5doris11UnpackValueILi16ELi7ELb0EEEmPKh Line | Count | Source | 175 | 50.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 50.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 50.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 50.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 50.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 50.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 50.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 50.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 50.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 50.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 50.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 50.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 50.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 50.0k | constexpr bool READ_32_BITS = | 202 | 50.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 50.0k | if (READ_32_BITS) { | 205 | 50.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 50.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 50.0k | return word & mask; | 208 | 50.0k | } | 209 | | | 210 | 8 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 8 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 8 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 8 | return word & mask; | 220 | 50.0k | } |
_ZN5doris11UnpackValueILi16ELi6ELb0EEEmPKh Line | Count | Source | 175 | 50.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 50.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 50.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 50.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 50.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 50.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 50.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 50.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 50.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 50.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 50.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 50.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 50.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 50.0k | constexpr bool READ_32_BITS = | 202 | 50.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 50.0k | if (READ_32_BITS) { | 205 | 50.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 50.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 50.0k | return word & mask; | 208 | 50.0k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 50.0k | } |
_ZN5doris11UnpackValueILi16ELi5ELb0EEEmPKh Line | Count | Source | 175 | 50.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 50.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 50.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 50.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 50.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 50.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 50.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 50.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 50.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 50.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 50.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 50.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 50.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 50.0k | constexpr bool READ_32_BITS = | 202 | 50.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 50.0k | if (READ_32_BITS) { | 205 | 50.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 50.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 50.0k | return word & mask; | 208 | 50.0k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 50.0k | } |
_ZN5doris11UnpackValueILi16ELi4ELb0EEEmPKh Line | Count | Source | 175 | 50.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 50.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 50.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 50.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 50.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 50.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 50.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 50.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 50.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 50.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 50.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 50.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 50.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 50.0k | constexpr bool READ_32_BITS = | 202 | 50.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 50.0k | if (READ_32_BITS) { | 205 | 50.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 50.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 50.0k | return word & mask; | 208 | 50.0k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 50.0k | } |
_ZN5doris11UnpackValueILi16ELi3ELb0EEEmPKh Line | Count | Source | 175 | 50.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 50.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 50.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 50.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 50.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 50.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 50.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 50.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 50.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 50.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 50.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 50.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 50.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 50.0k | constexpr bool READ_32_BITS = | 202 | 50.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 50.0k | if (READ_32_BITS) { | 205 | 50.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 50.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 50.0k | return word & mask; | 208 | 50.0k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 50.0k | } |
_ZN5doris11UnpackValueILi16ELi2ELb0EEEmPKh Line | Count | Source | 175 | 50.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 50.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 50.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 50.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 50.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 50.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 50.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 50.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 50.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 50.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 50.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 50.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 50.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 50.0k | constexpr bool READ_32_BITS = | 202 | 50.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 50.0k | if (READ_32_BITS) { | 205 | 50.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 50.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 50.0k | return word & mask; | 208 | 50.0k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 50.0k | } |
_ZN5doris11UnpackValueILi16ELi1ELb0EEEmPKh Line | Count | Source | 175 | 50.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 50.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 50.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 50.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 50.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 50.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 50.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 50.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 50.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 50.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 50.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 50.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 50.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 50.0k | constexpr bool READ_32_BITS = | 202 | 50.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 50.0k | if (READ_32_BITS) { | 205 | 50.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 50.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 50.0k | return word & mask; | 208 | 50.0k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 50.0k | } |
_ZN5doris11UnpackValueILi16ELi0ELb0EEEmPKh Line | Count | Source | 175 | 50.0k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 50.0k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 50.0k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 50.0k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 50.0k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 50.0k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 50.0k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 50.0k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 50.0k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 50.0k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 50.0k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 50.0k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 50.0k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 50.0k | constexpr bool READ_32_BITS = | 202 | 50.0k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 50.0k | if (READ_32_BITS) { | 205 | 50.0k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 50.0k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 50.0k | return word & mask; | 208 | 50.0k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 50.0k | } |
_ZN5doris11UnpackValueILi17ELi0ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi1ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi2ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi3ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi4ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi5ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi6ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi7ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi8ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi9ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi10ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi11ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi12ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi13ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi14ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi15ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi16ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi17ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi18ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi19ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi20ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi21ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi22ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi23ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi24ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi25ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi26ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi27ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi28ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi29ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi30ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 1.21M | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 1.21M | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 1.21M | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 1.21M | return word & mask; | 220 | 1.21M | } |
_ZN5doris11UnpackValueILi17ELi31ELb1EEEmPKh Line | Count | Source | 175 | 1.21M | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 1.21M | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 1.21M | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 1.21M | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 1.21M | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 1.21M | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 1.21M | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 1.21M | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 1.21M | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 1.21M | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 1.21M | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 1.21M | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 1.21M | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 1.21M | constexpr bool READ_32_BITS = | 202 | 1.21M | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 1.21M | if (READ_32_BITS) { | 205 | 1.21M | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 1.21M | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 1.21M | return word & mask; | 208 | 1.21M | } | 209 | | | 210 | 554 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 554 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 554 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 554 | return word & mask; | 220 | 1.21M | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi17ELi23ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 82.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 82.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 82.9k | return word & mask; | 208 | 82.9k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi22ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 82.9k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82.9k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82.9k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82.9k | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi21ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 82.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 82.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 82.9k | return word & mask; | 208 | 82.9k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi20ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 82.9k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82.9k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82.9k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82.9k | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi19ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 82.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 82.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 82.9k | return word & mask; | 208 | 82.9k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi18ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 82.9k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82.9k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82.9k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82.9k | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi17ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 82.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 82.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 82.9k | return word & mask; | 208 | 82.9k | } | 209 | | | 210 | 6 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 6 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 6 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 6 | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi16ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 82.9k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82.9k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82.9k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82.9k | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi15ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 82.9k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82.9k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82.9k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82.9k | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi14ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 82.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 82.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 82.9k | return word & mask; | 208 | 82.9k | } | 209 | | | 210 | 6 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 6 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 6 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 6 | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi13ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 82.9k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82.9k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82.9k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82.9k | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi12ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 82.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 82.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 82.9k | return word & mask; | 208 | 82.9k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi11ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 82.9k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82.9k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82.9k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82.9k | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi10ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 82.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 82.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 82.9k | return word & mask; | 208 | 82.9k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi9ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 82.9k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82.9k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82.9k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82.9k | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi8ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 82.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 82.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 82.9k | return word & mask; | 208 | 82.9k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi7ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 82.9k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82.9k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82.9k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82.9k | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi6ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 82.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 18.4E | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 82.9k | return word & mask; | 208 | 82.9k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi5ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 82.9k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82.9k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82.9k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82.9k | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi4ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 82.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 82.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 82.9k | return word & mask; | 208 | 82.9k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi3ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 82.9k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82.9k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82.9k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82.9k | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi2ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 82.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 82.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 82.9k | return word & mask; | 208 | 82.9k | } | 209 | | | 210 | 8 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 8 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 8 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 8 | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi1ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 82.9k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 82.9k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 82.9k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 82.9k | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi17ELi0ELb0EEEmPKh Line | Count | Source | 175 | 82.9k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 82.9k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 82.9k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 82.9k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 82.9k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 82.9k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 82.9k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 82.9k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 82.9k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 82.9k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 82.9k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 82.9k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 82.9k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 82.9k | constexpr bool READ_32_BITS = | 202 | 82.9k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 82.9k | if (READ_32_BITS) { | 205 | 82.9k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 82.9k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 82.9k | return word & mask; | 208 | 82.9k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 82.9k | } |
_ZN5doris11UnpackValueILi18ELi0ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi1ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi2ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi3ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi4ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi5ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi6ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi7ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi8ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi9ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi10ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi11ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi12ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi13ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi14ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi15ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi16ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi17ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi18ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi19ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi20ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi21ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi22ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi23ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi24ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi25ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi26ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi27ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi28ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi29ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi30ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 381k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 381k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 381k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 381k | return word & mask; | 220 | 381k | } |
_ZN5doris11UnpackValueILi18ELi31ELb1EEEmPKh Line | Count | Source | 175 | 381k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 381k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 381k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 381k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 381k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 381k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 381k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 381k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 381k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 381k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 381k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 381k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 381k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 381k | constexpr bool READ_32_BITS = | 202 | 381k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 381k | if (READ_32_BITS) { | 205 | 381k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 381k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 381k | return word & mask; | 208 | 381k | } | 209 | | | 210 | 18.4E | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 18.4E | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 18.4E | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 18.4E | return word & mask; | 220 | 381k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi18ELi23ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 25.6k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 25.6k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 25.6k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 25.6k | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi22ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 25.6k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 25.6k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 25.6k | return word & mask; | 208 | 25.6k | } | 209 | | | 210 | 4 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 4 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 4 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 4 | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi21ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 25.6k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 25.6k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 25.6k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 25.6k | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi20ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 25.6k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 25.6k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 25.6k | return word & mask; | 208 | 25.6k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi19ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 25.6k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 25.6k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 25.6k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 25.6k | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi18ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 25.6k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 25.6k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 25.6k | return word & mask; | 208 | 25.6k | } | 209 | | | 210 | 2 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 2 | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 2 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 2 | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi17ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 25.6k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 25.6k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 25.6k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 25.6k | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi16ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 25.6k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 25.6k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 25.6k | return word & mask; | 208 | 25.6k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi15ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 25.6k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 25.6k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 25.6k | return word & mask; | 208 | 25.6k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi14ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 25.6k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 25.6k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 25.6k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 25.6k | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi13ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 25.6k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 25.6k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 25.6k | return word & mask; | 208 | 25.6k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi12ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 25.6k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 25.6k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 25.6k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 25.6k | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi11ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 25.6k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 25.6k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 25.6k | return word & mask; | 208 | 25.6k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi10ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 25.6k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 25.6k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 25.6k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 25.6k | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi9ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 25.6k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 25.6k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 25.6k | return word & mask; | 208 | 25.6k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi8ELb0EEEmPKh Line | Count | Source | 175 | 25.6k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.6k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.6k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.6k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.6k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.6k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.6k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.6k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.6k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.6k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.6k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.6k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.6k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.6k | constexpr bool READ_32_BITS = | 202 | 25.6k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.6k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 25.6k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 25.6k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 25.6k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 25.6k | return word & mask; | 220 | 25.6k | } |
_ZN5doris11UnpackValueILi18ELi7ELb0EEEmPKh Line | Count | Source | 175 | 25.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.7k | constexpr bool READ_32_BITS = | 202 | 25.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.7k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 25.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 25.7k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 25.7k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 25.7k | return word & mask; | 220 | 25.7k | } |
_ZN5doris11UnpackValueILi18ELi6ELb0EEEmPKh Line | Count | Source | 175 | 25.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.7k | constexpr bool READ_32_BITS = | 202 | 25.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.7k | if (READ_32_BITS) { | 205 | 25.7k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 25.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 25.7k | return word & mask; | 208 | 25.7k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 25.7k | } |
_ZN5doris11UnpackValueILi18ELi5ELb0EEEmPKh Line | Count | Source | 175 | 25.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.7k | constexpr bool READ_32_BITS = | 202 | 25.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.7k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 25.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 25.7k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 25.7k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 25.7k | return word & mask; | 220 | 25.7k | } |
_ZN5doris11UnpackValueILi18ELi4ELb0EEEmPKh Line | Count | Source | 175 | 25.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.7k | constexpr bool READ_32_BITS = | 202 | 25.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.7k | if (READ_32_BITS) { | 205 | 25.7k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 25.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 25.7k | return word & mask; | 208 | 25.7k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 25.7k | } |
_ZN5doris11UnpackValueILi18ELi3ELb0EEEmPKh Line | Count | Source | 175 | 25.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.7k | constexpr bool READ_32_BITS = | 202 | 25.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.7k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 25.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 25.7k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 25.7k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 25.7k | return word & mask; | 220 | 25.7k | } |
_ZN5doris11UnpackValueILi18ELi2ELb0EEEmPKh Line | Count | Source | 175 | 25.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.7k | constexpr bool READ_32_BITS = | 202 | 25.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.7k | if (READ_32_BITS) { | 205 | 25.7k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 25.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 25.7k | return word & mask; | 208 | 25.7k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 25.7k | } |
_ZN5doris11UnpackValueILi18ELi1ELb0EEEmPKh Line | Count | Source | 175 | 25.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.7k | constexpr bool READ_32_BITS = | 202 | 25.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.7k | if (READ_32_BITS) { | 205 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 0 | return word & mask; | 208 | 0 | } | 209 | | | 210 | 25.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 25.7k | word >>= FIRST_BIT_OFFSET; | 212 | | | 213 | 25.7k | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | | | 219 | 25.7k | return word & mask; | 220 | 25.7k | } |
_ZN5doris11UnpackValueILi18ELi0ELb0EEEmPKh Line | Count | Source | 175 | 25.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 176 | 25.7k | if (BIT_WIDTH == 0) return 0; | 177 | | | 178 | 25.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 179 | 25.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 180 | 25.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 181 | 25.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 182 | 25.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 183 | 25.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 184 | | | 185 | 25.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 186 | 25.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 187 | 25.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 188 | | | 189 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 190 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 191 | | // enough space in the buffer from the current reading point. | 192 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 193 | | // is faster. | 194 | 25.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 195 | 25.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 196 | | | 197 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 198 | | // necessary) because performance benchmarks show that it is better this way. This seems | 199 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 200 | | // compiler version. | 201 | 25.7k | constexpr bool READ_32_BITS = | 202 | 25.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 203 | | | 204 | 25.7k | if (READ_32_BITS) { | 205 | 25.7k | uint32_t word = in[FIRST_WORD_IDX]; | 206 | 25.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 207 | 25.7k | return word & mask; | 208 | 25.7k | } | 209 | | | 210 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 211 | 0 | word >>= FIRST_BIT_OFFSET; | 212 | |
| 213 | 0 | if (WORDS_TO_READ > 2) { | 214 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 215 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 216 | 0 | word |= extra_word << USEFUL_BITS; | 217 | 0 | } | 218 | |
| 219 | 0 | return word & mask; | 220 | 25.7k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi0ELb0EEEmPKh |