/var/local/thirdparty/installed/include/roaring/bitset_util.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef BITSET_UTIL_H |
2 | | #define BITSET_UTIL_H |
3 | | |
4 | | #include <stdint.h> |
5 | | |
6 | | #include <roaring/portability.h> |
7 | | #include <roaring/utilasm.h> |
8 | | |
9 | | #if CROARING_IS_X64 |
10 | | #ifndef CROARING_COMPILER_SUPPORTS_AVX512 |
11 | | #error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." |
12 | | #endif // CROARING_COMPILER_SUPPORTS_AVX512 |
13 | | #endif |
14 | | |
15 | | #ifdef __cplusplus |
16 | | extern "C" { namespace roaring { namespace internal { |
17 | | #endif |
18 | | |
/*
 * Turn on every bit whose index lies in the half-open range [start, end).
 *
 * words: array of 64-bit words; bit k lives in words[k / 64] at position k % 64.
 * start: index of the first bit to set.
 * end:   index one past the last bit to set.
 *
 * An empty range (start == end) leaves the bitset untouched. No bounds
 * checking is performed: words must extend at least to word (end - 1) / 64.
 */
static inline void bitset_set_range(uint64_t *words, uint32_t start,
                                    uint32_t end) {
    if (start == end) {
        return;
    }

    const uint64_t all_ones = ~UINT64_C(0);
    const uint32_t head = start / 64;
    const uint32_t tail = (end - 1) / 64;
    /* Bits at or above (start % 64) within the first word. */
    const uint64_t head_mask = all_ones << (start % 64);
    /* Bits below (end % 64) within the last word; (~end + 1) is -end, so a
     * word-aligned end yields a shift of 0 and selects the whole word. */
    const uint64_t tail_mask = all_ones >> ((~end + 1) % 64);

    if (head == tail) {
        /* The range sits inside one word: intersect both masks. */
        words[head] |= head_mask & tail_mask;
        return;
    }

    words[head] |= head_mask;
    uint32_t idx = head + 1;
    while (idx < tail) {
        words[idx] = all_ones;
        idx++;
    }
    words[tail] |= tail_mask;
}
38 | | |
39 | | |
/*
 * Count the set bits in the closed index range [start, start + lenminusone].
 *
 * words:       array of 64-bit words holding the bitset.
 * start:       index of the first bit examined.
 * lenminusone: length of the range minus one (0 examines a single bit).
 *
 * Returns the sum of roaring_hamming() over the masked words. No bounds
 * checking is performed.
 */
static inline int bitset_lenrange_cardinality(const uint64_t *words,
                                              uint32_t start,
                                              uint32_t lenminusone) {
    const uint64_t all_ones = ~UINT64_C(0);
    const uint32_t head = start / 64;
    const uint32_t tail = (start + lenminusone) / 64;

    if (head == tail) {
        /* lenminusone + 1 low bits, moved up to the start position. */
        const uint64_t window =
            (all_ones >> ((63 - lenminusone) % 64)) << (start % 64);
        return roaring_hamming(words[head] & window);
    }

    /* First word: everything at or above the start position. */
    int total = roaring_hamming(words[head] & (all_ones << (start % 64)));

    /* Interior words are counted whole. */
    uint32_t idx = head + 1;
    while (idx < tail) {
        total += roaring_hamming(words[idx]);
        idx++;
    }

    /* Last word: keep only the bits up to and including the range end. */
    const uint64_t tail_mask =
        all_ones >> (((~start + 1) - lenminusone - 1) % 64);
    total += roaring_hamming(words[tail] & tail_mask);
    return total;
}
62 | | |
/*
 * Report whether no bit is set in the closed index range
 * [start, start + lenminusone].
 *
 * words:       array of 64-bit words holding the bitset.
 * start:       index of the first bit examined.
 * lenminusone: length of the range minus one (0 examines a single bit).
 *
 * Returns true when every bit in the range is zero, false as soon as a set
 * bit is found. No bounds checking is performed.
 */
static inline bool bitset_lenrange_empty(const uint64_t *words, uint32_t start,
                                         uint32_t lenminusone) {
    const uint64_t all_ones = ~UINT64_C(0);
    const uint32_t head = start / 64;
    const uint32_t tail = (start + lenminusone) / 64;

    if (head == tail) {
        /* lenminusone + 1 low bits, moved up to the start position. */
        const uint64_t window =
            (all_ones >> ((63 - lenminusone) % 64)) << (start % 64);
        return (words[head] & window) == 0;
    }

    /* First word: bits at or above the start position. */
    const uint64_t head_mask = all_ones << (start % 64);
    if ((words[head] & head_mask) != 0) {
        return false;
    }

    /* Interior words must be entirely zero. */
    uint32_t idx = head + 1;
    while (idx < tail) {
        if (words[idx] != 0) {
            return false;
        }
        idx++;
    }

    /* Last word: bits up to and including the range end. */
    const uint64_t tail_mask =
        all_ones >> (((~start + 1) - lenminusone - 1) % 64);
    return (words[tail] & tail_mask) == 0;
}
87 | | |
88 | | |
/*
 * Turn on every bit in the closed index range [start, start + lenminusone].
 *
 * words:       array of 64-bit words holding the bitset.
 * start:       index of the first bit to set.
 * lenminusone: length of the range minus one (0 sets a single bit).
 *
 * No bounds checking is performed: words must extend at least to word
 * (start + lenminusone) / 64.
 */
static inline void bitset_set_lenrange(uint64_t *words, uint32_t start,
                                       uint32_t lenminusone) {
    const uint64_t all_ones = ~UINT64_C(0);
    const uint32_t head = start / 64;
    const uint32_t tail = (start + lenminusone) / 64;

    if (head == tail) {
        /* lenminusone + 1 low bits, moved up to the start position. */
        words[head] |=
            (all_ones >> ((63 - lenminusone) % 64)) << (start % 64);
        return;
    }

    /* The pairwise fill below may overwrite the last word, so remember its
     * original contents and merge them back in at the end. */
    const uint64_t saved_tail = words[tail];

    words[head] |= all_ones << (start % 64);

    /* Fill interior words two at a time; idx + 1 never exceeds tail. */
    uint32_t idx = head + 1;
    while (idx < tail) {
        words[idx + 1] = all_ones;
        words[idx] = all_ones;
        idx += 2;
    }

    /* Last word: original bits plus everything up to the range end. */
    const uint64_t tail_mask =
        all_ones >> (((~start + 1) - lenminusone - 1) % 64);
    words[tail] = saved_tail | tail_mask;
}
108 | | |
/*
 * Invert every bit whose index lies in the half-open range [start, end).
 *
 * words: array of 64-bit words holding the bitset.
 * start: index of the first bit to flip.
 * end:   index one past the last bit to flip.
 *
 * An empty range (start == end) leaves the bitset untouched. No bounds
 * checking is performed.
 */
static inline void bitset_flip_range(uint64_t *words, uint32_t start,
                                     uint32_t end) {
    if (start == end) {
        return;
    }

    const uint64_t all_ones = ~UINT64_C(0);
    const uint32_t head = start / 64;
    const uint32_t tail = (end - 1) / 64;

    /* Pre-flip the bits below the start position in the first word. Either
     * the whole-word flip in the loop or, for a single-word range, the final
     * XOR flips them back, so they end up unchanged. */
    const uint64_t below_start = ~(all_ones << (start % 64));
    words[head] ^= below_start;

    /* Invert every word before the last one, including the first. */
    uint32_t idx = head;
    while (idx < tail) {
        words[idx] = ~words[idx];
        idx++;
    }

    /* Last word: flip only the bits below the end position. */
    const uint64_t below_end = all_ones >> ((~end + 1) % 64);
    words[tail] ^= below_end;
}
123 | | |
/*
 * Turn off every bit whose index lies in the half-open range [start, end).
 *
 * words: array of 64-bit words holding the bitset.
 * start: index of the first bit to clear.
 * end:   index one past the last bit to clear.
 *
 * An empty range (start == end) leaves the bitset untouched. No bounds
 * checking is performed: words must extend at least to word (end - 1) / 64.
 */
static inline void bitset_reset_range(uint64_t *words, uint32_t start,
                                      uint32_t end) {
    if (start == end) {
        return;
    }

    const uint64_t all_ones = ~UINT64_C(0);
    const uint32_t head = start / 64;
    const uint32_t tail = (end - 1) / 64;
    /* Bits at or above (start % 64) within the first word. */
    const uint64_t head_mask = all_ones << (start % 64);
    /* Bits below (end % 64) within the last word (whole word if aligned). */
    const uint64_t tail_mask = all_ones >> ((~end + 1) % 64);

    if (head == tail) {
        /* The range sits inside one word: clear the mask intersection. */
        words[head] &= ~(head_mask & tail_mask);
        return;
    }

    words[head] &= ~head_mask;
    uint32_t idx = head + 1;
    while (idx < tail) {
        words[idx] = UINT64_C(0);
        idx++;
    }
    words[tail] &= ~tail_mask;
}
143 | | |
144 | | /* |
145 | | * Given a bitset containing "length" 64-bit words, write out the position |
146 | | * of all the set bits to "out", values start at "base". |
147 | | * |
148 | | * The "out" pointer should be sufficient to store the actual number of bits |
149 | | * set. |
150 | | * |
151 | | * Returns how many values were actually decoded. |
152 | | * |
153 | | * This function should only be expected to be faster than |
154 | | * bitset_extract_setbits |
155 | | * when the density of the bitset is high. |
156 | | * |
157 | | * This function uses AVX2 decoding. |
158 | | */ |
159 | | size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length, |
160 | | uint32_t *out, size_t outcapacity, |
161 | | uint32_t base); |
162 | | |
163 | | size_t bitset_extract_setbits_avx512(const uint64_t *words, size_t length, |
164 | | uint32_t *out, size_t outcapacity, |
165 | | uint32_t base); |
166 | | /* |
167 | | * Given a bitset containing "length" 64-bit words, write out the position |
168 | | * of all the set bits to "out", values start at "base". |
169 | | * |
170 | | * The "out" buffer must be large enough to hold one value for every bit |
171 | | * that is set. |
172 | | * |
173 | | * Returns how many values were actually decoded. |
174 | | */ |
175 | | size_t bitset_extract_setbits(const uint64_t *words, size_t length, |
176 | | uint32_t *out, uint32_t base); |
177 | | |
178 | | /* |
179 | | * Given a bitset containing "length" 64-bit words, write out the position |
180 | | * of all the set bits to "out" as 16-bit integers, values start at "base" |
181 | | * (can be set to zero). |
182 | | * |
183 | | * The "out" buffer must be large enough to hold one value for every bit |
184 | | * that is set. |
185 | | * |
186 | | * Returns how many values were actually decoded. |
187 | | * |
188 | | * This function should only be expected to be faster than |
189 | | * bitset_extract_setbits_uint16 |
190 | | * when the density of the bitset is high. |
191 | | * |
192 | | * This function uses SSE decoding. |
193 | | */ |
194 | | size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length, |
195 | | uint16_t *out, size_t outcapacity, |
196 | | uint16_t base); |
197 | | |
198 | | size_t bitset_extract_setbits_avx512_uint16(const uint64_t *words, size_t length, |
199 | | uint16_t *out, size_t outcapacity, |
200 | | uint16_t base); |
201 | | |
202 | | /* |
203 | | * Given a bitset containing "length" 64-bit words, write out the position |
204 | | * of all the set bits to "out", values start at "base" |
205 | | * (can be set to zero) |
206 | | * |
207 | | * The "out" buffer must be large enough to hold one value for every bit |
208 | | * that is set. |
209 | | * |
210 | | * Returns how many values were actually decoded. |
211 | | */ |
212 | | size_t bitset_extract_setbits_uint16(const uint64_t *words, size_t length, |
213 | | uint16_t *out, uint16_t base); |
214 | | |
215 | | /* |
216 | | * Given two bitsets containing "length" 64-bit words, write out the position |
217 | | * of all the common set bits to "out", values start at "base" |
218 | | * (can be set to zero) |
219 | | * |
220 | | * The "out" pointer should be sufficient to store the actual number of bits |
221 | | * set. |
222 | | * |
223 | | * Returns how many values were actually decoded. |
224 | | */ |
225 | | size_t bitset_extract_intersection_setbits_uint16(const uint64_t * __restrict__ words1, |
226 | | const uint64_t * __restrict__ words2, |
227 | | size_t length, uint16_t *out, |
228 | | uint16_t base); |
229 | | |
230 | | /* |
231 | | * Given a bitset having cardinality card, set all bit values in the list (there |
232 | | * are length of them) |
233 | | * and return the updated cardinality. This evidently assumes that the bitset |
234 | | * already contained data. |
235 | | */ |
236 | | uint64_t bitset_set_list_withcard(uint64_t *words, uint64_t card, |
237 | | const uint16_t *list, uint64_t length); |
238 | | /* |
239 | | * Given a bitset, set all bit values in the list (there |
240 | | * are length of them). |
241 | | */ |
242 | | void bitset_set_list(uint64_t *words, const uint16_t *list, uint64_t length); |
243 | | |
244 | | /* |
245 | | * Given a bitset having cardinality card, unset all bit values in the list |
246 | | * (there are length of them) |
247 | | * and return the updated cardinality. This evidently assumes that the bitset |
248 | | * already contained data. |
249 | | */ |
250 | | uint64_t bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list, |
251 | | uint64_t length); |
252 | | |
253 | | /* |
254 | | * Given a bitset having cardinality card, toggle all bit values in the list |
255 | | * (there are length of them) |
256 | | * and return the updated cardinality. This evidently assumes that the bitset |
257 | | * already contained data. |
258 | | */ |
259 | | |
260 | | uint64_t bitset_flip_list_withcard(uint64_t *words, uint64_t card, |
261 | | const uint16_t *list, uint64_t length); |
262 | | |
263 | | void bitset_flip_list(uint64_t *words, const uint16_t *list, uint64_t length); |
264 | | |
265 | | #if CROARING_IS_X64 |
266 | | /*** |
267 | | * BEGIN Harley-Seal popcount functions. |
268 | | */ |
269 | | CROARING_TARGET_AVX2 |
270 | | /** |
271 | | * Compute the population count of a 256-bit word |
272 | | * This is not especially fast, but it is convenient as part of other functions. |
273 | | */ |
274 | 0 | static inline __m256i popcount256(__m256i v) { |
275 | 0 | const __m256i lookuppos = _mm256_setr_epi8( |
276 | 0 | /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2, |
277 | 0 | /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3, |
278 | 0 | /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3, |
279 | 0 | /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4, |
280 | 0 |
|
281 | 0 | /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2, |
282 | 0 | /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3, |
283 | 0 | /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3, |
284 | 0 | /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4); |
285 | 0 | const __m256i lookupneg = _mm256_setr_epi8( |
286 | 0 | /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2, |
287 | 0 | /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3, |
288 | 0 | /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3, |
289 | 0 | /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4, |
290 | 0 |
|
291 | 0 | /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2, |
292 | 0 | /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3, |
293 | 0 | /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3, |
294 | 0 | /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4); |
295 | 0 | const __m256i low_mask = _mm256_set1_epi8(0x0f); |
296 | 0 |
|
297 | 0 | const __m256i lo = _mm256_and_si256(v, low_mask); |
298 | 0 | const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); |
299 | 0 | const __m256i popcnt1 = _mm256_shuffle_epi8(lookuppos, lo); |
300 | 0 | const __m256i popcnt2 = _mm256_shuffle_epi8(lookupneg, hi); |
301 | 0 | return _mm256_sad_epu8(popcnt1, popcnt2); |
302 | 0 | } Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL11popcount256EDv4_x Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL11popcount256EDv4_x Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL11popcount256EDv4_x Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL11popcount256EDv4_x Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL11popcount256EDv4_x Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL11popcount256EDv4_x Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL11popcount256EDv4_x |
303 | | CROARING_UNTARGET_AVX2 |
304 | | |
305 | | CROARING_TARGET_AVX2 |
306 | | /** |
307 | | * Simple CSA over 256 bits |
308 | | */ |
309 | | static inline void CSA(__m256i *h, __m256i *l, __m256i a, __m256i b, |
310 | 0 | __m256i c) { |
311 | 0 | const __m256i u = _mm256_xor_si256(a, b); |
312 | 0 | *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); |
313 | 0 | *l = _mm256_xor_si256(u, c); |
314 | 0 | } Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL3CSAEPDv4_xS2_S1_S1_S1_ Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL3CSAEPDv4_xS2_S1_S1_S1_ Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL3CSAEPDv4_xS2_S1_S1_S1_ Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL3CSAEPDv4_xS2_S1_S1_S1_ Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL3CSAEPDv4_xS2_S1_S1_S1_ Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL3CSAEPDv4_xS2_S1_S1_S1_ Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL3CSAEPDv4_xS2_S1_S1_S1_ |
315 | | CROARING_UNTARGET_AVX2 |
316 | | |
317 | | CROARING_TARGET_AVX2 |
318 | | /** |
319 | | * Fast Harley-Seal AVX population count function |
320 | | */ |
321 | | inline static uint64_t avx2_harley_seal_popcount256(const __m256i *data, |
322 | 0 | const uint64_t size) { |
323 | 0 | __m256i total = _mm256_setzero_si256(); |
324 | 0 | __m256i ones = _mm256_setzero_si256(); |
325 | 0 | __m256i twos = _mm256_setzero_si256(); |
326 | 0 | __m256i fours = _mm256_setzero_si256(); |
327 | 0 | __m256i eights = _mm256_setzero_si256(); |
328 | 0 | __m256i sixteens = _mm256_setzero_si256(); |
329 | 0 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; |
330 | 0 |
|
331 | 0 | const uint64_t limit = size - size % 16; |
332 | 0 | uint64_t i = 0; |
333 | 0 |
|
334 | 0 | for (; i < limit; i += 16) { |
335 | 0 | CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i), |
336 | 0 | _mm256_lddqu_si256(data + i + 1)); |
337 | 0 | CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 2), |
338 | 0 | _mm256_lddqu_si256(data + i + 3)); |
339 | 0 | CSA(&foursA, &twos, twos, twosA, twosB); |
340 | 0 | CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 4), |
341 | 0 | _mm256_lddqu_si256(data + i + 5)); |
342 | 0 | CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 6), |
343 | 0 | _mm256_lddqu_si256(data + i + 7)); |
344 | 0 | CSA(&foursB, &twos, twos, twosA, twosB); |
345 | 0 | CSA(&eightsA, &fours, fours, foursA, foursB); |
346 | 0 | CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 8), |
347 | 0 | _mm256_lddqu_si256(data + i + 9)); |
348 | 0 | CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 10), |
349 | 0 | _mm256_lddqu_si256(data + i + 11)); |
350 | 0 | CSA(&foursA, &twos, twos, twosA, twosB); |
351 | 0 | CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 12), |
352 | 0 | _mm256_lddqu_si256(data + i + 13)); |
353 | 0 | CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 14), |
354 | 0 | _mm256_lddqu_si256(data + i + 15)); |
355 | 0 | CSA(&foursB, &twos, twos, twosA, twosB); |
356 | 0 | CSA(&eightsB, &fours, fours, foursA, foursB); |
357 | 0 | CSA(&sixteens, &eights, eights, eightsA, eightsB); |
358 | 0 |
|
359 | 0 | total = _mm256_add_epi64(total, popcount256(sixteens)); |
360 | 0 | } |
361 | 0 |
|
362 | 0 | total = _mm256_slli_epi64(total, 4); // * 16 |
363 | 0 | total = _mm256_add_epi64( |
364 | 0 | total, _mm256_slli_epi64(popcount256(eights), 3)); // += 8 * ... |
365 | 0 | total = _mm256_add_epi64( |
366 | 0 | total, _mm256_slli_epi64(popcount256(fours), 2)); // += 4 * ... |
367 | 0 | total = _mm256_add_epi64( |
368 | 0 | total, _mm256_slli_epi64(popcount256(twos), 1)); // += 2 * ... |
369 | 0 | total = _mm256_add_epi64(total, popcount256(ones)); |
370 | 0 | for (; i < size; i++) |
371 | 0 | total = |
372 | 0 | _mm256_add_epi64(total, popcount256(_mm256_lddqu_si256(data + i))); |
373 | 0 |
|
374 | 0 | return (uint64_t)(_mm256_extract_epi64(total, 0)) + |
375 | 0 | (uint64_t)(_mm256_extract_epi64(total, 1)) + |
376 | 0 | (uint64_t)(_mm256_extract_epi64(total, 2)) + |
377 | 0 | (uint64_t)(_mm256_extract_epi64(total, 3)); |
378 | 0 | } Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL28avx2_harley_seal_popcount256EPKDv4_xm Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL28avx2_harley_seal_popcount256EPKDv4_xm Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL28avx2_harley_seal_popcount256EPKDv4_xm Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL28avx2_harley_seal_popcount256EPKDv4_xm Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL28avx2_harley_seal_popcount256EPKDv4_xm Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL28avx2_harley_seal_popcount256EPKDv4_xm Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL28avx2_harley_seal_popcount256EPKDv4_xm |
379 | | CROARING_UNTARGET_AVX2 |
380 | | |
381 | | #define AVXPOPCNTFNC(opname, avx_intrinsic) \ |
382 | | static inline uint64_t avx2_harley_seal_popcount256_##opname( \ |
383 | 0 | const __m256i *data1, const __m256i *data2, const uint64_t size) { \ |
384 | 0 | __m256i total = _mm256_setzero_si256(); \ |
385 | 0 | __m256i ones = _mm256_setzero_si256(); \ |
386 | 0 | __m256i twos = _mm256_setzero_si256(); \ |
387 | 0 | __m256i fours = _mm256_setzero_si256(); \ |
388 | 0 | __m256i eights = _mm256_setzero_si256(); \ |
389 | 0 | __m256i sixteens = _mm256_setzero_si256(); \ |
390 | 0 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; \ |
391 | 0 | __m256i A1, A2; \ |
392 | 0 | const uint64_t limit = size - size % 16; \ |
393 | 0 | uint64_t i = 0; \ |
394 | 0 | for (; i < limit; i += 16) { \ |
395 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \ |
396 | 0 | _mm256_lddqu_si256(data2 + i)); \ |
397 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1), \ |
398 | 0 | _mm256_lddqu_si256(data2 + i + 1)); \ |
399 | 0 | CSA(&twosA, &ones, ones, A1, A2); \ |
400 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2), \ |
401 | 0 | _mm256_lddqu_si256(data2 + i + 2)); \ |
402 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3), \ |
403 | 0 | _mm256_lddqu_si256(data2 + i + 3)); \ |
404 | 0 | CSA(&twosB, &ones, ones, A1, A2); \ |
405 | 0 | CSA(&foursA, &twos, twos, twosA, twosB); \ |
406 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4), \ |
407 | 0 | _mm256_lddqu_si256(data2 + i + 4)); \ |
408 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5), \ |
409 | 0 | _mm256_lddqu_si256(data2 + i + 5)); \ |
410 | 0 | CSA(&twosA, &ones, ones, A1, A2); \ |
411 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6), \ |
412 | 0 | _mm256_lddqu_si256(data2 + i + 6)); \ |
413 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7), \ |
414 | 0 | _mm256_lddqu_si256(data2 + i + 7)); \ |
415 | 0 | CSA(&twosB, &ones, ones, A1, A2); \ |
416 | 0 | CSA(&foursB, &twos, twos, twosA, twosB); \ |
417 | 0 | CSA(&eightsA, &fours, fours, foursA, foursB); \ |
418 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8), \ |
419 | 0 | _mm256_lddqu_si256(data2 + i + 8)); \ |
420 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9), \ |
421 | 0 | _mm256_lddqu_si256(data2 + i + 9)); \ |
422 | 0 | CSA(&twosA, &ones, ones, A1, A2); \ |
423 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10), \ |
424 | 0 | _mm256_lddqu_si256(data2 + i + 10)); \ |
425 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11), \ |
426 | 0 | _mm256_lddqu_si256(data2 + i + 11)); \ |
427 | 0 | CSA(&twosB, &ones, ones, A1, A2); \ |
428 | 0 | CSA(&foursA, &twos, twos, twosA, twosB); \ |
429 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12), \ |
430 | 0 | _mm256_lddqu_si256(data2 + i + 12)); \ |
431 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13), \ |
432 | 0 | _mm256_lddqu_si256(data2 + i + 13)); \ |
433 | 0 | CSA(&twosA, &ones, ones, A1, A2); \ |
434 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14), \ |
435 | 0 | _mm256_lddqu_si256(data2 + i + 14)); \ |
436 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15), \ |
437 | 0 | _mm256_lddqu_si256(data2 + i + 15)); \ |
438 | 0 | CSA(&twosB, &ones, ones, A1, A2); \ |
439 | 0 | CSA(&foursB, &twos, twos, twosA, twosB); \ |
440 | 0 | CSA(&eightsB, &fours, fours, foursA, foursB); \ |
441 | 0 | CSA(&sixteens, &eights, eights, eightsA, eightsB); \ |
442 | 0 | total = _mm256_add_epi64(total, popcount256(sixteens)); \ |
443 | 0 | } \ |
444 | 0 | total = _mm256_slli_epi64(total, 4); \ |
445 | 0 | total = _mm256_add_epi64(total, \ |
446 | 0 | _mm256_slli_epi64(popcount256(eights), 3)); \ |
447 | 0 | total = \ |
448 | 0 | _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \ |
449 | 0 | total = \ |
450 | 0 | _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1)); \ |
451 | 0 | total = _mm256_add_epi64(total, popcount256(ones)); \ |
452 | 0 | for (; i < size; i++) { \ |
453 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \ |
454 | 0 | _mm256_lddqu_si256(data2 + i)); \ |
455 | 0 | total = _mm256_add_epi64(total, popcount256(A1)); \ |
456 | 0 | } \ |
457 | 0 | return (uint64_t)(_mm256_extract_epi64(total, 0)) + \ |
458 | 0 | (uint64_t)(_mm256_extract_epi64(total, 1)) + \ |
459 | 0 | (uint64_t)(_mm256_extract_epi64(total, 2)) + \ |
460 | 0 | (uint64_t)(_mm256_extract_epi64(total, 3)); \ |
461 | 0 | } \ Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL31avx2_harley_seal_popcount256_orEPKDv4_xS3_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL34avx2_harley_seal_popcount256_unionEPKDv4_xS3_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_andEPKDv4_xS3_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL41avx2_harley_seal_popcount256_intersectionEPKDv4_xS3_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_xorEPKDv4_xS3_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL35avx2_harley_seal_popcount256_andnotEPKDv4_xS3_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL31avx2_harley_seal_popcount256_orEPKDv4_xS3_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL34avx2_harley_seal_popcount256_unionEPKDv4_xS3_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_andEPKDv4_xS3_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL41avx2_harley_seal_popcount256_intersectionEPKDv4_xS3_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_xorEPKDv4_xS3_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL35avx2_harley_seal_popcount256_andnotEPKDv4_xS3_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL31avx2_harley_seal_popcount256_orEPKDv4_xS3_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL34avx2_harley_seal_popcount256_unionEPKDv4_xS3_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_andEPKDv4_xS3_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL41avx2_harley_seal_popcount256_intersectionEPKDv4_xS3_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_xorEPKDv4_xS3_m Unexecuted instantiation: 
packed_index_tree.cpp:_ZN7roaring8internalL35avx2_harley_seal_popcount256_andnotEPKDv4_xS3_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL31avx2_harley_seal_popcount256_orEPKDv4_xS3_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL34avx2_harley_seal_popcount256_unionEPKDv4_xS3_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_andEPKDv4_xS3_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL41avx2_harley_seal_popcount256_intersectionEPKDv4_xS3_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_xorEPKDv4_xS3_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL35avx2_harley_seal_popcount256_andnotEPKDv4_xS3_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL31avx2_harley_seal_popcount256_orEPKDv4_xS3_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL34avx2_harley_seal_popcount256_unionEPKDv4_xS3_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_andEPKDv4_xS3_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL41avx2_harley_seal_popcount256_intersectionEPKDv4_xS3_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_xorEPKDv4_xS3_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL35avx2_harley_seal_popcount256_andnotEPKDv4_xS3_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL31avx2_harley_seal_popcount256_orEPKDv4_xS3_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL34avx2_harley_seal_popcount256_unionEPKDv4_xS3_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_andEPKDv4_xS3_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL41avx2_harley_seal_popcount256_intersectionEPKDv4_xS3_m Unexecuted instantiation: 
docids_writer.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_xorEPKDv4_xS3_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL35avx2_harley_seal_popcount256_andnotEPKDv4_xS3_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL31avx2_harley_seal_popcount256_orEPKDv4_xS3_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL34avx2_harley_seal_popcount256_unionEPKDv4_xS3_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_andEPKDv4_xS3_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL41avx2_harley_seal_popcount256_intersectionEPKDv4_xS3_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL32avx2_harley_seal_popcount256_xorEPKDv4_xS3_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL35avx2_harley_seal_popcount256_andnotEPKDv4_xS3_m |
462 | | static inline uint64_t avx2_harley_seal_popcount256andstore_##opname( \ |
463 | | const __m256i *__restrict__ data1, const __m256i *__restrict__ data2, \ |
464 | 0 | __m256i *__restrict__ out, const uint64_t size) { \ |
465 | 0 | __m256i total = _mm256_setzero_si256(); \ |
466 | 0 | __m256i ones = _mm256_setzero_si256(); \ |
467 | 0 | __m256i twos = _mm256_setzero_si256(); \ |
468 | 0 | __m256i fours = _mm256_setzero_si256(); \ |
469 | 0 | __m256i eights = _mm256_setzero_si256(); \ |
470 | 0 | __m256i sixteens = _mm256_setzero_si256(); \ |
471 | 0 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; \ |
472 | 0 | __m256i A1, A2; \ |
473 | 0 | const uint64_t limit = size - size % 16; \ |
474 | 0 | uint64_t i = 0; \ |
475 | 0 | for (; i < limit; i += 16) { \ |
476 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \ |
477 | 0 | _mm256_lddqu_si256(data2 + i)); \ |
478 | 0 | _mm256_storeu_si256(out + i, A1); \ |
479 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1), \ |
480 | 0 | _mm256_lddqu_si256(data2 + i + 1)); \ |
481 | 0 | _mm256_storeu_si256(out + i + 1, A2); \ |
482 | 0 | CSA(&twosA, &ones, ones, A1, A2); \ |
483 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2), \ |
484 | 0 | _mm256_lddqu_si256(data2 + i + 2)); \ |
485 | 0 | _mm256_storeu_si256(out + i + 2, A1); \ |
486 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3), \ |
487 | 0 | _mm256_lddqu_si256(data2 + i + 3)); \ |
488 | 0 | _mm256_storeu_si256(out + i + 3, A2); \ |
489 | 0 | CSA(&twosB, &ones, ones, A1, A2); \ |
490 | 0 | CSA(&foursA, &twos, twos, twosA, twosB); \ |
491 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4), \ |
492 | 0 | _mm256_lddqu_si256(data2 + i + 4)); \ |
493 | 0 | _mm256_storeu_si256(out + i + 4, A1); \ |
494 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5), \ |
495 | 0 | _mm256_lddqu_si256(data2 + i + 5)); \ |
496 | 0 | _mm256_storeu_si256(out + i + 5, A2); \ |
497 | 0 | CSA(&twosA, &ones, ones, A1, A2); \ |
498 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6), \ |
499 | 0 | _mm256_lddqu_si256(data2 + i + 6)); \ |
500 | 0 | _mm256_storeu_si256(out + i + 6, A1); \ |
501 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7), \ |
502 | 0 | _mm256_lddqu_si256(data2 + i + 7)); \ |
503 | 0 | _mm256_storeu_si256(out + i + 7, A2); \ |
504 | 0 | CSA(&twosB, &ones, ones, A1, A2); \ |
505 | 0 | CSA(&foursB, &twos, twos, twosA, twosB); \ |
506 | 0 | CSA(&eightsA, &fours, fours, foursA, foursB); \ |
507 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8), \ |
508 | 0 | _mm256_lddqu_si256(data2 + i + 8)); \ |
509 | 0 | _mm256_storeu_si256(out + i + 8, A1); \ |
510 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9), \ |
511 | 0 | _mm256_lddqu_si256(data2 + i + 9)); \ |
512 | 0 | _mm256_storeu_si256(out + i + 9, A2); \ |
513 | 0 | CSA(&twosA, &ones, ones, A1, A2); \ |
514 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10), \ |
515 | 0 | _mm256_lddqu_si256(data2 + i + 10)); \ |
516 | 0 | _mm256_storeu_si256(out + i + 10, A1); \ |
517 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11), \ |
518 | 0 | _mm256_lddqu_si256(data2 + i + 11)); \ |
519 | 0 | _mm256_storeu_si256(out + i + 11, A2); \ |
520 | 0 | CSA(&twosB, &ones, ones, A1, A2); \ |
521 | 0 | CSA(&foursA, &twos, twos, twosA, twosB); \ |
522 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12), \ |
523 | 0 | _mm256_lddqu_si256(data2 + i + 12)); \ |
524 | 0 | _mm256_storeu_si256(out + i + 12, A1); \ |
525 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13), \ |
526 | 0 | _mm256_lddqu_si256(data2 + i + 13)); \ |
527 | 0 | _mm256_storeu_si256(out + i + 13, A2); \ |
528 | 0 | CSA(&twosA, &ones, ones, A1, A2); \ |
529 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14), \ |
530 | 0 | _mm256_lddqu_si256(data2 + i + 14)); \ |
531 | 0 | _mm256_storeu_si256(out + i + 14, A1); \ |
532 | 0 | A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15), \ |
533 | 0 | _mm256_lddqu_si256(data2 + i + 15)); \ |
534 | 0 | _mm256_storeu_si256(out + i + 15, A2); \ |
535 | 0 | CSA(&twosB, &ones, ones, A1, A2); \ |
536 | 0 | CSA(&foursB, &twos, twos, twosA, twosB); \ |
537 | 0 | CSA(&eightsB, &fours, fours, foursA, foursB); \ |
538 | 0 | CSA(&sixteens, &eights, eights, eightsA, eightsB); \ |
539 | 0 | total = _mm256_add_epi64(total, popcount256(sixteens)); \ |
540 | 0 | } \ |
541 | 0 | total = _mm256_slli_epi64(total, 4); \ |
542 | 0 | total = _mm256_add_epi64(total, \ |
543 | 0 | _mm256_slli_epi64(popcount256(eights), 3)); \ |
544 | 0 | total = \ |
545 | 0 | _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \ |
546 | 0 | total = \ |
547 | 0 | _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1)); \ |
548 | 0 | total = _mm256_add_epi64(total, popcount256(ones)); \ |
549 | 0 | for (; i < size; i++) { \ |
550 | 0 | A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \ |
551 | 0 | _mm256_lddqu_si256(data2 + i)); \ |
552 | 0 | _mm256_storeu_si256(out + i, A1); \ |
553 | 0 | total = _mm256_add_epi64(total, popcount256(A1)); \ |
554 | 0 | } \ |
555 | 0 | return (uint64_t)(_mm256_extract_epi64(total, 0)) + \ |
556 | 0 | (uint64_t)(_mm256_extract_epi64(total, 1)) + \ |
557 | 0 | (uint64_t)(_mm256_extract_epi64(total, 2)) + \ |
558 | 0 | (uint64_t)(_mm256_extract_epi64(total, 3)); \ |
559 | 0 | } Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL39avx2_harley_seal_popcount256andstore_orEPKDv4_xS3_PS1_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL42avx2_harley_seal_popcount256andstore_unionEPKDv4_xS3_PS1_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_andEPKDv4_xS3_PS1_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL49avx2_harley_seal_popcount256andstore_intersectionEPKDv4_xS3_PS1_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_xorEPKDv4_xS3_PS1_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL43avx2_harley_seal_popcount256andstore_andnotEPKDv4_xS3_PS1_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL39avx2_harley_seal_popcount256andstore_orEPKDv4_xS3_PS1_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL42avx2_harley_seal_popcount256andstore_unionEPKDv4_xS3_PS1_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_andEPKDv4_xS3_PS1_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL49avx2_harley_seal_popcount256andstore_intersectionEPKDv4_xS3_PS1_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_xorEPKDv4_xS3_PS1_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL43avx2_harley_seal_popcount256andstore_andnotEPKDv4_xS3_PS1_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL39avx2_harley_seal_popcount256andstore_orEPKDv4_xS3_PS1_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL42avx2_harley_seal_popcount256andstore_unionEPKDv4_xS3_PS1_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_andEPKDv4_xS3_PS1_m Unexecuted instantiation: 
packed_index_tree.cpp:_ZN7roaring8internalL49avx2_harley_seal_popcount256andstore_intersectionEPKDv4_xS3_PS1_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_xorEPKDv4_xS3_PS1_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL43avx2_harley_seal_popcount256andstore_andnotEPKDv4_xS3_PS1_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL39avx2_harley_seal_popcount256andstore_orEPKDv4_xS3_PS1_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL42avx2_harley_seal_popcount256andstore_unionEPKDv4_xS3_PS1_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_andEPKDv4_xS3_PS1_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL49avx2_harley_seal_popcount256andstore_intersectionEPKDv4_xS3_PS1_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_xorEPKDv4_xS3_PS1_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL43avx2_harley_seal_popcount256andstore_andnotEPKDv4_xS3_PS1_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL39avx2_harley_seal_popcount256andstore_orEPKDv4_xS3_PS1_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL42avx2_harley_seal_popcount256andstore_unionEPKDv4_xS3_PS1_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_andEPKDv4_xS3_PS1_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL49avx2_harley_seal_popcount256andstore_intersectionEPKDv4_xS3_PS1_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_xorEPKDv4_xS3_PS1_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL43avx2_harley_seal_popcount256andstore_andnotEPKDv4_xS3_PS1_m Unexecuted instantiation: 
docids_writer.cpp:_ZN7roaring8internalL39avx2_harley_seal_popcount256andstore_orEPKDv4_xS3_PS1_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL42avx2_harley_seal_popcount256andstore_unionEPKDv4_xS3_PS1_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_andEPKDv4_xS3_PS1_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL49avx2_harley_seal_popcount256andstore_intersectionEPKDv4_xS3_PS1_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_xorEPKDv4_xS3_PS1_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL43avx2_harley_seal_popcount256andstore_andnotEPKDv4_xS3_PS1_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL39avx2_harley_seal_popcount256andstore_orEPKDv4_xS3_PS1_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL42avx2_harley_seal_popcount256andstore_unionEPKDv4_xS3_PS1_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_andEPKDv4_xS3_PS1_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL49avx2_harley_seal_popcount256andstore_intersectionEPKDv4_xS3_PS1_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL40avx2_harley_seal_popcount256andstore_xorEPKDv4_xS3_PS1_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL43avx2_harley_seal_popcount256andstore_andnotEPKDv4_xS3_PS1_m |
560 | | |
561 | | CROARING_TARGET_AVX2 |
562 | | AVXPOPCNTFNC(or, _mm256_or_si256) |
563 | | CROARING_UNTARGET_AVX2 |
564 | | |
565 | | CROARING_TARGET_AVX2 |
566 | | AVXPOPCNTFNC(union, _mm256_or_si256) |
567 | | CROARING_UNTARGET_AVX2 |
568 | | |
569 | | CROARING_TARGET_AVX2 |
570 | | AVXPOPCNTFNC(and, _mm256_and_si256) |
571 | | CROARING_UNTARGET_AVX2 |
572 | | |
573 | | CROARING_TARGET_AVX2 |
574 | | AVXPOPCNTFNC(intersection, _mm256_and_si256) |
575 | | CROARING_UNTARGET_AVX2 |
576 | | |
577 | | CROARING_TARGET_AVX2 |
578 | | AVXPOPCNTFNC (xor, _mm256_xor_si256) |
579 | | CROARING_UNTARGET_AVX2 |
580 | | |
581 | | CROARING_TARGET_AVX2 |
582 | | AVXPOPCNTFNC(andnot, _mm256_andnot_si256) |
583 | | CROARING_UNTARGET_AVX2 |
584 | | |
585 | | |
586 | | #define VPOPCNT_AND_ADD(ptr, i, accu) \ |
587 | | const __m512i v##i = _mm512_loadu_si512((const __m512i*)ptr + i); \ |
588 | | const __m512i p##i = _mm512_popcnt_epi64(v##i); \ |
589 | | accu = _mm512_add_epi64(accu, p##i); |
590 | | |
591 | | #if CROARING_COMPILER_SUPPORTS_AVX512 |
592 | | CROARING_TARGET_AVX512 |
593 | 0 | static inline uint64_t sum_epu64_256(const __m256i v) { |
594 | 0 |
|
595 | 0 | return (uint64_t)(_mm256_extract_epi64(v, 0)) |
596 | 0 | + (uint64_t)(_mm256_extract_epi64(v, 1)) |
597 | 0 | + (uint64_t)(_mm256_extract_epi64(v, 2)) |
598 | 0 | + (uint64_t)(_mm256_extract_epi64(v, 3)); |
599 | 0 | } Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL13sum_epu64_256EDv4_x Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL13sum_epu64_256EDv4_x Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL13sum_epu64_256EDv4_x Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL13sum_epu64_256EDv4_x Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL13sum_epu64_256EDv4_x Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL13sum_epu64_256EDv4_x Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL13sum_epu64_256EDv4_x |
600 | | |
601 | | |
602 | 0 | static inline uint64_t simd_sum_epu64(const __m512i v) { |
603 | 0 |
|
604 | 0 | __m256i lo = _mm512_extracti64x4_epi64(v, 0); |
605 | 0 | __m256i hi = _mm512_extracti64x4_epi64(v, 1); |
606 | 0 |
|
607 | 0 | return sum_epu64_256(lo) + sum_epu64_256(hi); |
608 | 0 | } Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL14simd_sum_epu64EDv8_x Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL14simd_sum_epu64EDv8_x Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL14simd_sum_epu64EDv8_x Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL14simd_sum_epu64EDv8_x Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL14simd_sum_epu64EDv8_x Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL14simd_sum_epu64EDv8_x Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL14simd_sum_epu64EDv8_x |
609 | | |
610 | | static inline uint64_t avx512_vpopcount(const __m512i* data, const uint64_t size) |
611 | 0 | { |
612 | 0 | const uint64_t limit = size - size % 4; |
613 | 0 | __m512i total = _mm512_setzero_si512(); |
614 | 0 | uint64_t i = 0; |
615 | 0 |
|
616 | 0 | for (; i < limit; i += 4) |
617 | 0 | { |
618 | 0 | VPOPCNT_AND_ADD(data + i, 0, total); |
619 | 0 | VPOPCNT_AND_ADD(data + i, 1, total); |
620 | 0 | VPOPCNT_AND_ADD(data + i, 2, total); |
621 | 0 | VPOPCNT_AND_ADD(data + i, 3, total); |
622 | 0 | } |
623 | 0 | |
624 | 0 | for (; i < size; i++) |
625 | 0 | { |
626 | 0 | total = _mm512_add_epi64(total, _mm512_popcnt_epi64(_mm512_loadu_si512(data + i))); |
627 | 0 | } |
628 | 0 | |
629 | 0 | return simd_sum_epu64(total); |
630 | 0 | } Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL16avx512_vpopcountEPKDv8_xm Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL16avx512_vpopcountEPKDv8_xm Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL16avx512_vpopcountEPKDv8_xm Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL16avx512_vpopcountEPKDv8_xm Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL16avx512_vpopcountEPKDv8_xm Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL16avx512_vpopcountEPKDv8_xm Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL16avx512_vpopcountEPKDv8_xm |
631 | | CROARING_UNTARGET_AVX512 |
632 | | #endif |
633 | | |
634 | | #define AVXPOPCNTFNC512(opname, avx_intrinsic) \ |
635 | | static inline uint64_t avx512_harley_seal_popcount512_##opname( \ |
636 | 0 | const __m512i *data1, const __m512i *data2, const uint64_t size) { \ |
637 | 0 | __m512i total = _mm512_setzero_si512(); \ |
638 | 0 | const uint64_t limit = size - size % 4; \ |
639 | 0 | uint64_t i = 0; \ |
640 | 0 | for (; i < limit; i += 4) { \ |
641 | 0 | __m512i a1 = avx_intrinsic(_mm512_loadu_si512(data1 + i), \ |
642 | 0 | _mm512_loadu_si512(data2 + i)); \ |
643 | 0 | total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a1)); \ |
644 | 0 | __m512i a2 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 1), \ |
645 | 0 | _mm512_loadu_si512(data2 + i + 1)); \ |
646 | 0 | total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a2)); \ |
647 | 0 | __m512i a3 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 2), \ |
648 | 0 | _mm512_loadu_si512(data2 + i + 2)); \ |
649 | 0 | total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a3)); \ |
650 | 0 | __m512i a4 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 3), \ |
651 | 0 | _mm512_loadu_si512(data2 + i + 3)); \ |
652 | 0 | total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a4)); \ |
653 | 0 | } \ |
654 | 0 | for(; i < size; i++) { \ |
655 | 0 | __m512i a = avx_intrinsic(_mm512_loadu_si512(data1 + i), \ |
656 | 0 | _mm512_loadu_si512(data2 + i)); \ |
657 | 0 | total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a)); \ |
658 | 0 | } \ |
659 | 0 | return simd_sum_epu64(total); \ |
660 | 0 | } \ Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL33avx512_harley_seal_popcount512_orEPKDv8_xS3_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL36avx512_harley_seal_popcount512_unionEPKDv8_xS3_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_andEPKDv8_xS3_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL43avx512_harley_seal_popcount512_intersectionEPKDv8_xS3_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_xorEPKDv8_xS3_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL37avx512_harley_seal_popcount512_andnotEPKDv8_xS3_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL33avx512_harley_seal_popcount512_orEPKDv8_xS3_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL36avx512_harley_seal_popcount512_unionEPKDv8_xS3_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_andEPKDv8_xS3_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL43avx512_harley_seal_popcount512_intersectionEPKDv8_xS3_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_xorEPKDv8_xS3_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL37avx512_harley_seal_popcount512_andnotEPKDv8_xS3_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL33avx512_harley_seal_popcount512_orEPKDv8_xS3_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL36avx512_harley_seal_popcount512_unionEPKDv8_xS3_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_andEPKDv8_xS3_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL43avx512_harley_seal_popcount512_intersectionEPKDv8_xS3_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_xorEPKDv8_xS3_m Unexecuted instantiation: 
packed_index_tree.cpp:_ZN7roaring8internalL37avx512_harley_seal_popcount512_andnotEPKDv8_xS3_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL33avx512_harley_seal_popcount512_orEPKDv8_xS3_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL36avx512_harley_seal_popcount512_unionEPKDv8_xS3_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_andEPKDv8_xS3_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL43avx512_harley_seal_popcount512_intersectionEPKDv8_xS3_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_xorEPKDv8_xS3_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL37avx512_harley_seal_popcount512_andnotEPKDv8_xS3_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL33avx512_harley_seal_popcount512_orEPKDv8_xS3_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL36avx512_harley_seal_popcount512_unionEPKDv8_xS3_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_andEPKDv8_xS3_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL43avx512_harley_seal_popcount512_intersectionEPKDv8_xS3_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_xorEPKDv8_xS3_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL37avx512_harley_seal_popcount512_andnotEPKDv8_xS3_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL33avx512_harley_seal_popcount512_orEPKDv8_xS3_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL36avx512_harley_seal_popcount512_unionEPKDv8_xS3_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_andEPKDv8_xS3_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL43avx512_harley_seal_popcount512_intersectionEPKDv8_xS3_m Unexecuted instantiation: 
docids_writer.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_xorEPKDv8_xS3_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL37avx512_harley_seal_popcount512_andnotEPKDv8_xS3_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL33avx512_harley_seal_popcount512_orEPKDv8_xS3_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL36avx512_harley_seal_popcount512_unionEPKDv8_xS3_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_andEPKDv8_xS3_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL43avx512_harley_seal_popcount512_intersectionEPKDv8_xS3_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL34avx512_harley_seal_popcount512_xorEPKDv8_xS3_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL37avx512_harley_seal_popcount512_andnotEPKDv8_xS3_m |
661 | | static inline uint64_t avx512_harley_seal_popcount512andstore_##opname( \ |
662 | | const __m512i *__restrict__ data1, const __m512i *__restrict__ data2, \ |
663 | 0 | __m512i *__restrict__ out, const uint64_t size) { \ |
664 | 0 | __m512i total = _mm512_setzero_si512(); \ |
665 | 0 | const uint64_t limit = size - size % 4; \ |
666 | 0 | uint64_t i = 0; \ |
667 | 0 | for (; i < limit; i += 4) { \ |
668 | 0 | __m512i a1 = avx_intrinsic(_mm512_loadu_si512(data1 + i), \ |
669 | 0 | _mm512_loadu_si512(data2 + i)); \ |
670 | 0 | _mm512_storeu_si512(out + i, a1); \ |
671 | 0 | total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a1)); \ |
672 | 0 | __m512i a2 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 1), \ |
673 | 0 | _mm512_loadu_si512(data2 + i + 1)); \ |
674 | 0 | _mm512_storeu_si512(out + i + 1, a2); \ |
675 | 0 | total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a2)); \ |
676 | 0 | __m512i a3 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 2), \ |
677 | 0 | _mm512_loadu_si512(data2 + i + 2)); \ |
678 | 0 | _mm512_storeu_si512(out + i + 2, a3); \ |
679 | 0 | total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a3)); \ |
680 | 0 | __m512i a4 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 3), \ |
681 | 0 | _mm512_loadu_si512(data2 + i + 3)); \ |
682 | 0 | _mm512_storeu_si512(out + i + 3, a4); \ |
683 | 0 | total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a4)); \ |
684 | 0 | } \ |
685 | 0 | for(; i < size; i++) { \ |
686 | 0 | __m512i a = avx_intrinsic(_mm512_loadu_si512(data1 + i), \ |
687 | 0 | _mm512_loadu_si512(data2 + i)); \ |
688 | 0 | _mm512_storeu_si512(out + i, a); \ |
689 | 0 | total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a)); \ |
690 | 0 | } \ |
691 | 0 | return simd_sum_epu64(total); \ |
692 | 0 | } \ Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL41avx512_harley_seal_popcount512andstore_orEPKDv8_xS3_PS1_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL44avx512_harley_seal_popcount512andstore_unionEPKDv8_xS3_PS1_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_andEPKDv8_xS3_PS1_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL51avx512_harley_seal_popcount512andstore_intersectionEPKDv8_xS3_PS1_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_xorEPKDv8_xS3_PS1_m Unexecuted instantiation: bkd_writer.cpp:_ZN7roaring8internalL45avx512_harley_seal_popcount512andstore_andnotEPKDv8_xS3_PS1_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL41avx512_harley_seal_popcount512andstore_orEPKDv8_xS3_PS1_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL44avx512_harley_seal_popcount512andstore_unionEPKDv8_xS3_PS1_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_andEPKDv8_xS3_PS1_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL51avx512_harley_seal_popcount512andstore_intersectionEPKDv8_xS3_PS1_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_xorEPKDv8_xS3_PS1_m Unexecuted instantiation: bkd_reader.cpp:_ZN7roaring8internalL45avx512_harley_seal_popcount512andstore_andnotEPKDv8_xS3_PS1_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL41avx512_harley_seal_popcount512andstore_orEPKDv8_xS3_PS1_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL44avx512_harley_seal_popcount512andstore_unionEPKDv8_xS3_PS1_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_andEPKDv8_xS3_PS1_m Unexecuted instantiation: 
packed_index_tree.cpp:_ZN7roaring8internalL51avx512_harley_seal_popcount512andstore_intersectionEPKDv8_xS3_PS1_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_xorEPKDv8_xS3_PS1_m Unexecuted instantiation: packed_index_tree.cpp:_ZN7roaring8internalL45avx512_harley_seal_popcount512andstore_andnotEPKDv8_xS3_PS1_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL41avx512_harley_seal_popcount512andstore_orEPKDv8_xS3_PS1_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL44avx512_harley_seal_popcount512andstore_unionEPKDv8_xS3_PS1_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_andEPKDv8_xS3_PS1_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL51avx512_harley_seal_popcount512andstore_intersectionEPKDv8_xS3_PS1_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_xorEPKDv8_xS3_PS1_m Unexecuted instantiation: index_tree.cpp:_ZN7roaring8internalL45avx512_harley_seal_popcount512andstore_andnotEPKDv8_xS3_PS1_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL41avx512_harley_seal_popcount512andstore_orEPKDv8_xS3_PS1_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL44avx512_harley_seal_popcount512andstore_unionEPKDv8_xS3_PS1_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_andEPKDv8_xS3_PS1_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL51avx512_harley_seal_popcount512andstore_intersectionEPKDv8_xS3_PS1_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_xorEPKDv8_xS3_PS1_m Unexecuted instantiation: legacy_index_tree.cpp:_ZN7roaring8internalL45avx512_harley_seal_popcount512andstore_andnotEPKDv8_xS3_PS1_m Unexecuted instantiation: 
docids_writer.cpp:_ZN7roaring8internalL41avx512_harley_seal_popcount512andstore_orEPKDv8_xS3_PS1_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL44avx512_harley_seal_popcount512andstore_unionEPKDv8_xS3_PS1_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_andEPKDv8_xS3_PS1_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL51avx512_harley_seal_popcount512andstore_intersectionEPKDv8_xS3_PS1_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_xorEPKDv8_xS3_PS1_m Unexecuted instantiation: docids_writer.cpp:_ZN7roaring8internalL45avx512_harley_seal_popcount512andstore_andnotEPKDv8_xS3_PS1_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL41avx512_harley_seal_popcount512andstore_orEPKDv8_xS3_PS1_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL44avx512_harley_seal_popcount512andstore_unionEPKDv8_xS3_PS1_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_andEPKDv8_xS3_PS1_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL51avx512_harley_seal_popcount512andstore_intersectionEPKDv8_xS3_PS1_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL42avx512_harley_seal_popcount512andstore_xorEPKDv8_xS3_PS1_m Unexecuted instantiation: IndexWriter.cpp:_ZN7roaring8internalL45avx512_harley_seal_popcount512andstore_andnotEPKDv8_xS3_PS1_m |
693 | | |
694 | | #if CROARING_COMPILER_SUPPORTS_AVX512 |
695 | | CROARING_TARGET_AVX512 |
696 | | AVXPOPCNTFNC512(or, _mm512_or_si512) |
697 | | AVXPOPCNTFNC512(union, _mm512_or_si512) |
698 | | AVXPOPCNTFNC512(and, _mm512_and_si512) |
699 | | AVXPOPCNTFNC512(intersection, _mm512_and_si512) |
700 | | AVXPOPCNTFNC512(xor, _mm512_xor_si512) |
701 | | AVXPOPCNTFNC512(andnot, _mm512_andnot_si512) |
702 | | CROARING_UNTARGET_AVX512 |
703 | | #endif |
704 | | /*** |
705 | | * END Harley-Seal popcount functions. |
706 | | */ |
707 | | |
708 | | #endif // CROARING_IS_X64 |
709 | | |
710 | | #ifdef __cplusplus |
711 | | } } } // extern "C" { namespace roaring { namespace internal |
712 | | #endif |
713 | | |
714 | | #endif |