/root/doris/contrib/faiss/faiss/utils/partitioning.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | #include <faiss/utils/partitioning.h> |
9 | | |
10 | | #include <cassert> |
11 | | #include <cmath> |
12 | | |
13 | | #include <faiss/impl/FaissAssert.h> |
14 | | #include <faiss/utils/AlignedTable.h> |
15 | | #include <faiss/utils/ordered_key_value.h> |
16 | | #include <faiss/utils/simdlib.h> |
17 | | |
18 | | #include <faiss/impl/platform_macros.h> |
19 | | |
20 | | namespace faiss { |
21 | | |
22 | | /****************************************************************** |
23 | | * Internal routines |
24 | | ******************************************************************/ |
25 | | |
26 | | namespace partitioning { |
27 | | |
28 | | template <typename T> |
29 | 0 | T median3(T a, T b, T c) { |
30 | 0 | if (a > b) { |
31 | 0 | std::swap(a, b); |
32 | 0 | } |
33 | 0 | if (c > b) { |
34 | 0 | return b; |
35 | 0 | } |
36 | 0 | if (c > a) { |
37 | 0 | return c; |
38 | 0 | } |
39 | 0 | return a; |
40 | 0 | } Unexecuted instantiation: _ZN5faiss12partitioning7median3IfEET_S2_S2_S2_ Unexecuted instantiation: _ZN5faiss12partitioning7median3ItEET_S2_S2_S2_ |
41 | | |
42 | | template <class C> |
43 | | typename C::T sample_threshold_median3( |
44 | | const typename C::T* vals, |
45 | | int n, |
46 | | typename C::T thresh_inf, |
47 | 0 | typename C::T thresh_sup) { |
48 | 0 | using T = typename C::T; |
49 | 0 | size_t big_prime = 6700417; |
50 | 0 | T val3[3]; |
51 | 0 | int vi = 0; |
52 | |
|
53 | 0 | for (size_t i = 0; i < n; i++) { |
54 | 0 | T v = vals[(i * big_prime) % n]; |
55 | | // thresh_inf < v < thresh_sup (for CMax) |
56 | 0 | if (C::cmp(v, thresh_inf) && C::cmp(thresh_sup, v)) { |
57 | 0 | val3[vi++] = v; |
58 | 0 | if (vi == 3) { |
59 | 0 | break; |
60 | 0 | } |
61 | 0 | } |
62 | 0 | } |
63 | |
|
64 | 0 | if (vi == 3) { |
65 | 0 | return median3(val3[0], val3[1], val3[2]); |
66 | 0 | } else if (vi != 0) { |
67 | 0 | return val3[0]; |
68 | 0 | } else { |
69 | 0 | return thresh_inf; |
70 | | // FAISS_THROW_MSG("too few values to compute a median"); |
71 | 0 | } |
72 | 0 | } Unexecuted instantiation: _ZN5faiss12partitioning24sample_threshold_median3INS_4CMinIflEEEENT_1TEPKS5_iS5_S5_ Unexecuted instantiation: _ZN5faiss12partitioning24sample_threshold_median3INS_4CMaxIflEEEENT_1TEPKS5_iS5_S5_ Unexecuted instantiation: _ZN5faiss12partitioning24sample_threshold_median3INS_4CMinItlEEEENT_1TEPKS5_iS5_S5_ Unexecuted instantiation: _ZN5faiss12partitioning24sample_threshold_median3INS_4CMaxItlEEEENT_1TEPKS5_iS5_S5_ Unexecuted instantiation: _ZN5faiss12partitioning24sample_threshold_median3INS_4CMinItiEEEENT_1TEPKS5_iS5_S5_ Unexecuted instantiation: _ZN5faiss12partitioning24sample_threshold_median3INS_4CMaxItiEEEENT_1TEPKS5_iS5_S5_ |
73 | | |
74 | | template <class C> |
75 | | void count_lt_and_eq( |
76 | | const typename C::T* vals, |
77 | | size_t n, |
78 | | typename C::T thresh, |
79 | | size_t& n_lt, |
80 | 0 | size_t& n_eq) { |
81 | 0 | n_lt = n_eq = 0; |
82 | |
|
83 | 0 | for (size_t i = 0; i < n; i++) { |
84 | 0 | typename C::T v = *vals++; |
85 | 0 | if (C::cmp(thresh, v)) { |
86 | 0 | n_lt++; |
87 | 0 | } else if (v == thresh) { |
88 | 0 | n_eq++; |
89 | 0 | } |
90 | 0 | } |
91 | 0 | } Unexecuted instantiation: _ZN5faiss12partitioning15count_lt_and_eqINS_4CMinIflEEEEvPKNT_1TEmS5_RmS8_ Unexecuted instantiation: _ZN5faiss12partitioning15count_lt_and_eqINS_4CMaxIflEEEEvPKNT_1TEmS5_RmS8_ Unexecuted instantiation: _ZN5faiss12partitioning15count_lt_and_eqINS_4CMinItlEEEEvPKNT_1TEmS5_RmS8_ Unexecuted instantiation: _ZN5faiss12partitioning15count_lt_and_eqINS_4CMaxItlEEEEvPKNT_1TEmS5_RmS8_ Unexecuted instantiation: _ZN5faiss12partitioning15count_lt_and_eqINS_4CMinItiEEEEvPKNT_1TEmS5_RmS8_ Unexecuted instantiation: _ZN5faiss12partitioning15count_lt_and_eqINS_4CMaxItiEEEEvPKNT_1TEmS5_RmS8_ |
92 | | |
93 | | template <class C> |
94 | | size_t compress_array( |
95 | | typename C::T* vals, |
96 | | typename C::TI* ids, |
97 | | size_t n, |
98 | | typename C::T thresh, |
99 | 0 | size_t n_eq) { |
100 | 0 | size_t wp = 0; |
101 | 0 | for (size_t i = 0; i < n; i++) { |
102 | 0 | if (C::cmp(thresh, vals[i])) { |
103 | 0 | vals[wp] = vals[i]; |
104 | 0 | ids[wp] = ids[i]; |
105 | 0 | wp++; |
106 | 0 | } else if (n_eq > 0 && vals[i] == thresh) { |
107 | 0 | vals[wp] = vals[i]; |
108 | 0 | ids[wp] = ids[i]; |
109 | 0 | wp++; |
110 | 0 | n_eq--; |
111 | 0 | } |
112 | 0 | } |
113 | 0 | assert(n_eq == 0); |
114 | 0 | return wp; |
115 | 0 | } Unexecuted instantiation: _ZN5faiss12partitioning14compress_arrayINS_4CMinIflEEEEmPNT_1TEPNS4_2TIEmS5_m Unexecuted instantiation: _ZN5faiss12partitioning14compress_arrayINS_4CMaxIflEEEEmPNT_1TEPNS4_2TIEmS5_m Unexecuted instantiation: _ZN5faiss12partitioning14compress_arrayINS_4CMinItlEEEEmPNT_1TEPNS4_2TIEmS5_m Unexecuted instantiation: _ZN5faiss12partitioning14compress_arrayINS_4CMaxItlEEEEmPNT_1TEPNS4_2TIEmS5_m Unexecuted instantiation: _ZN5faiss12partitioning14compress_arrayINS_4CMinItiEEEEmPNT_1TEPNS4_2TIEmS5_m Unexecuted instantiation: _ZN5faiss12partitioning14compress_arrayINS_4CMaxItiEEEEmPNT_1TEPNS4_2TIEmS5_m |
116 | | |
117 | 0 | #define IFV if (false) |
118 | | |
119 | | template <class C> |
120 | | typename C::T partition_fuzzy_median3( |
121 | | typename C::T* vals, |
122 | | typename C::TI* ids, |
123 | | size_t n, |
124 | | size_t q_min, |
125 | | size_t q_max, |
126 | 0 | size_t* q_out) { |
127 | 0 | if (q_min == 0) { |
128 | 0 | if (q_out) { |
129 | 0 | *q_out = C::Crev::neutral(); |
130 | 0 | } |
131 | 0 | return 0; |
132 | 0 | } |
133 | 0 | if (q_max >= n) { |
134 | 0 | if (q_out) { |
135 | 0 | *q_out = q_max; |
136 | 0 | } |
137 | 0 | return C::neutral(); |
138 | 0 | } |
139 | | |
140 | 0 | using T = typename C::T; |
141 | | |
142 | | // here we use bissection with a median of 3 to find the threshold and |
143 | | // compress the arrays afterwards. So it's a n*log(n) algoirithm rather than |
144 | | // qselect's O(n) but it avoids shuffling around the array. |
145 | |
|
146 | 0 | FAISS_THROW_IF_NOT(n >= 3); |
147 | | |
148 | 0 | T thresh_inf = C::Crev::neutral(); |
149 | 0 | T thresh_sup = C::neutral(); |
150 | 0 | T thresh = median3(vals[0], vals[n / 2], vals[n - 1]); |
151 | |
|
152 | 0 | size_t n_eq = 0, n_lt = 0; |
153 | 0 | size_t q = 0; |
154 | |
|
155 | 0 | for (int it = 0; it < 200; it++) { |
156 | 0 | count_lt_and_eq<C>(vals, n, thresh, n_lt, n_eq); |
157 | |
|
158 | 0 | IFV printf( |
159 | 0 | " thresh=%g [%g %g] n_lt=%ld n_eq=%ld, q=%ld:%ld/%ld\n", |
160 | 0 | float(thresh), |
161 | 0 | float(thresh_inf), |
162 | 0 | float(thresh_sup), |
163 | 0 | long(n_lt), |
164 | 0 | long(n_eq), |
165 | 0 | long(q_min), |
166 | 0 | long(q_max), |
167 | 0 | long(n)); |
168 | |
|
169 | 0 | if (n_lt <= q_min) { |
170 | 0 | if (n_lt + n_eq >= q_min) { |
171 | 0 | q = q_min; |
172 | 0 | break; |
173 | 0 | } else { |
174 | 0 | thresh_inf = thresh; |
175 | 0 | } |
176 | 0 | } else if (n_lt <= q_max) { |
177 | 0 | q = n_lt; |
178 | 0 | break; |
179 | 0 | } else { |
180 | 0 | thresh_sup = thresh; |
181 | 0 | } |
182 | | |
183 | | // FIXME avoid a second pass over the array to sample the threshold |
184 | 0 | IFV printf( |
185 | 0 | " sample thresh in [%g %g]\n", |
186 | 0 | float(thresh_inf), |
187 | 0 | float(thresh_sup)); |
188 | 0 | T new_thresh = |
189 | 0 | sample_threshold_median3<C>(vals, n, thresh_inf, thresh_sup); |
190 | 0 | if (new_thresh == thresh_inf) { |
191 | | // then there is nothing between thresh_inf and thresh_sup |
192 | 0 | break; |
193 | 0 | } |
194 | 0 | thresh = new_thresh; |
195 | 0 | } |
196 | |
|
197 | 0 | int64_t n_eq_1 = q - n_lt; |
198 | |
|
199 | 0 | IFV printf("shrink: thresh=%g n_eq_1=%ld\n", float(thresh), long(n_eq_1)); |
200 | |
|
201 | 0 | if (n_eq_1 < 0) { // happens when > q elements are at lower bound |
202 | 0 | q = q_min; |
203 | 0 | thresh = C::Crev::nextafter(thresh); |
204 | 0 | n_eq_1 = q; |
205 | 0 | } else { |
206 | 0 | assert(n_eq_1 <= n_eq); |
207 | 0 | } |
208 | | |
209 | 0 | [[maybe_unused]] const int wp = |
210 | 0 | compress_array<C>(vals, ids, n, thresh, n_eq_1); |
211 | |
|
212 | 0 | assert(wp == q); |
213 | 0 | if (q_out) { |
214 | 0 | *q_out = q; |
215 | 0 | } |
216 | |
|
217 | 0 | return thresh; |
218 | 0 | } Unexecuted instantiation: _ZN5faiss12partitioning23partition_fuzzy_median3INS_4CMinIflEEEENT_1TEPS5_PNS4_2TIEmmmPm Unexecuted instantiation: _ZN5faiss12partitioning23partition_fuzzy_median3INS_4CMaxIflEEEENT_1TEPS5_PNS4_2TIEmmmPm Unexecuted instantiation: _ZN5faiss12partitioning23partition_fuzzy_median3INS_4CMinItlEEEENT_1TEPS5_PNS4_2TIEmmmPm Unexecuted instantiation: _ZN5faiss12partitioning23partition_fuzzy_median3INS_4CMaxItlEEEENT_1TEPS5_PNS4_2TIEmmmPm Unexecuted instantiation: _ZN5faiss12partitioning23partition_fuzzy_median3INS_4CMinItiEEEENT_1TEPS5_PNS4_2TIEmmmPm Unexecuted instantiation: _ZN5faiss12partitioning23partition_fuzzy_median3INS_4CMaxItiEEEENT_1TEPS5_PNS4_2TIEmmmPm |
219 | | |
220 | | } // namespace partitioning |
221 | | |
222 | | /****************************************************************** |
223 | | * SIMD routines when vals is an aligned array of uint16_t |
224 | | ******************************************************************/ |
225 | | |
226 | | namespace simd_partitioning { |
227 | | |
228 | | void find_minimax( |
229 | | const uint16_t* vals, |
230 | | size_t n, |
231 | | uint16_t& smin, |
232 | 0 | uint16_t& smax) { |
233 | 0 | simd16uint16 vmin(0xffff), vmax(0); |
234 | 0 | for (size_t i = 0; i + 15 < n; i += 16) { |
235 | 0 | simd16uint16 v(vals + i); |
236 | 0 | vmin.accu_min(v); |
237 | 0 | vmax.accu_max(v); |
238 | 0 | } |
239 | |
|
240 | 0 | ALIGNED(32) uint16_t tab32[32]; |
241 | 0 | vmin.store(tab32); |
242 | 0 | vmax.store(tab32 + 16); |
243 | |
|
244 | 0 | smin = tab32[0], smax = tab32[16]; |
245 | |
|
246 | 0 | for (int i = 1; i < 16; i++) { |
247 | 0 | smin = std::min(smin, tab32[i]); |
248 | 0 | smax = std::max(smax, tab32[i + 16]); |
249 | 0 | } |
250 | | |
251 | | // missing values |
252 | 0 | for (size_t i = (n & ~15); i < n; i++) { |
253 | 0 | smin = std::min(smin, vals[i]); |
254 | 0 | smax = std::max(smax, vals[i]); |
255 | 0 | } |
256 | 0 | } |
257 | | |
258 | | // max func differentiates between CMin and CMax (keep lowest or largest) |
259 | | template <class C> |
260 | 0 | simd16uint16 max_func(simd16uint16 v, simd16uint16 thr16) { |
261 | 0 | constexpr bool is_max = C::is_max; |
262 | 0 | if (is_max) { |
263 | 0 | return max(v, thr16); |
264 | 0 | } else { |
265 | 0 | return min(v, thr16); |
266 | 0 | } |
267 | 0 | } Unexecuted instantiation: _ZN5faiss17simd_partitioning8max_funcINS_4CMinItlEEEENS_12simd16uint16ES4_S4_ Unexecuted instantiation: _ZN5faiss17simd_partitioning8max_funcINS_4CMaxItlEEEENS_12simd16uint16ES4_S4_ Unexecuted instantiation: _ZN5faiss17simd_partitioning8max_funcINS_4CMinItiEEEENS_12simd16uint16ES4_S4_ Unexecuted instantiation: _ZN5faiss17simd_partitioning8max_funcINS_4CMaxItiEEEENS_12simd16uint16ES4_S4_ Unexecuted instantiation: _ZN5faiss17simd_partitioning8max_funcINS_4CMinIflEEEENS_12simd16uint16ES4_S4_ Unexecuted instantiation: _ZN5faiss17simd_partitioning8max_funcINS_4CMaxIflEEEENS_12simd16uint16ES4_S4_ |
268 | | |
269 | | template <class C> |
270 | | void count_lt_and_eq( |
271 | | const uint16_t* vals, |
272 | | int n, |
273 | | uint16_t thresh, |
274 | | size_t& n_lt, |
275 | 0 | size_t& n_eq) { |
276 | 0 | n_lt = n_eq = 0; |
277 | 0 | simd16uint16 thr16(thresh); |
278 | |
|
279 | 0 | size_t n1 = n / 16; |
280 | |
|
281 | 0 | for (size_t i = 0; i < n1; i++) { |
282 | 0 | simd16uint16 v(vals); |
283 | 0 | vals += 16; |
284 | 0 | simd16uint16 eqmask = (v == thr16); |
285 | 0 | simd16uint16 max2 = max_func<C>(v, thr16); |
286 | 0 | simd16uint16 gemask = (v == max2); |
287 | 0 | uint32_t bits = get_MSBs(uint16_to_uint8_saturate(eqmask, gemask)); |
288 | 0 | int i_eq = __builtin_popcount(bits & 0x00ff00ff); |
289 | 0 | int i_ge = __builtin_popcount(bits) - i_eq; |
290 | 0 | n_eq += i_eq; |
291 | 0 | n_lt += 16 - i_ge; |
292 | 0 | } |
293 | |
|
294 | 0 | for (size_t i = n1 * 16; i < n; i++) { |
295 | 0 | uint16_t v = *vals++; |
296 | 0 | if (C::cmp(thresh, v)) { |
297 | 0 | n_lt++; |
298 | 0 | } else if (v == thresh) { |
299 | 0 | n_eq++; |
300 | 0 | } |
301 | 0 | } |
302 | 0 | } Unexecuted instantiation: _ZN5faiss17simd_partitioning15count_lt_and_eqINS_4CMinItlEEEEvPKtitRmS6_ Unexecuted instantiation: _ZN5faiss17simd_partitioning15count_lt_and_eqINS_4CMaxItlEEEEvPKtitRmS6_ Unexecuted instantiation: _ZN5faiss17simd_partitioning15count_lt_and_eqINS_4CMinItiEEEEvPKtitRmS6_ Unexecuted instantiation: _ZN5faiss17simd_partitioning15count_lt_and_eqINS_4CMaxItiEEEEvPKtitRmS6_ Unexecuted instantiation: _ZN5faiss17simd_partitioning15count_lt_and_eqINS_4CMinIflEEEEvPKtitRmS6_ Unexecuted instantiation: _ZN5faiss17simd_partitioning15count_lt_and_eqINS_4CMaxIflEEEEvPKtitRmS6_ |
303 | | |
304 | | /* compress separated values and ids table, keeping all values < thresh and at |
305 | | * most n_eq equal values */ |
306 | | template <class C> |
307 | | int simd_compress_array( |
308 | | uint16_t* vals, |
309 | | typename C::TI* ids, |
310 | | size_t n, |
311 | | uint16_t thresh, |
312 | 0 | int n_eq) { |
313 | 0 | simd16uint16 thr16(thresh); |
314 | 0 | simd16uint16 mixmask(0xff00); |
315 | |
|
316 | 0 | int wp = 0; |
317 | 0 | size_t i0; |
318 | | |
319 | | // loop while there are eqs to collect |
320 | 0 | for (i0 = 0; i0 + 15 < n && n_eq > 0; i0 += 16) { |
321 | 0 | simd16uint16 v(vals + i0); |
322 | 0 | simd16uint16 max2 = max_func<C>(v, thr16); |
323 | 0 | simd16uint16 gemask = (v == max2); |
324 | 0 | simd16uint16 eqmask = (v == thr16); |
325 | 0 | uint32_t bits = get_MSBs( |
326 | 0 | blendv(simd32uint8(eqmask), |
327 | 0 | simd32uint8(gemask), |
328 | 0 | simd32uint8(mixmask))); |
329 | 0 | bits ^= 0xAAAAAAAA; |
330 | | // bit 2*i : eq |
331 | | // bit 2*i + 1 : lt |
332 | |
|
333 | 0 | while (bits) { |
334 | 0 | int j = __builtin_ctz(bits) & (~1); |
335 | 0 | bool is_eq = (bits >> j) & 1; |
336 | 0 | bool is_lt = (bits >> j) & 2; |
337 | 0 | bits &= ~(3 << j); |
338 | 0 | j >>= 1; |
339 | |
|
340 | 0 | if (is_lt) { |
341 | 0 | vals[wp] = vals[i0 + j]; |
342 | 0 | ids[wp] = ids[i0 + j]; |
343 | 0 | wp++; |
344 | 0 | } else if (is_eq && n_eq > 0) { |
345 | 0 | vals[wp] = vals[i0 + j]; |
346 | 0 | ids[wp] = ids[i0 + j]; |
347 | 0 | wp++; |
348 | 0 | n_eq--; |
349 | 0 | } |
350 | 0 | } |
351 | 0 | } |
352 | | |
353 | | // handle remaining, only striclty lt ones. |
354 | 0 | for (; i0 + 15 < n; i0 += 16) { |
355 | 0 | simd16uint16 v(vals + i0); |
356 | 0 | simd16uint16 max2 = max_func<C>(v, thr16); |
357 | 0 | simd16uint16 gemask = (v == max2); |
358 | 0 | uint32_t bits = ~get_MSBs(simd32uint8(gemask)); |
359 | |
|
360 | 0 | while (bits) { |
361 | 0 | int j = __builtin_ctz(bits); |
362 | 0 | bits &= ~(3 << j); |
363 | 0 | j >>= 1; |
364 | |
|
365 | 0 | vals[wp] = vals[i0 + j]; |
366 | 0 | ids[wp] = ids[i0 + j]; |
367 | 0 | wp++; |
368 | 0 | } |
369 | 0 | } |
370 | | |
371 | | // end with scalar |
372 | 0 | for (int i = (n & ~15); i < n; i++) { |
373 | 0 | if (C::cmp(thresh, vals[i])) { |
374 | 0 | vals[wp] = vals[i]; |
375 | 0 | ids[wp] = ids[i]; |
376 | 0 | wp++; |
377 | 0 | } else if (vals[i] == thresh && n_eq > 0) { |
378 | 0 | vals[wp] = vals[i]; |
379 | 0 | ids[wp] = ids[i]; |
380 | 0 | wp++; |
381 | 0 | n_eq--; |
382 | 0 | } |
383 | 0 | } |
384 | 0 | assert(n_eq == 0); |
385 | 0 | return wp; |
386 | 0 | } Unexecuted instantiation: _ZN5faiss17simd_partitioning19simd_compress_arrayINS_4CMinItlEEEEiPtPNT_2TIEmti Unexecuted instantiation: _ZN5faiss17simd_partitioning19simd_compress_arrayINS_4CMaxItlEEEEiPtPNT_2TIEmti Unexecuted instantiation: _ZN5faiss17simd_partitioning19simd_compress_arrayINS_4CMinItiEEEEiPtPNT_2TIEmti Unexecuted instantiation: _ZN5faiss17simd_partitioning19simd_compress_arrayINS_4CMaxItiEEEEiPtPNT_2TIEmti Unexecuted instantiation: _ZN5faiss17simd_partitioning19simd_compress_arrayINS_4CMinIflEEEEiPtPNT_2TIEmti Unexecuted instantiation: _ZN5faiss17simd_partitioning19simd_compress_arrayINS_4CMaxIflEEEEiPtPNT_2TIEmti |
387 | | |
388 | | // #define MICRO_BENCHMARK |
389 | | |
390 | 0 | static uint64_t get_cy() { |
391 | | #ifdef MICRO_BENCHMARK |
392 | | uint32_t high, low; |
393 | | asm volatile("rdtsc \n\t" : "=a"(low), "=d"(high)); |
394 | | return ((uint64_t)high << 32) | (low); |
395 | | #else |
396 | 0 | return 0; |
397 | 0 | #endif |
398 | 0 | } |
399 | | |
400 | 0 | #define IFV if (false) |
401 | | |
402 | | template <class C> |
403 | | uint16_t simd_partition_fuzzy_with_bounds( |
404 | | uint16_t* vals, |
405 | | typename C::TI* ids, |
406 | | size_t n, |
407 | | size_t q_min, |
408 | | size_t q_max, |
409 | | size_t* q_out, |
410 | | uint16_t s0i, |
411 | 0 | uint16_t s1i) { |
412 | 0 | if (q_min == 0) { |
413 | 0 | if (q_out) { |
414 | 0 | *q_out = 0; |
415 | 0 | } |
416 | 0 | return 0; |
417 | 0 | } |
418 | 0 | if (q_max >= n) { |
419 | 0 | if (q_out) { |
420 | 0 | *q_out = q_max; |
421 | 0 | } |
422 | 0 | return 0xffff; |
423 | 0 | } |
424 | 0 | if (s0i == s1i) { |
425 | 0 | if (q_out) { |
426 | 0 | *q_out = q_min; |
427 | 0 | } |
428 | 0 | return s0i; |
429 | 0 | } |
430 | 0 | uint64_t t0 = get_cy(); |
431 | | |
432 | | // lower bound inclusive, upper exclusive |
433 | 0 | size_t s0 = s0i, s1 = s1i + 1; |
434 | |
|
435 | 0 | IFV printf("bounds: %ld %ld\n", s0, s1 - 1); |
436 | |
|
437 | 0 | int thresh; |
438 | 0 | size_t n_eq = 0, n_lt = 0; |
439 | 0 | size_t q = 0; |
440 | |
|
441 | 0 | for (int it = 0; it < 200; it++) { |
442 | | // while(s0 + 1 < s1) { |
443 | 0 | thresh = (s0 + s1) / 2; |
444 | 0 | count_lt_and_eq<C>(vals, n, thresh, n_lt, n_eq); |
445 | |
|
446 | 0 | IFV printf( |
447 | 0 | " [%ld %ld] thresh=%d n_lt=%ld n_eq=%ld, q=%ld:%ld/%ld\n", |
448 | 0 | s0, |
449 | 0 | s1, |
450 | 0 | thresh, |
451 | 0 | n_lt, |
452 | 0 | n_eq, |
453 | 0 | q_min, |
454 | 0 | q_max, |
455 | 0 | n); |
456 | 0 | if (n_lt <= q_min) { |
457 | 0 | if (n_lt + n_eq >= q_min) { |
458 | 0 | q = q_min; |
459 | 0 | break; |
460 | 0 | } else { |
461 | 0 | if (C::is_max) { |
462 | 0 | s0 = thresh; |
463 | 0 | } else { |
464 | 0 | s1 = thresh; |
465 | 0 | } |
466 | 0 | } |
467 | 0 | } else if (n_lt <= q_max) { |
468 | 0 | q = n_lt; |
469 | 0 | break; |
470 | 0 | } else { |
471 | 0 | if (C::is_max) { |
472 | 0 | s1 = thresh; |
473 | 0 | } else { |
474 | 0 | s0 = thresh; |
475 | 0 | } |
476 | 0 | } |
477 | 0 | } |
478 | |
|
479 | 0 | uint64_t t1 = get_cy(); |
480 | | |
481 | | // number of equal values to keep |
482 | 0 | int64_t n_eq_1 = q - n_lt; |
483 | |
|
484 | 0 | IFV printf("shrink: thresh=%d q=%ld n_eq_1=%ld\n", thresh, q, n_eq_1); |
485 | 0 | if (n_eq_1 < 0) { // happens when > q elements are at lower bound |
486 | 0 | assert(s0 + 1 == s1); |
487 | 0 | q = q_min; |
488 | 0 | if (C::is_max) { |
489 | 0 | thresh--; |
490 | 0 | } else { |
491 | 0 | thresh++; |
492 | 0 | } |
493 | 0 | n_eq_1 = q; |
494 | 0 | IFV printf(" override: thresh=%d n_eq_1=%ld\n", thresh, n_eq_1); |
495 | 0 | } else { |
496 | 0 | assert(n_eq_1 <= n_eq); |
497 | 0 | } |
498 | | |
499 | 0 | size_t wp = simd_compress_array<C>(vals, ids, n, thresh, n_eq_1); |
500 | |
|
501 | 0 | IFV printf("wp=%ld\n", wp); |
502 | 0 | assert(wp == q); |
503 | 0 | if (q_out) { |
504 | 0 | *q_out = q; |
505 | 0 | } |
506 | |
|
507 | 0 | uint64_t t2 = get_cy(); |
508 | |
|
509 | 0 | partition_stats.bissect_cycles += t1 - t0; |
510 | 0 | partition_stats.compress_cycles += t2 - t1; |
511 | |
|
512 | 0 | return thresh; |
513 | 0 | } Unexecuted instantiation: _ZN5faiss17simd_partitioning32simd_partition_fuzzy_with_boundsINS_4CMinItlEEEEtPtPNT_2TIEmmmPmtt Unexecuted instantiation: _ZN5faiss17simd_partitioning32simd_partition_fuzzy_with_boundsINS_4CMaxItlEEEEtPtPNT_2TIEmmmPmtt Unexecuted instantiation: _ZN5faiss17simd_partitioning32simd_partition_fuzzy_with_boundsINS_4CMinItiEEEEtPtPNT_2TIEmmmPmtt Unexecuted instantiation: _ZN5faiss17simd_partitioning32simd_partition_fuzzy_with_boundsINS_4CMaxItiEEEEtPtPNT_2TIEmmmPmtt Unexecuted instantiation: _ZN5faiss17simd_partitioning32simd_partition_fuzzy_with_boundsINS_4CMinIflEEEEtPtPNT_2TIEmmmPmtt Unexecuted instantiation: _ZN5faiss17simd_partitioning32simd_partition_fuzzy_with_boundsINS_4CMaxIflEEEEtPtPNT_2TIEmmmPmtt |
514 | | |
515 | | template <class C> |
516 | | uint16_t simd_partition_fuzzy_with_bounds_histogram( |
517 | | uint16_t* vals, |
518 | | typename C::TI* ids, |
519 | | size_t n, |
520 | | size_t q_min, |
521 | | size_t q_max, |
522 | | size_t* q_out, |
523 | | uint16_t s0i, |
524 | | uint16_t s1i) { |
525 | | if (q_min == 0) { |
526 | | if (q_out) { |
527 | | *q_out = 0; |
528 | | } |
529 | | return 0; |
530 | | } |
531 | | if (q_max >= n) { |
532 | | if (q_out) { |
533 | | *q_out = q_max; |
534 | | } |
535 | | return 0xffff; |
536 | | } |
537 | | if (s0i == s1i) { |
538 | | if (q_out) { |
539 | | *q_out = q_min; |
540 | | } |
541 | | return s0i; |
542 | | } |
543 | | |
544 | | IFV printf( |
545 | | "partition fuzzy, q=%ld:%ld / %ld, bounds=%d %d\n", |
546 | | q_min, |
547 | | q_max, |
548 | | n, |
549 | | s0i, |
550 | | s1i); |
551 | | |
552 | | if (!C::is_max) { |
553 | | IFV printf( |
554 | | "revert due to CMin, q_min:q_max -> %ld:%ld\n", q_min, q_max); |
555 | | q_min = n - q_min; |
556 | | q_max = n - q_max; |
557 | | } |
558 | | |
559 | | // lower and upper bound of range, inclusive |
560 | | int s0 = s0i, s1 = s1i; |
561 | | // number of values < s0 and > s1 |
562 | | size_t n_lt = 0, n_gt = 0; |
563 | | |
564 | | // output of loop: |
565 | | int thresh; // final threshold |
566 | | uint64_t tot_eq = 0; // total nb of equal values |
567 | | uint64_t n_eq = 0; // nb of equal values to keep |
568 | | size_t q; // final quantile |
569 | | |
570 | | // buffer for the histograms |
571 | | int hist[16]; |
572 | | |
573 | | for (int it = 0; it < 20; it++) { |
574 | | // otherwise we would be done already |
575 | | |
576 | | int shift = 0; |
577 | | |
578 | | IFV printf( |
579 | | " it %d bounds: %d %d n_lt=%ld n_gt=%ld\n", |
580 | | it, |
581 | | s0, |
582 | | s1, |
583 | | n_lt, |
584 | | n_gt); |
585 | | |
586 | | int maxval = s1 - s0; |
587 | | |
588 | | while (maxval > 15) { |
589 | | shift++; |
590 | | maxval >>= 1; |
591 | | } |
592 | | |
593 | | IFV printf( |
594 | | " histogram shift %d maxval %d ?= %d\n", |
595 | | shift, |
596 | | maxval, |
597 | | int((s1 - s0) >> shift)); |
598 | | |
599 | | if (maxval > 7) { |
600 | | simd_histogram_16(vals, n, s0, shift, hist); |
601 | | } else { |
602 | | simd_histogram_8(vals, n, s0, shift, hist); |
603 | | } |
604 | | IFV { |
605 | | int sum = n_lt + n_gt; |
606 | | printf(" n_lt=%ld hist=[", n_lt); |
607 | | for (int i = 0; i <= maxval; i++) { |
608 | | printf("%d ", hist[i]); |
609 | | sum += hist[i]; |
610 | | } |
611 | | printf("] n_gt=%ld sum=%d\n", n_gt, sum); |
612 | | assert(sum == n); |
613 | | } |
614 | | |
615 | | size_t sum_below = n_lt; |
616 | | int i; |
617 | | for (i = 0; i <= maxval; i++) { |
618 | | sum_below += hist[i]; |
619 | | if (sum_below >= q_min) { |
620 | | break; |
621 | | } |
622 | | } |
623 | | IFV printf(" i=%d sum_below=%ld\n", i, sum_below); |
624 | | if (i <= maxval) { |
625 | | s0 = s0 + (i << shift); |
626 | | s1 = s0 + (1 << shift) - 1; |
627 | | n_lt = sum_below - hist[i]; |
628 | | n_gt = n - sum_below; |
629 | | } else { |
630 | | assert(!"not implemented"); |
631 | | } |
632 | | |
633 | | IFV printf( |
634 | | " new bin: s0=%d s1=%d n_lt=%ld n_gt=%ld\n", |
635 | | s0, |
636 | | s1, |
637 | | n_lt, |
638 | | n_gt); |
639 | | |
640 | | if (s1 > s0) { |
641 | | if (n_lt >= q_min && q_max >= n_lt) { |
642 | | IFV printf(" FOUND1\n"); |
643 | | thresh = s0; |
644 | | q = n_lt; |
645 | | break; |
646 | | } |
647 | | |
648 | | size_t n_lt_2 = n - n_gt; |
649 | | if (n_lt_2 >= q_min && q_max >= n_lt_2) { |
650 | | thresh = s1 + 1; |
651 | | q = n_lt_2; |
652 | | IFV printf(" FOUND2\n"); |
653 | | break; |
654 | | } |
655 | | } else { |
656 | | thresh = s0; |
657 | | q = q_min; |
658 | | tot_eq = n - n_gt - n_lt; |
659 | | n_eq = q_min - n_lt; |
660 | | IFV printf(" FOUND3\n"); |
661 | | break; |
662 | | } |
663 | | } |
664 | | |
665 | | IFV printf("end bissection: thresh=%d q=%ld n_eq=%ld\n", thresh, q, n_eq); |
666 | | |
667 | | if (!C::is_max) { |
668 | | if (n_eq == 0) { |
669 | | thresh--; |
670 | | } else { |
671 | | // thresh unchanged |
672 | | n_eq = tot_eq - n_eq; |
673 | | } |
674 | | q = n - q; |
675 | | IFV printf("revert due to CMin, q->%ld n_eq->%ld\n", q, n_eq); |
676 | | } |
677 | | |
678 | | size_t wp = simd_compress_array<C>(vals, ids, n, thresh, n_eq); |
679 | | IFV printf("wp=%ld ?= %ld\n", wp, q); |
680 | | assert(wp == q); |
681 | | if (q_out) { |
682 | | *q_out = wp; |
683 | | } |
684 | | |
685 | | return thresh; |
686 | | } |
687 | | |
688 | | template <class C> |
689 | | uint16_t simd_partition_fuzzy( |
690 | | uint16_t* vals, |
691 | | typename C::TI* ids, |
692 | | size_t n, |
693 | | size_t q_min, |
694 | | size_t q_max, |
695 | 0 | size_t* q_out) { |
696 | 0 | assert(is_aligned_pointer(vals)); |
697 | | |
698 | 0 | uint16_t s0i, s1i; |
699 | 0 | find_minimax(vals, n, s0i, s1i); |
700 | | // QSelect_stats.t0 += get_cy() - t0; |
701 | |
|
702 | 0 | return simd_partition_fuzzy_with_bounds<C>( |
703 | 0 | vals, ids, n, q_min, q_max, q_out, s0i, s1i); |
704 | 0 | } Unexecuted instantiation: _ZN5faiss17simd_partitioning20simd_partition_fuzzyINS_4CMinItlEEEEtPtPNT_2TIEmmmPm Unexecuted instantiation: _ZN5faiss17simd_partitioning20simd_partition_fuzzyINS_4CMaxItlEEEEtPtPNT_2TIEmmmPm Unexecuted instantiation: _ZN5faiss17simd_partitioning20simd_partition_fuzzyINS_4CMinItiEEEEtPtPNT_2TIEmmmPm Unexecuted instantiation: _ZN5faiss17simd_partitioning20simd_partition_fuzzyINS_4CMaxItiEEEEtPtPNT_2TIEmmmPm Unexecuted instantiation: _ZN5faiss17simd_partitioning20simd_partition_fuzzyINS_4CMinIflEEEEtPtPNT_2TIEmmmPm Unexecuted instantiation: _ZN5faiss17simd_partitioning20simd_partition_fuzzyINS_4CMaxIflEEEEtPtPNT_2TIEmmmPm |
705 | | |
706 | | template <class C> |
707 | | uint16_t simd_partition( |
708 | | uint16_t* vals, |
709 | | typename C::TI* ids, |
710 | | size_t n, |
711 | | size_t q) { |
712 | | assert(is_aligned_pointer(vals)); |
713 | | |
714 | | if (q == 0) { |
715 | | return 0; |
716 | | } |
717 | | if (q >= n) { |
718 | | return 0xffff; |
719 | | } |
720 | | |
721 | | uint16_t s0i, s1i; |
722 | | find_minimax(vals, n, s0i, s1i); |
723 | | |
724 | | return simd_partition_fuzzy_with_bounds<C>( |
725 | | vals, ids, n, q, q, nullptr, s0i, s1i); |
726 | | } |
727 | | |
728 | | template <class C> |
729 | | uint16_t simd_partition_with_bounds( |
730 | | uint16_t* vals, |
731 | | typename C::TI* ids, |
732 | | size_t n, |
733 | | size_t q, |
734 | | uint16_t s0i, |
735 | | uint16_t s1i) { |
736 | | return simd_partition_fuzzy_with_bounds<C>( |
737 | | vals, ids, n, q, q, nullptr, s0i, s1i); |
738 | | } |
739 | | |
740 | | } // namespace simd_partitioning |
741 | | |
742 | | /****************************************************************** |
743 | | * Driver routine |
744 | | ******************************************************************/ |
745 | | |
746 | | template <class C> |
747 | | typename C::T partition_fuzzy( |
748 | | typename C::T* vals, |
749 | | typename C::TI* ids, |
750 | | size_t n, |
751 | | size_t q_min, |
752 | | size_t q_max, |
753 | 0 | size_t* q_out) { |
754 | 0 | #ifdef __AVX2__ |
755 | 0 | constexpr bool is_uint16 = std::is_same<typename C::T, uint16_t>::value; |
756 | 0 | if (is_uint16 && is_aligned_pointer(vals)) { |
757 | 0 | return simd_partitioning::simd_partition_fuzzy<C>( |
758 | 0 | (uint16_t*)vals, ids, n, q_min, q_max, q_out); |
759 | 0 | } |
760 | 0 | #endif |
761 | 0 | return partitioning::partition_fuzzy_median3<C>( |
762 | 0 | vals, ids, n, q_min, q_max, q_out); |
763 | 0 | } Unexecuted instantiation: _ZN5faiss15partition_fuzzyINS_4CMinIflEEEENT_1TEPS4_PNS3_2TIEmmmPm Unexecuted instantiation: _ZN5faiss15partition_fuzzyINS_4CMaxIflEEEENT_1TEPS4_PNS3_2TIEmmmPm Unexecuted instantiation: _ZN5faiss15partition_fuzzyINS_4CMinItlEEEENT_1TEPS4_PNS3_2TIEmmmPm Unexecuted instantiation: _ZN5faiss15partition_fuzzyINS_4CMaxItlEEEENT_1TEPS4_PNS3_2TIEmmmPm Unexecuted instantiation: _ZN5faiss15partition_fuzzyINS_4CMinItiEEEENT_1TEPS4_PNS3_2TIEmmmPm Unexecuted instantiation: _ZN5faiss15partition_fuzzyINS_4CMaxItiEEEENT_1TEPS4_PNS3_2TIEmmmPm |
764 | | |
765 | | // explicit template instanciations |
766 | | |
767 | | template float partition_fuzzy<CMin<float, int64_t>>( |
768 | | float* vals, |
769 | | int64_t* ids, |
770 | | size_t n, |
771 | | size_t q_min, |
772 | | size_t q_max, |
773 | | size_t* q_out); |
774 | | |
775 | | template float partition_fuzzy<CMax<float, int64_t>>( |
776 | | float* vals, |
777 | | int64_t* ids, |
778 | | size_t n, |
779 | | size_t q_min, |
780 | | size_t q_max, |
781 | | size_t* q_out); |
782 | | |
783 | | template uint16_t partition_fuzzy<CMin<uint16_t, int64_t>>( |
784 | | uint16_t* vals, |
785 | | int64_t* ids, |
786 | | size_t n, |
787 | | size_t q_min, |
788 | | size_t q_max, |
789 | | size_t* q_out); |
790 | | |
791 | | template uint16_t partition_fuzzy<CMax<uint16_t, int64_t>>( |
792 | | uint16_t* vals, |
793 | | int64_t* ids, |
794 | | size_t n, |
795 | | size_t q_min, |
796 | | size_t q_max, |
797 | | size_t* q_out); |
798 | | |
799 | | template uint16_t partition_fuzzy<CMin<uint16_t, int>>( |
800 | | uint16_t* vals, |
801 | | int* ids, |
802 | | size_t n, |
803 | | size_t q_min, |
804 | | size_t q_max, |
805 | | size_t* q_out); |
806 | | |
807 | | template uint16_t partition_fuzzy<CMax<uint16_t, int>>( |
808 | | uint16_t* vals, |
809 | | int* ids, |
810 | | size_t n, |
811 | | size_t q_min, |
812 | | size_t q_max, |
813 | | size_t* q_out); |
814 | | |
815 | | /****************************************************************** |
816 | | * Histogram subroutines |
817 | | ******************************************************************/ |
818 | | |
819 | | #if defined(__AVX2__) || defined(__aarch64__) |
820 | | /// FIXME when MSB of uint16 is set |
821 | | // this code does not compile properly with GCC 7.4.0 |
822 | | |
823 | | namespace { |
824 | | |
825 | | /************************************************************ |
826 | | * 8 bins |
827 | | ************************************************************/ |
828 | | |
829 | 0 | simd32uint8 accu4to8(simd16uint16 a4) { |
830 | 0 | simd16uint16 mask4(0x0f0f); |
831 | |
|
832 | 0 | simd16uint16 a8_0 = a4 & mask4; |
833 | 0 | simd16uint16 a8_1 = (a4 >> 4) & mask4; |
834 | |
|
835 | 0 | return simd32uint8(hadd(a8_0, a8_1)); |
836 | 0 | } |
837 | | |
838 | 0 | simd16uint16 accu8to16(simd32uint8 a8) { |
839 | 0 | simd16uint16 mask8(0x00ff); |
840 | |
|
841 | 0 | simd16uint16 a8_0 = simd16uint16(a8) & mask8; |
842 | 0 | simd16uint16 a8_1 = (simd16uint16(a8) >> 8) & mask8; |
843 | |
|
844 | 0 | return hadd(a8_0, a8_1); |
845 | 0 | } |
846 | | |
847 | | static const simd32uint8 shifts = simd32uint8::create< |
848 | | 1, |
849 | | 16, |
850 | | 0, |
851 | | 0, |
852 | | 4, |
853 | | 64, |
854 | | 0, |
855 | | 0, |
856 | | 0, |
857 | | 0, |
858 | | 1, |
859 | | 16, |
860 | | 0, |
861 | | 0, |
862 | | 4, |
863 | | 64, |
864 | | 1, |
865 | | 16, |
866 | | 0, |
867 | | 0, |
868 | | 4, |
869 | | 64, |
870 | | 0, |
871 | | 0, |
872 | | 0, |
873 | | 0, |
874 | | 1, |
875 | | 16, |
876 | | 0, |
877 | | 0, |
878 | | 4, |
879 | | 64>(); |
880 | | |
881 | | // 2-bit accumulator: we can add only up to 3 elements |
882 | | // on output we return 2*4-bit results |
883 | | // preproc returns either an index in 0..7 or 0xffff |
884 | | // that yields a 0 when used in the table look-up |
885 | | template <int N, class Preproc> |
886 | | void compute_accu2( |
887 | | const uint16_t*& data, |
888 | | Preproc& pp, |
889 | | simd16uint16& a4lo, |
890 | 0 | simd16uint16& a4hi) { |
891 | 0 | simd16uint16 mask2(0x3333); |
892 | 0 | simd16uint16 a2((uint16_t)0); // 2-bit accu |
893 | 0 | for (int j = 0; j < N; j++) { |
894 | 0 | simd16uint16 v(data); |
895 | 0 | data += 16; |
896 | 0 | v = pp(v); |
897 | | // 0x800 -> force second half of table |
898 | 0 | simd16uint16 idx = v | (v << 8) | simd16uint16(0x800); |
899 | 0 | a2 += simd16uint16(shifts.lookup_2_lanes(simd32uint8(idx))); |
900 | 0 | } |
901 | 0 | a4lo += a2 & mask2; |
902 | 0 | a4hi += (a2 >> 2) & mask2; |
903 | 0 | } Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_10PreprocNOPEEEvRPKtRT0_RNS_12simd16uint16ES9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_10PreprocNOPEEEvRPKtRT0_RNS_12simd16uint16ES9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_10PreprocNOPEEEvRPKtRT0_RNS_12simd16uint16ES9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi0ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi0ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi0ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi1ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi1ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi1ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi2ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi2ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi2ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi3ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi3ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi3ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi4ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi4ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi4ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi5ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi5ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi5ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi6ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi6ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi6ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi7ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi7ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi7ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi8ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi8ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi8ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi9ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi9ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi9ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi10ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi10ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi10ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi11ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi11ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi11ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi12ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi12ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi12ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi13ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi13ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi13ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_ |
904 | | |
905 | | template <class Preproc> |
906 | 0 | simd16uint16 histogram_8(const uint16_t* data, Preproc pp, size_t n_in) { |
907 | 0 | assert(n_in % 16 == 0); |
908 | 0 | int n = n_in / 16; |
909 | |
|
910 | 0 | simd32uint8 a8lo(0); |
911 | 0 | simd32uint8 a8hi(0); |
912 | |
|
913 | 0 | for (int i0 = 0; i0 < n; i0 += 15) { |
914 | 0 | simd16uint16 a4lo(0); // 4-bit accus |
915 | 0 | simd16uint16 a4hi(0); |
916 | |
|
917 | 0 | int i1 = std::min(i0 + 15, n); |
918 | 0 | int i; |
919 | 0 | for (i = i0; i + 2 < i1; i += 3) { |
920 | 0 | compute_accu2<3>(data, pp, a4lo, a4hi); // adds 3 max |
921 | 0 | } |
922 | 0 | switch (i1 - i) { |
923 | 0 | case 2: |
924 | 0 | compute_accu2<2>(data, pp, a4lo, a4hi); |
925 | 0 | break; |
926 | 0 | case 1: |
927 | 0 | compute_accu2<1>(data, pp, a4lo, a4hi); |
928 | 0 | break; |
929 | 0 | } |
930 | | |
931 | 0 | a8lo += accu4to8(a4lo); |
932 | 0 | a8hi += accu4to8(a4hi); |
933 | 0 | } |
934 | | |
935 | | // move to 16-bit accu |
936 | 0 | simd16uint16 a16lo = accu8to16(a8lo); |
937 | 0 | simd16uint16 a16hi = accu8to16(a8hi); |
938 | |
|
939 | 0 | simd16uint16 a16 = hadd(a16lo, a16hi); |
940 | | |
941 | | // the 2 lanes must still be combined |
942 | 0 | return a16; |
943 | 0 | } Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_10PreprocNOPEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi0ELi8EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi1ELi8EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi2ELi8EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi3ELi8EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi4ELi8EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi5ELi8EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi6ELi8EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi7ELi8EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi8ELi8EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi9ELi8EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi10ELi8EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi11ELi8EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi12ELi8EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi13ELi8EEEEENS_12simd16uint16EPKtT_m |
944 | | |
945 | | /************************************************************ |
946 | | * 16 bins |
947 | | ************************************************************/ |
948 | | |
949 | | static const simd32uint8 shifts2 = simd32uint8::create< |
950 | | 1, |
951 | | 2, |
952 | | 4, |
953 | | 8, |
954 | | 16, |
955 | | 32, |
956 | | 64, |
957 | | 128, |
958 | | 1, |
959 | | 2, |
960 | | 4, |
961 | | 8, |
962 | | 16, |
963 | | 32, |
964 | | 64, |
965 | | 128, |
966 | | 1, |
967 | | 2, |
968 | | 4, |
969 | | 8, |
970 | | 16, |
971 | | 32, |
972 | | 64, |
973 | | 128, |
974 | | 1, |
975 | | 2, |
976 | | 4, |
977 | | 8, |
978 | | 16, |
979 | | 32, |
980 | | 64, |
981 | | 128>(); |
982 | | |
983 | 0 | simd32uint8 shiftr_16(simd32uint8 x, int n) { |
984 | 0 | return simd32uint8(simd16uint16(x) >> n); |
985 | 0 | } |
986 | | |
987 | | // 2-bit accumulator: we can add only up to 3 elements |
988 | | // on output we return 2*4-bit results |
989 | | template <int N, class Preproc> |
990 | | void compute_accu2_16( |
991 | | const uint16_t*& data, |
992 | | Preproc pp, |
993 | | simd32uint8& a4_0, |
994 | | simd32uint8& a4_1, |
995 | | simd32uint8& a4_2, |
996 | 0 | simd32uint8& a4_3) { |
997 | 0 | simd32uint8 mask1(0x55); |
998 | 0 | simd32uint8 a2_0; // 2-bit accu |
999 | 0 | simd32uint8 a2_1; // 2-bit accu |
1000 | 0 | a2_0.clear(); |
1001 | 0 | a2_1.clear(); |
1002 | |
|
1003 | 0 | for (int j = 0; j < N; j++) { |
1004 | 0 | simd16uint16 v(data); |
1005 | 0 | data += 16; |
1006 | 0 | v = pp(v); |
1007 | |
|
1008 | 0 | simd16uint16 idx = v | (v << 8); |
1009 | 0 | simd32uint8 a1 = shifts2.lookup_2_lanes(simd32uint8(idx)); |
1010 | | // contains 0s for out-of-bounds elements |
1011 | |
|
1012 | 0 | simd16uint16 lt8 = (v >> 3) == simd16uint16(0); |
1013 | 0 | lt8 = lt8 ^ simd16uint16(0xff00); |
1014 | |
|
1015 | 0 | a1 = a1 & lt8; |
1016 | |
|
1017 | 0 | a2_0 += a1 & mask1; |
1018 | 0 | a2_1 += shiftr_16(a1, 1) & mask1; |
1019 | 0 | } |
1020 | 0 | simd32uint8 mask2(0x33); |
1021 | |
|
1022 | 0 | a4_0 += a2_0 & mask2; |
1023 | 0 | a4_1 += a2_1 & mask2; |
1024 | 0 | a4_2 += shiftr_16(a2_0, 2) & mask2; |
1025 | 0 | a4_3 += shiftr_16(a2_1, 2) & mask2; |
1026 | 0 | } Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_10PreprocNOPEEEvRPKtT0_RNS_11simd32uint8ES8_S8_S8_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_10PreprocNOPEEEvRPKtT0_RNS_11simd32uint8ES8_S8_S8_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_10PreprocNOPEEEvRPKtT0_RNS_11simd32uint8ES8_S8_S8_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi0ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi0ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi0ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi1ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi1ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi1ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi2ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi2ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi2ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi3ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi3ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi3ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi4ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi4ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi4ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi5ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi5ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi5ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi6ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi6ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi6ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi7ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi7ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi7ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi8ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi8ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi8ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi9ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi9ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi9ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi10ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi10ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi10ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi11ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi11ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi11ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi12ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi12ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi12ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_ |
1027 | | |
1028 | 0 | simd32uint8 accu4to8_2(simd32uint8 a4_0, simd32uint8 a4_1) { |
1029 | 0 | simd32uint8 mask4(0x0f); |
1030 | |
|
1031 | 0 | simd16uint16 a8_0 = combine2x2( |
1032 | 0 | (simd16uint16)(a4_0 & mask4), |
1033 | 0 | (simd16uint16)(shiftr_16(a4_0, 4) & mask4)); |
1034 | |
|
1035 | 0 | simd16uint16 a8_1 = combine2x2( |
1036 | 0 | (simd16uint16)(a4_1 & mask4), |
1037 | 0 | (simd16uint16)(shiftr_16(a4_1, 4) & mask4)); |
1038 | |
|
1039 | 0 | return simd32uint8(hadd(a8_0, a8_1)); |
1040 | 0 | } |
1041 | | |
1042 | | template <class Preproc> |
1043 | 0 | simd16uint16 histogram_16(const uint16_t* data, Preproc pp, size_t n_in) { |
1044 | 0 | assert(n_in % 16 == 0); |
1045 | 0 | int n = n_in / 16; |
1046 | |
|
1047 | 0 | simd32uint8 a8lo((uint8_t)0); |
1048 | 0 | simd32uint8 a8hi((uint8_t)0); |
1049 | |
|
1050 | 0 | for (int i0 = 0; i0 < n; i0 += 7) { |
1051 | 0 | simd32uint8 a4_0(0); // 0, 4, 8, 12 |
1052 | 0 | simd32uint8 a4_1(0); // 1, 5, 9, 13 |
1053 | 0 | simd32uint8 a4_2(0); // 2, 6, 10, 14 |
1054 | 0 | simd32uint8 a4_3(0); // 3, 7, 11, 15 |
1055 | |
|
1056 | 0 | int i1 = std::min(i0 + 7, n); |
1057 | 0 | int i; |
1058 | 0 | for (i = i0; i + 2 < i1; i += 3) { |
1059 | 0 | compute_accu2_16<3>(data, pp, a4_0, a4_1, a4_2, a4_3); |
1060 | 0 | } |
1061 | 0 | switch (i1 - i) { |
1062 | 0 | case 2: |
1063 | 0 | compute_accu2_16<2>(data, pp, a4_0, a4_1, a4_2, a4_3); |
1064 | 0 | break; |
1065 | 0 | case 1: |
1066 | 0 | compute_accu2_16<1>(data, pp, a4_0, a4_1, a4_2, a4_3); |
1067 | 0 | break; |
1068 | 0 | } |
1069 | | |
1070 | 0 | a8lo += accu4to8_2(a4_0, a4_1); |
1071 | 0 | a8hi += accu4to8_2(a4_2, a4_3); |
1072 | 0 | } |
1073 | | |
1074 | | // move to 16-bit accu |
1075 | 0 | simd16uint16 a16lo = accu8to16(a8lo); |
1076 | 0 | simd16uint16 a16hi = accu8to16(a8hi); |
1077 | |
|
1078 | 0 | simd16uint16 a16 = hadd(a16lo, a16hi); |
1079 | |
|
1080 | 0 | a16 = simd16uint16{simd8uint32{a16}.unzip()}; |
1081 | |
|
1082 | 0 | return a16; |
1083 | 0 | } Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_10PreprocNOPEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi0ELi16EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi1ELi16EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi2ELi16EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi3ELi16EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi4ELi16EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi5ELi16EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi6ELi16EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi7ELi16EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi8ELi16EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi9ELi16EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi10ELi16EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi11ELi16EEEEENS_12simd16uint16EPKtT_m Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi12ELi16EEEEENS_12simd16uint16EPKtT_m |
1084 | | |
1085 | | struct PreprocNOP { |
1086 | 0 | simd16uint16 operator()(simd16uint16 x) { |
1087 | 0 | return x; |
1088 | 0 | } |
1089 | | }; |
1090 | | |
1091 | | template <int shift, int nbin> |
1092 | | struct PreprocMinShift { |
1093 | | simd16uint16 min16; |
1094 | | simd16uint16 max16; |
1095 | | |
1096 | 0 | explicit PreprocMinShift(uint16_t min) { |
1097 | 0 | min16.set1(min); |
1098 | 0 | int vmax0 = std::min((nbin << shift) + min, 65536); |
1099 | 0 | uint16_t vmax = uint16_t(vmax0 - 1 - min); |
1100 | 0 | max16.set1(vmax); // vmax inclusive |
1101 | 0 | } Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi0ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi1ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi2ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi3ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi4ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi5ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi6ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi7ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi8ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi9ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi10ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi11ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi12ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi13ELi8EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi0ELi16EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi1ELi16EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi2ELi16EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi3ELi16EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi4ELi16EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi5ELi16EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi6ELi16EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi7ELi16EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi8ELi16EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi9ELi16EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi10ELi16EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi11ELi16EEC2Et Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi12ELi16EEC2Et |
1102 | | |
1103 | 0 | simd16uint16 operator()(simd16uint16 x) { |
1104 | 0 | x = x - min16; |
1105 | 0 | simd16uint16 mask = (x == max(x, max16)) - (x == max16); |
1106 | 0 | return (x >> shift) | mask; |
1107 | 0 | } Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi0ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi1ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi2ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi3ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi4ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi5ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi6ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi7ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi8ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi9ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi10ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi11ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi12ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi13ELi8EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi0ELi16EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi1ELi16EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi2ELi16EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi3ELi16EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi4ELi16EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi5ELi16EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi6ELi16EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi7ELi16EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi8ELi16EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi9ELi16EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi10ELi16EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi11ELi16EEclENS_12simd16uint16E Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi12ELi16EEclENS_12simd16uint16E |
1108 | | }; |
1109 | | |
1110 | | /* unbounded versions of the functions */ |
1111 | | |
1112 | 0 | void simd_histogram_8_unbounded(const uint16_t* data, int n, int* hist) { |
1113 | 0 | PreprocNOP pp; |
1114 | 0 | simd16uint16 a16 = histogram_8(data, pp, (n & ~15)); |
1115 | |
|
1116 | 0 | ALIGNED(32) uint16_t a16_tab[16]; |
1117 | 0 | a16.store(a16_tab); |
1118 | |
|
1119 | 0 | for (int i = 0; i < 8; i++) { |
1120 | 0 | hist[i] = a16_tab[i] + a16_tab[i + 8]; |
1121 | 0 | } |
1122 | |
|
1123 | 0 | for (int i = (n & ~15); i < n; i++) { |
1124 | 0 | hist[data[i]]++; |
1125 | 0 | } |
1126 | 0 | } |
1127 | | |
1128 | 0 | void simd_histogram_16_unbounded(const uint16_t* data, int n, int* hist) { |
1129 | 0 | simd16uint16 a16 = histogram_16(data, PreprocNOP(), (n & ~15)); |
1130 | |
|
1131 | 0 | ALIGNED(32) uint16_t a16_tab[16]; |
1132 | 0 | a16.store(a16_tab); |
1133 | |
|
1134 | 0 | for (int i = 0; i < 16; i++) { |
1135 | 0 | hist[i] = a16_tab[i]; |
1136 | 0 | } |
1137 | |
|
1138 | 0 | for (int i = (n & ~15); i < n; i++) { |
1139 | 0 | hist[data[i]]++; |
1140 | 0 | } |
1141 | 0 | } |
1142 | | |
1143 | | } // anonymous namespace |
1144 | | |
1145 | | /************************************************************ |
1146 | | * Driver routines |
1147 | | ************************************************************/ |
1148 | | |
1149 | | void simd_histogram_8( |
1150 | | const uint16_t* data, |
1151 | | int n, |
1152 | | uint16_t min, |
1153 | | int shift, |
1154 | 0 | int* hist) { |
1155 | 0 | if (shift < 0) { |
1156 | 0 | simd_histogram_8_unbounded(data, n, hist); |
1157 | 0 | return; |
1158 | 0 | } |
1159 | | |
1160 | 0 | simd16uint16 a16; |
1161 | |
|
1162 | 0 | #define DISPATCH(s) \ |
1163 | 0 | case s: \ |
1164 | 0 | a16 = histogram_8(data, PreprocMinShift<s, 8>(min), (n & ~15)); \ |
1165 | 0 | break |
1166 | |
|
1167 | 0 | switch (shift) { |
1168 | 0 | DISPATCH(0); |
1169 | 0 | DISPATCH(1); |
1170 | 0 | DISPATCH(2); |
1171 | 0 | DISPATCH(3); |
1172 | 0 | DISPATCH(4); |
1173 | 0 | DISPATCH(5); |
1174 | 0 | DISPATCH(6); |
1175 | 0 | DISPATCH(7); |
1176 | 0 | DISPATCH(8); |
1177 | 0 | DISPATCH(9); |
1178 | 0 | DISPATCH(10); |
1179 | 0 | DISPATCH(11); |
1180 | 0 | DISPATCH(12); |
1181 | 0 | DISPATCH(13); |
1182 | 0 | default: |
1183 | 0 | FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift); |
1184 | 0 | } |
1185 | 0 | #undef DISPATCH |
1186 | | |
1187 | 0 | ALIGNED(32) uint16_t a16_tab[16]; |
1188 | 0 | a16.store(a16_tab); |
1189 | |
|
1190 | 0 | for (int i = 0; i < 8; i++) { |
1191 | 0 | hist[i] = a16_tab[i] + a16_tab[i + 8]; |
1192 | 0 | } |
1193 | | |
1194 | | // complete with remaining bins |
1195 | 0 | for (int i = (n & ~15); i < n; i++) { |
1196 | 0 | if (data[i] < min) |
1197 | 0 | continue; |
1198 | 0 | uint16_t v = data[i] - min; |
1199 | 0 | v >>= shift; |
1200 | 0 | if (v < 8) |
1201 | 0 | hist[v]++; |
1202 | 0 | } |
1203 | 0 | } |
1204 | | |
1205 | | void simd_histogram_16( |
1206 | | const uint16_t* data, |
1207 | | int n, |
1208 | | uint16_t min, |
1209 | | int shift, |
1210 | 0 | int* hist) { |
1211 | 0 | if (shift < 0) { |
1212 | 0 | simd_histogram_16_unbounded(data, n, hist); |
1213 | 0 | return; |
1214 | 0 | } |
1215 | | |
1216 | 0 | simd16uint16 a16; |
1217 | |
|
1218 | 0 | #define DISPATCH(s) \ |
1219 | 0 | case s: \ |
1220 | 0 | a16 = histogram_16(data, PreprocMinShift<s, 16>(min), (n & ~15)); \ |
1221 | 0 | break |
1222 | |
|
1223 | 0 | switch (shift) { |
1224 | 0 | DISPATCH(0); |
1225 | 0 | DISPATCH(1); |
1226 | 0 | DISPATCH(2); |
1227 | 0 | DISPATCH(3); |
1228 | 0 | DISPATCH(4); |
1229 | 0 | DISPATCH(5); |
1230 | 0 | DISPATCH(6); |
1231 | 0 | DISPATCH(7); |
1232 | 0 | DISPATCH(8); |
1233 | 0 | DISPATCH(9); |
1234 | 0 | DISPATCH(10); |
1235 | 0 | DISPATCH(11); |
1236 | 0 | DISPATCH(12); |
1237 | 0 | default: |
1238 | 0 | FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift); |
1239 | 0 | } |
1240 | 0 | #undef DISPATCH |
1241 | | |
1242 | 0 | ALIGNED(32) uint16_t a16_tab[16]; |
1243 | 0 | a16.store(a16_tab); |
1244 | |
|
1245 | 0 | for (int i = 0; i < 16; i++) { |
1246 | 0 | hist[i] = a16_tab[i]; |
1247 | 0 | } |
1248 | |
|
1249 | 0 | for (int i = (n & ~15); i < n; i++) { |
1250 | 0 | if (data[i] < min) |
1251 | 0 | continue; |
1252 | 0 | uint16_t v = data[i] - min; |
1253 | 0 | v >>= shift; |
1254 | 0 | if (v < 16) |
1255 | 0 | hist[v]++; |
1256 | 0 | } |
1257 | 0 | } |
1258 | | |
1259 | | // no AVX2 |
1260 | | #else |
1261 | | |
1262 | | void simd_histogram_16( |
1263 | | const uint16_t* data, |
1264 | | int n, |
1265 | | uint16_t min, |
1266 | | int shift, |
1267 | | int* hist) { |
1268 | | memset(hist, 0, sizeof(*hist) * 16); |
1269 | | if (shift < 0) { |
1270 | | for (size_t i = 0; i < n; i++) { |
1271 | | hist[data[i]]++; |
1272 | | } |
1273 | | } else { |
1274 | | int vmax0 = std::min((16 << shift) + min, 65536); |
1275 | | uint16_t vmax = uint16_t(vmax0 - 1 - min); |
1276 | | |
1277 | | for (size_t i = 0; i < n; i++) { |
1278 | | uint16_t v = data[i]; |
1279 | | v -= min; |
1280 | | if (!(v <= vmax)) |
1281 | | continue; |
1282 | | v >>= shift; |
1283 | | hist[v]++; |
1284 | | |
1285 | | /* |
1286 | | if (data[i] < min) continue; |
1287 | | uint16_t v = data[i] - min; |
1288 | | v >>= shift; |
1289 | | if (v < 16) hist[v]++; |
1290 | | */ |
1291 | | } |
1292 | | } |
1293 | | } |
1294 | | |
1295 | | void simd_histogram_8( |
1296 | | const uint16_t* data, |
1297 | | int n, |
1298 | | uint16_t min, |
1299 | | int shift, |
1300 | | int* hist) { |
1301 | | memset(hist, 0, sizeof(*hist) * 8); |
1302 | | if (shift < 0) { |
1303 | | for (size_t i = 0; i < n; i++) { |
1304 | | hist[data[i]]++; |
1305 | | } |
1306 | | } else { |
1307 | | for (size_t i = 0; i < n; i++) { |
1308 | | if (data[i] < min) |
1309 | | continue; |
1310 | | uint16_t v = data[i] - min; |
1311 | | v >>= shift; |
1312 | | if (v < 8) |
1313 | | hist[v]++; |
1314 | | } |
1315 | | } |
1316 | | } |
1317 | | |
1318 | | #endif |
1319 | | |
1320 | 1 | void PartitionStats::reset() { |
1321 | 1 | memset(this, 0, sizeof(*this)); |
1322 | 1 | } |
1323 | | |
1324 | | PartitionStats partition_stats; |
1325 | | |
1326 | | } // namespace faiss |