Coverage Report

Created: 2025-10-15 19:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/contrib/faiss/faiss/utils/partitioning.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) Meta Platforms, Inc. and affiliates.
3
 *
4
 * This source code is licensed under the MIT license found in the
5
 * LICENSE file in the root directory of this source tree.
6
 */
7
8
#include <faiss/utils/partitioning.h>
9
10
#include <cassert>
11
#include <cmath>
12
13
#include <faiss/impl/FaissAssert.h>
14
#include <faiss/utils/AlignedTable.h>
15
#include <faiss/utils/ordered_key_value.h>
16
#include <faiss/utils/simdlib.h>
17
18
#include <faiss/impl/platform_macros.h>
19
20
namespace faiss {
21
22
/******************************************************************
23
 * Internal routines
24
 ******************************************************************/
25
26
namespace partitioning {
27
28
template <typename T>
29
0
T median3(T a, T b, T c) {
30
0
    if (a > b) {
31
0
        std::swap(a, b);
32
0
    }
33
0
    if (c > b) {
34
0
        return b;
35
0
    }
36
0
    if (c > a) {
37
0
        return c;
38
0
    }
39
0
    return a;
40
0
}
Unexecuted instantiation: _ZN5faiss12partitioning7median3IfEET_S2_S2_S2_
Unexecuted instantiation: _ZN5faiss12partitioning7median3ItEET_S2_S2_S2_
41
42
template <class C>
43
typename C::T sample_threshold_median3(
44
        const typename C::T* vals,
45
        int n,
46
        typename C::T thresh_inf,
47
0
        typename C::T thresh_sup) {
48
0
    using T = typename C::T;
49
0
    size_t big_prime = 6700417;
50
0
    T val3[3];
51
0
    int vi = 0;
52
53
0
    for (size_t i = 0; i < n; i++) {
54
0
        T v = vals[(i * big_prime) % n];
55
        // thresh_inf < v < thresh_sup (for CMax)
56
0
        if (C::cmp(v, thresh_inf) && C::cmp(thresh_sup, v)) {
57
0
            val3[vi++] = v;
58
0
            if (vi == 3) {
59
0
                break;
60
0
            }
61
0
        }
62
0
    }
63
64
0
    if (vi == 3) {
65
0
        return median3(val3[0], val3[1], val3[2]);
66
0
    } else if (vi != 0) {
67
0
        return val3[0];
68
0
    } else {
69
0
        return thresh_inf;
70
        //   FAISS_THROW_MSG("too few values to compute a median");
71
0
    }
72
0
}
Unexecuted instantiation: _ZN5faiss12partitioning24sample_threshold_median3INS_4CMinIflEEEENT_1TEPKS5_iS5_S5_
Unexecuted instantiation: _ZN5faiss12partitioning24sample_threshold_median3INS_4CMaxIflEEEENT_1TEPKS5_iS5_S5_
Unexecuted instantiation: _ZN5faiss12partitioning24sample_threshold_median3INS_4CMinItlEEEENT_1TEPKS5_iS5_S5_
Unexecuted instantiation: _ZN5faiss12partitioning24sample_threshold_median3INS_4CMaxItlEEEENT_1TEPKS5_iS5_S5_
Unexecuted instantiation: _ZN5faiss12partitioning24sample_threshold_median3INS_4CMinItiEEEENT_1TEPKS5_iS5_S5_
Unexecuted instantiation: _ZN5faiss12partitioning24sample_threshold_median3INS_4CMaxItiEEEENT_1TEPKS5_iS5_S5_
73
74
template <class C>
75
void count_lt_and_eq(
76
        const typename C::T* vals,
77
        size_t n,
78
        typename C::T thresh,
79
        size_t& n_lt,
80
0
        size_t& n_eq) {
81
0
    n_lt = n_eq = 0;
82
83
0
    for (size_t i = 0; i < n; i++) {
84
0
        typename C::T v = *vals++;
85
0
        if (C::cmp(thresh, v)) {
86
0
            n_lt++;
87
0
        } else if (v == thresh) {
88
0
            n_eq++;
89
0
        }
90
0
    }
91
0
}
Unexecuted instantiation: _ZN5faiss12partitioning15count_lt_and_eqINS_4CMinIflEEEEvPKNT_1TEmS5_RmS8_
Unexecuted instantiation: _ZN5faiss12partitioning15count_lt_and_eqINS_4CMaxIflEEEEvPKNT_1TEmS5_RmS8_
Unexecuted instantiation: _ZN5faiss12partitioning15count_lt_and_eqINS_4CMinItlEEEEvPKNT_1TEmS5_RmS8_
Unexecuted instantiation: _ZN5faiss12partitioning15count_lt_and_eqINS_4CMaxItlEEEEvPKNT_1TEmS5_RmS8_
Unexecuted instantiation: _ZN5faiss12partitioning15count_lt_and_eqINS_4CMinItiEEEEvPKNT_1TEmS5_RmS8_
Unexecuted instantiation: _ZN5faiss12partitioning15count_lt_and_eqINS_4CMaxItiEEEEvPKNT_1TEmS5_RmS8_
92
93
template <class C>
94
size_t compress_array(
95
        typename C::T* vals,
96
        typename C::TI* ids,
97
        size_t n,
98
        typename C::T thresh,
99
0
        size_t n_eq) {
100
0
    size_t wp = 0;
101
0
    for (size_t i = 0; i < n; i++) {
102
0
        if (C::cmp(thresh, vals[i])) {
103
0
            vals[wp] = vals[i];
104
0
            ids[wp] = ids[i];
105
0
            wp++;
106
0
        } else if (n_eq > 0 && vals[i] == thresh) {
107
0
            vals[wp] = vals[i];
108
0
            ids[wp] = ids[i];
109
0
            wp++;
110
0
            n_eq--;
111
0
        }
112
0
    }
113
0
    assert(n_eq == 0);
114
0
    return wp;
115
0
}
Unexecuted instantiation: _ZN5faiss12partitioning14compress_arrayINS_4CMinIflEEEEmPNT_1TEPNS4_2TIEmS5_m
Unexecuted instantiation: _ZN5faiss12partitioning14compress_arrayINS_4CMaxIflEEEEmPNT_1TEPNS4_2TIEmS5_m
Unexecuted instantiation: _ZN5faiss12partitioning14compress_arrayINS_4CMinItlEEEEmPNT_1TEPNS4_2TIEmS5_m
Unexecuted instantiation: _ZN5faiss12partitioning14compress_arrayINS_4CMaxItlEEEEmPNT_1TEPNS4_2TIEmS5_m
Unexecuted instantiation: _ZN5faiss12partitioning14compress_arrayINS_4CMinItiEEEEmPNT_1TEPNS4_2TIEmS5_m
Unexecuted instantiation: _ZN5faiss12partitioning14compress_arrayINS_4CMaxItiEEEEmPNT_1TEPNS4_2TIEmS5_m
116
117
0
#define IFV if (false)
118
119
template <class C>
120
typename C::T partition_fuzzy_median3(
121
        typename C::T* vals,
122
        typename C::TI* ids,
123
        size_t n,
124
        size_t q_min,
125
        size_t q_max,
126
0
        size_t* q_out) {
127
0
    if (q_min == 0) {
128
0
        if (q_out) {
129
0
            *q_out = C::Crev::neutral();
130
0
        }
131
0
        return 0;
132
0
    }
133
0
    if (q_max >= n) {
134
0
        if (q_out) {
135
0
            *q_out = q_max;
136
0
        }
137
0
        return C::neutral();
138
0
    }
139
140
0
    using T = typename C::T;
141
142
    // here we use bissection with a median of 3 to find the threshold and
143
    // compress the arrays afterwards. So it's a n*log(n) algoirithm rather than
144
    // qselect's O(n) but it avoids shuffling around the array.
145
146
0
    FAISS_THROW_IF_NOT(n >= 3);
147
148
0
    T thresh_inf = C::Crev::neutral();
149
0
    T thresh_sup = C::neutral();
150
0
    T thresh = median3(vals[0], vals[n / 2], vals[n - 1]);
151
152
0
    size_t n_eq = 0, n_lt = 0;
153
0
    size_t q = 0;
154
155
0
    for (int it = 0; it < 200; it++) {
156
0
        count_lt_and_eq<C>(vals, n, thresh, n_lt, n_eq);
157
158
0
        IFV printf(
159
0
                "   thresh=%g [%g %g] n_lt=%ld n_eq=%ld, q=%ld:%ld/%ld\n",
160
0
                float(thresh),
161
0
                float(thresh_inf),
162
0
                float(thresh_sup),
163
0
                long(n_lt),
164
0
                long(n_eq),
165
0
                long(q_min),
166
0
                long(q_max),
167
0
                long(n));
168
169
0
        if (n_lt <= q_min) {
170
0
            if (n_lt + n_eq >= q_min) {
171
0
                q = q_min;
172
0
                break;
173
0
            } else {
174
0
                thresh_inf = thresh;
175
0
            }
176
0
        } else if (n_lt <= q_max) {
177
0
            q = n_lt;
178
0
            break;
179
0
        } else {
180
0
            thresh_sup = thresh;
181
0
        }
182
183
        // FIXME avoid a second pass over the array to sample the threshold
184
0
        IFV printf(
185
0
                "     sample thresh in [%g %g]\n",
186
0
                float(thresh_inf),
187
0
                float(thresh_sup));
188
0
        T new_thresh =
189
0
                sample_threshold_median3<C>(vals, n, thresh_inf, thresh_sup);
190
0
        if (new_thresh == thresh_inf) {
191
            // then there is nothing between thresh_inf and thresh_sup
192
0
            break;
193
0
        }
194
0
        thresh = new_thresh;
195
0
    }
196
197
0
    int64_t n_eq_1 = q - n_lt;
198
199
0
    IFV printf("shrink: thresh=%g n_eq_1=%ld\n", float(thresh), long(n_eq_1));
200
201
0
    if (n_eq_1 < 0) { // happens when > q elements are at lower bound
202
0
        q = q_min;
203
0
        thresh = C::Crev::nextafter(thresh);
204
0
        n_eq_1 = q;
205
0
    } else {
206
0
        assert(n_eq_1 <= n_eq);
207
0
    }
208
209
0
    [[maybe_unused]] const int wp =
210
0
            compress_array<C>(vals, ids, n, thresh, n_eq_1);
211
212
0
    assert(wp == q);
213
0
    if (q_out) {
214
0
        *q_out = q;
215
0
    }
216
217
0
    return thresh;
218
0
}
Unexecuted instantiation: _ZN5faiss12partitioning23partition_fuzzy_median3INS_4CMinIflEEEENT_1TEPS5_PNS4_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss12partitioning23partition_fuzzy_median3INS_4CMaxIflEEEENT_1TEPS5_PNS4_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss12partitioning23partition_fuzzy_median3INS_4CMinItlEEEENT_1TEPS5_PNS4_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss12partitioning23partition_fuzzy_median3INS_4CMaxItlEEEENT_1TEPS5_PNS4_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss12partitioning23partition_fuzzy_median3INS_4CMinItiEEEENT_1TEPS5_PNS4_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss12partitioning23partition_fuzzy_median3INS_4CMaxItiEEEENT_1TEPS5_PNS4_2TIEmmmPm
219
220
} // namespace partitioning
221
222
/******************************************************************
223
 * SIMD routines when vals is an aligned array of uint16_t
224
 ******************************************************************/
225
226
namespace simd_partitioning {
227
228
void find_minimax(
229
        const uint16_t* vals,
230
        size_t n,
231
        uint16_t& smin,
232
0
        uint16_t& smax) {
233
0
    simd16uint16 vmin(0xffff), vmax(0);
234
0
    for (size_t i = 0; i + 15 < n; i += 16) {
235
0
        simd16uint16 v(vals + i);
236
0
        vmin.accu_min(v);
237
0
        vmax.accu_max(v);
238
0
    }
239
240
0
    ALIGNED(32) uint16_t tab32[32];
241
0
    vmin.store(tab32);
242
0
    vmax.store(tab32 + 16);
243
244
0
    smin = tab32[0], smax = tab32[16];
245
246
0
    for (int i = 1; i < 16; i++) {
247
0
        smin = std::min(smin, tab32[i]);
248
0
        smax = std::max(smax, tab32[i + 16]);
249
0
    }
250
251
    // missing values
252
0
    for (size_t i = (n & ~15); i < n; i++) {
253
0
        smin = std::min(smin, vals[i]);
254
0
        smax = std::max(smax, vals[i]);
255
0
    }
256
0
}
257
258
// max func differentiates between CMin and CMax (keep lowest or largest)
259
template <class C>
260
0
simd16uint16 max_func(simd16uint16 v, simd16uint16 thr16) {
261
0
    constexpr bool is_max = C::is_max;
262
0
    if (is_max) {
263
0
        return max(v, thr16);
264
0
    } else {
265
0
        return min(v, thr16);
266
0
    }
267
0
}
Unexecuted instantiation: _ZN5faiss17simd_partitioning8max_funcINS_4CMinItlEEEENS_12simd16uint16ES4_S4_
Unexecuted instantiation: _ZN5faiss17simd_partitioning8max_funcINS_4CMaxItlEEEENS_12simd16uint16ES4_S4_
Unexecuted instantiation: _ZN5faiss17simd_partitioning8max_funcINS_4CMinItiEEEENS_12simd16uint16ES4_S4_
Unexecuted instantiation: _ZN5faiss17simd_partitioning8max_funcINS_4CMaxItiEEEENS_12simd16uint16ES4_S4_
Unexecuted instantiation: _ZN5faiss17simd_partitioning8max_funcINS_4CMinIflEEEENS_12simd16uint16ES4_S4_
Unexecuted instantiation: _ZN5faiss17simd_partitioning8max_funcINS_4CMaxIflEEEENS_12simd16uint16ES4_S4_
268
269
template <class C>
270
void count_lt_and_eq(
271
        const uint16_t* vals,
272
        int n,
273
        uint16_t thresh,
274
        size_t& n_lt,
275
0
        size_t& n_eq) {
276
0
    n_lt = n_eq = 0;
277
0
    simd16uint16 thr16(thresh);
278
279
0
    size_t n1 = n / 16;
280
281
0
    for (size_t i = 0; i < n1; i++) {
282
0
        simd16uint16 v(vals);
283
0
        vals += 16;
284
0
        simd16uint16 eqmask = (v == thr16);
285
0
        simd16uint16 max2 = max_func<C>(v, thr16);
286
0
        simd16uint16 gemask = (v == max2);
287
0
        uint32_t bits = get_MSBs(uint16_to_uint8_saturate(eqmask, gemask));
288
0
        int i_eq = __builtin_popcount(bits & 0x00ff00ff);
289
0
        int i_ge = __builtin_popcount(bits) - i_eq;
290
0
        n_eq += i_eq;
291
0
        n_lt += 16 - i_ge;
292
0
    }
293
294
0
    for (size_t i = n1 * 16; i < n; i++) {
295
0
        uint16_t v = *vals++;
296
0
        if (C::cmp(thresh, v)) {
297
0
            n_lt++;
298
0
        } else if (v == thresh) {
299
0
            n_eq++;
300
0
        }
301
0
    }
302
0
}
Unexecuted instantiation: _ZN5faiss17simd_partitioning15count_lt_and_eqINS_4CMinItlEEEEvPKtitRmS6_
Unexecuted instantiation: _ZN5faiss17simd_partitioning15count_lt_and_eqINS_4CMaxItlEEEEvPKtitRmS6_
Unexecuted instantiation: _ZN5faiss17simd_partitioning15count_lt_and_eqINS_4CMinItiEEEEvPKtitRmS6_
Unexecuted instantiation: _ZN5faiss17simd_partitioning15count_lt_and_eqINS_4CMaxItiEEEEvPKtitRmS6_
Unexecuted instantiation: _ZN5faiss17simd_partitioning15count_lt_and_eqINS_4CMinIflEEEEvPKtitRmS6_
Unexecuted instantiation: _ZN5faiss17simd_partitioning15count_lt_and_eqINS_4CMaxIflEEEEvPKtitRmS6_
303
304
/* compress separated values and ids table, keeping all values < thresh and at
305
 * most n_eq equal values */
306
template <class C>
307
int simd_compress_array(
308
        uint16_t* vals,
309
        typename C::TI* ids,
310
        size_t n,
311
        uint16_t thresh,
312
0
        int n_eq) {
313
0
    simd16uint16 thr16(thresh);
314
0
    simd16uint16 mixmask(0xff00);
315
316
0
    int wp = 0;
317
0
    size_t i0;
318
319
    // loop while there are eqs to collect
320
0
    for (i0 = 0; i0 + 15 < n && n_eq > 0; i0 += 16) {
321
0
        simd16uint16 v(vals + i0);
322
0
        simd16uint16 max2 = max_func<C>(v, thr16);
323
0
        simd16uint16 gemask = (v == max2);
324
0
        simd16uint16 eqmask = (v == thr16);
325
0
        uint32_t bits = get_MSBs(
326
0
                blendv(simd32uint8(eqmask),
327
0
                       simd32uint8(gemask),
328
0
                       simd32uint8(mixmask)));
329
0
        bits ^= 0xAAAAAAAA;
330
        // bit 2*i     : eq
331
        // bit 2*i + 1 : lt
332
333
0
        while (bits) {
334
0
            int j = __builtin_ctz(bits) & (~1);
335
0
            bool is_eq = (bits >> j) & 1;
336
0
            bool is_lt = (bits >> j) & 2;
337
0
            bits &= ~(3 << j);
338
0
            j >>= 1;
339
340
0
            if (is_lt) {
341
0
                vals[wp] = vals[i0 + j];
342
0
                ids[wp] = ids[i0 + j];
343
0
                wp++;
344
0
            } else if (is_eq && n_eq > 0) {
345
0
                vals[wp] = vals[i0 + j];
346
0
                ids[wp] = ids[i0 + j];
347
0
                wp++;
348
0
                n_eq--;
349
0
            }
350
0
        }
351
0
    }
352
353
    // handle remaining, only striclty lt ones.
354
0
    for (; i0 + 15 < n; i0 += 16) {
355
0
        simd16uint16 v(vals + i0);
356
0
        simd16uint16 max2 = max_func<C>(v, thr16);
357
0
        simd16uint16 gemask = (v == max2);
358
0
        uint32_t bits = ~get_MSBs(simd32uint8(gemask));
359
360
0
        while (bits) {
361
0
            int j = __builtin_ctz(bits);
362
0
            bits &= ~(3 << j);
363
0
            j >>= 1;
364
365
0
            vals[wp] = vals[i0 + j];
366
0
            ids[wp] = ids[i0 + j];
367
0
            wp++;
368
0
        }
369
0
    }
370
371
    // end with scalar
372
0
    for (int i = (n & ~15); i < n; i++) {
373
0
        if (C::cmp(thresh, vals[i])) {
374
0
            vals[wp] = vals[i];
375
0
            ids[wp] = ids[i];
376
0
            wp++;
377
0
        } else if (vals[i] == thresh && n_eq > 0) {
378
0
            vals[wp] = vals[i];
379
0
            ids[wp] = ids[i];
380
0
            wp++;
381
0
            n_eq--;
382
0
        }
383
0
    }
384
0
    assert(n_eq == 0);
385
0
    return wp;
386
0
}
Unexecuted instantiation: _ZN5faiss17simd_partitioning19simd_compress_arrayINS_4CMinItlEEEEiPtPNT_2TIEmti
Unexecuted instantiation: _ZN5faiss17simd_partitioning19simd_compress_arrayINS_4CMaxItlEEEEiPtPNT_2TIEmti
Unexecuted instantiation: _ZN5faiss17simd_partitioning19simd_compress_arrayINS_4CMinItiEEEEiPtPNT_2TIEmti
Unexecuted instantiation: _ZN5faiss17simd_partitioning19simd_compress_arrayINS_4CMaxItiEEEEiPtPNT_2TIEmti
Unexecuted instantiation: _ZN5faiss17simd_partitioning19simd_compress_arrayINS_4CMinIflEEEEiPtPNT_2TIEmti
Unexecuted instantiation: _ZN5faiss17simd_partitioning19simd_compress_arrayINS_4CMaxIflEEEEiPtPNT_2TIEmti
387
388
// #define MICRO_BENCHMARK
389
390
0
static uint64_t get_cy() {
391
#ifdef MICRO_BENCHMARK
392
    uint32_t high, low;
393
    asm volatile("rdtsc \n\t" : "=a"(low), "=d"(high));
394
    return ((uint64_t)high << 32) | (low);
395
#else
396
0
    return 0;
397
0
#endif
398
0
}
399
400
0
#define IFV if (false)
401
402
template <class C>
403
uint16_t simd_partition_fuzzy_with_bounds(
404
        uint16_t* vals,
405
        typename C::TI* ids,
406
        size_t n,
407
        size_t q_min,
408
        size_t q_max,
409
        size_t* q_out,
410
        uint16_t s0i,
411
0
        uint16_t s1i) {
412
0
    if (q_min == 0) {
413
0
        if (q_out) {
414
0
            *q_out = 0;
415
0
        }
416
0
        return 0;
417
0
    }
418
0
    if (q_max >= n) {
419
0
        if (q_out) {
420
0
            *q_out = q_max;
421
0
        }
422
0
        return 0xffff;
423
0
    }
424
0
    if (s0i == s1i) {
425
0
        if (q_out) {
426
0
            *q_out = q_min;
427
0
        }
428
0
        return s0i;
429
0
    }
430
0
    uint64_t t0 = get_cy();
431
432
    // lower bound inclusive, upper exclusive
433
0
    size_t s0 = s0i, s1 = s1i + 1;
434
435
0
    IFV printf("bounds: %ld %ld\n", s0, s1 - 1);
436
437
0
    int thresh;
438
0
    size_t n_eq = 0, n_lt = 0;
439
0
    size_t q = 0;
440
441
0
    for (int it = 0; it < 200; it++) {
442
        // while(s0 + 1 < s1) {
443
0
        thresh = (s0 + s1) / 2;
444
0
        count_lt_and_eq<C>(vals, n, thresh, n_lt, n_eq);
445
446
0
        IFV printf(
447
0
                "   [%ld %ld] thresh=%d n_lt=%ld n_eq=%ld, q=%ld:%ld/%ld\n",
448
0
                s0,
449
0
                s1,
450
0
                thresh,
451
0
                n_lt,
452
0
                n_eq,
453
0
                q_min,
454
0
                q_max,
455
0
                n);
456
0
        if (n_lt <= q_min) {
457
0
            if (n_lt + n_eq >= q_min) {
458
0
                q = q_min;
459
0
                break;
460
0
            } else {
461
0
                if (C::is_max) {
462
0
                    s0 = thresh;
463
0
                } else {
464
0
                    s1 = thresh;
465
0
                }
466
0
            }
467
0
        } else if (n_lt <= q_max) {
468
0
            q = n_lt;
469
0
            break;
470
0
        } else {
471
0
            if (C::is_max) {
472
0
                s1 = thresh;
473
0
            } else {
474
0
                s0 = thresh;
475
0
            }
476
0
        }
477
0
    }
478
479
0
    uint64_t t1 = get_cy();
480
481
    // number of equal values to keep
482
0
    int64_t n_eq_1 = q - n_lt;
483
484
0
    IFV printf("shrink: thresh=%d q=%ld n_eq_1=%ld\n", thresh, q, n_eq_1);
485
0
    if (n_eq_1 < 0) { // happens when > q elements are at lower bound
486
0
        assert(s0 + 1 == s1);
487
0
        q = q_min;
488
0
        if (C::is_max) {
489
0
            thresh--;
490
0
        } else {
491
0
            thresh++;
492
0
        }
493
0
        n_eq_1 = q;
494
0
        IFV printf("  override: thresh=%d n_eq_1=%ld\n", thresh, n_eq_1);
495
0
    } else {
496
0
        assert(n_eq_1 <= n_eq);
497
0
    }
498
499
0
    size_t wp = simd_compress_array<C>(vals, ids, n, thresh, n_eq_1);
500
501
0
    IFV printf("wp=%ld\n", wp);
502
0
    assert(wp == q);
503
0
    if (q_out) {
504
0
        *q_out = q;
505
0
    }
506
507
0
    uint64_t t2 = get_cy();
508
509
0
    partition_stats.bissect_cycles += t1 - t0;
510
0
    partition_stats.compress_cycles += t2 - t1;
511
512
0
    return thresh;
513
0
}
Unexecuted instantiation: _ZN5faiss17simd_partitioning32simd_partition_fuzzy_with_boundsINS_4CMinItlEEEEtPtPNT_2TIEmmmPmtt
Unexecuted instantiation: _ZN5faiss17simd_partitioning32simd_partition_fuzzy_with_boundsINS_4CMaxItlEEEEtPtPNT_2TIEmmmPmtt
Unexecuted instantiation: _ZN5faiss17simd_partitioning32simd_partition_fuzzy_with_boundsINS_4CMinItiEEEEtPtPNT_2TIEmmmPmtt
Unexecuted instantiation: _ZN5faiss17simd_partitioning32simd_partition_fuzzy_with_boundsINS_4CMaxItiEEEEtPtPNT_2TIEmmmPmtt
Unexecuted instantiation: _ZN5faiss17simd_partitioning32simd_partition_fuzzy_with_boundsINS_4CMinIflEEEEtPtPNT_2TIEmmmPmtt
Unexecuted instantiation: _ZN5faiss17simd_partitioning32simd_partition_fuzzy_with_boundsINS_4CMaxIflEEEEtPtPNT_2TIEmmmPmtt
514
515
template <class C>
516
uint16_t simd_partition_fuzzy_with_bounds_histogram(
517
        uint16_t* vals,
518
        typename C::TI* ids,
519
        size_t n,
520
        size_t q_min,
521
        size_t q_max,
522
        size_t* q_out,
523
        uint16_t s0i,
524
        uint16_t s1i) {
525
    if (q_min == 0) {
526
        if (q_out) {
527
            *q_out = 0;
528
        }
529
        return 0;
530
    }
531
    if (q_max >= n) {
532
        if (q_out) {
533
            *q_out = q_max;
534
        }
535
        return 0xffff;
536
    }
537
    if (s0i == s1i) {
538
        if (q_out) {
539
            *q_out = q_min;
540
        }
541
        return s0i;
542
    }
543
544
    IFV printf(
545
            "partition fuzzy, q=%ld:%ld / %ld, bounds=%d %d\n",
546
            q_min,
547
            q_max,
548
            n,
549
            s0i,
550
            s1i);
551
552
    if (!C::is_max) {
553
        IFV printf(
554
                "revert due to CMin, q_min:q_max -> %ld:%ld\n", q_min, q_max);
555
        q_min = n - q_min;
556
        q_max = n - q_max;
557
    }
558
559
    // lower and upper bound of range, inclusive
560
    int s0 = s0i, s1 = s1i;
561
    // number of values < s0 and > s1
562
    size_t n_lt = 0, n_gt = 0;
563
564
    // output of loop:
565
    int thresh;          // final threshold
566
    uint64_t tot_eq = 0; // total nb of equal values
567
    uint64_t n_eq = 0;   // nb of equal values to keep
568
    size_t q;            // final quantile
569
570
    // buffer for the histograms
571
    int hist[16];
572
573
    for (int it = 0; it < 20; it++) {
574
        // otherwise we would be done already
575
576
        int shift = 0;
577
578
        IFV printf(
579
                "  it %d bounds: %d %d n_lt=%ld n_gt=%ld\n",
580
                it,
581
                s0,
582
                s1,
583
                n_lt,
584
                n_gt);
585
586
        int maxval = s1 - s0;
587
588
        while (maxval > 15) {
589
            shift++;
590
            maxval >>= 1;
591
        }
592
593
        IFV printf(
594
                "    histogram shift %d maxval %d ?= %d\n",
595
                shift,
596
                maxval,
597
                int((s1 - s0) >> shift));
598
599
        if (maxval > 7) {
600
            simd_histogram_16(vals, n, s0, shift, hist);
601
        } else {
602
            simd_histogram_8(vals, n, s0, shift, hist);
603
        }
604
        IFV {
605
            int sum = n_lt + n_gt;
606
            printf("    n_lt=%ld hist=[", n_lt);
607
            for (int i = 0; i <= maxval; i++) {
608
                printf("%d ", hist[i]);
609
                sum += hist[i];
610
            }
611
            printf("] n_gt=%ld sum=%d\n", n_gt, sum);
612
            assert(sum == n);
613
        }
614
615
        size_t sum_below = n_lt;
616
        int i;
617
        for (i = 0; i <= maxval; i++) {
618
            sum_below += hist[i];
619
            if (sum_below >= q_min) {
620
                break;
621
            }
622
        }
623
        IFV printf("    i=%d sum_below=%ld\n", i, sum_below);
624
        if (i <= maxval) {
625
            s0 = s0 + (i << shift);
626
            s1 = s0 + (1 << shift) - 1;
627
            n_lt = sum_below - hist[i];
628
            n_gt = n - sum_below;
629
        } else {
630
            assert(!"not implemented");
631
        }
632
633
        IFV printf(
634
                "    new bin: s0=%d s1=%d n_lt=%ld n_gt=%ld\n",
635
                s0,
636
                s1,
637
                n_lt,
638
                n_gt);
639
640
        if (s1 > s0) {
641
            if (n_lt >= q_min && q_max >= n_lt) {
642
                IFV printf("    FOUND1\n");
643
                thresh = s0;
644
                q = n_lt;
645
                break;
646
            }
647
648
            size_t n_lt_2 = n - n_gt;
649
            if (n_lt_2 >= q_min && q_max >= n_lt_2) {
650
                thresh = s1 + 1;
651
                q = n_lt_2;
652
                IFV printf("    FOUND2\n");
653
                break;
654
            }
655
        } else {
656
            thresh = s0;
657
            q = q_min;
658
            tot_eq = n - n_gt - n_lt;
659
            n_eq = q_min - n_lt;
660
            IFV printf("    FOUND3\n");
661
            break;
662
        }
663
    }
664
665
    IFV printf("end bissection: thresh=%d q=%ld n_eq=%ld\n", thresh, q, n_eq);
666
667
    if (!C::is_max) {
668
        if (n_eq == 0) {
669
            thresh--;
670
        } else {
671
            // thresh unchanged
672
            n_eq = tot_eq - n_eq;
673
        }
674
        q = n - q;
675
        IFV printf("revert due to CMin, q->%ld n_eq->%ld\n", q, n_eq);
676
    }
677
678
    size_t wp = simd_compress_array<C>(vals, ids, n, thresh, n_eq);
679
    IFV printf("wp=%ld ?= %ld\n", wp, q);
680
    assert(wp == q);
681
    if (q_out) {
682
        *q_out = wp;
683
    }
684
685
    return thresh;
686
}
687
688
template <class C>
689
uint16_t simd_partition_fuzzy(
690
        uint16_t* vals,
691
        typename C::TI* ids,
692
        size_t n,
693
        size_t q_min,
694
        size_t q_max,
695
0
        size_t* q_out) {
696
0
    assert(is_aligned_pointer(vals));
697
698
0
    uint16_t s0i, s1i;
699
0
    find_minimax(vals, n, s0i, s1i);
700
    // QSelect_stats.t0 += get_cy() - t0;
701
702
0
    return simd_partition_fuzzy_with_bounds<C>(
703
0
            vals, ids, n, q_min, q_max, q_out, s0i, s1i);
704
0
}
Unexecuted instantiation: _ZN5faiss17simd_partitioning20simd_partition_fuzzyINS_4CMinItlEEEEtPtPNT_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss17simd_partitioning20simd_partition_fuzzyINS_4CMaxItlEEEEtPtPNT_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss17simd_partitioning20simd_partition_fuzzyINS_4CMinItiEEEEtPtPNT_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss17simd_partitioning20simd_partition_fuzzyINS_4CMaxItiEEEEtPtPNT_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss17simd_partitioning20simd_partition_fuzzyINS_4CMinIflEEEEtPtPNT_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss17simd_partitioning20simd_partition_fuzzyINS_4CMaxIflEEEEtPtPNT_2TIEmmmPm
705
706
template <class C>
707
uint16_t simd_partition(
708
        uint16_t* vals,
709
        typename C::TI* ids,
710
        size_t n,
711
        size_t q) {
712
    assert(is_aligned_pointer(vals));
713
714
    if (q == 0) {
715
        return 0;
716
    }
717
    if (q >= n) {
718
        return 0xffff;
719
    }
720
721
    uint16_t s0i, s1i;
722
    find_minimax(vals, n, s0i, s1i);
723
724
    return simd_partition_fuzzy_with_bounds<C>(
725
            vals, ids, n, q, q, nullptr, s0i, s1i);
726
}
727
728
template <class C>
729
uint16_t simd_partition_with_bounds(
730
        uint16_t* vals,
731
        typename C::TI* ids,
732
        size_t n,
733
        size_t q,
734
        uint16_t s0i,
735
        uint16_t s1i) {
736
    return simd_partition_fuzzy_with_bounds<C>(
737
            vals, ids, n, q, q, nullptr, s0i, s1i);
738
}
739
740
} // namespace simd_partitioning
741
742
/******************************************************************
743
 * Driver routine
744
 ******************************************************************/
745
746
template <class C>
747
typename C::T partition_fuzzy(
748
        typename C::T* vals,
749
        typename C::TI* ids,
750
        size_t n,
751
        size_t q_min,
752
        size_t q_max,
753
0
        size_t* q_out) {
754
0
#ifdef __AVX2__
755
0
    constexpr bool is_uint16 = std::is_same<typename C::T, uint16_t>::value;
756
0
    if (is_uint16 && is_aligned_pointer(vals)) {
757
0
        return simd_partitioning::simd_partition_fuzzy<C>(
758
0
                (uint16_t*)vals, ids, n, q_min, q_max, q_out);
759
0
    }
760
0
#endif
761
0
    return partitioning::partition_fuzzy_median3<C>(
762
0
            vals, ids, n, q_min, q_max, q_out);
763
0
}
Unexecuted instantiation: _ZN5faiss15partition_fuzzyINS_4CMinIflEEEENT_1TEPS4_PNS3_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss15partition_fuzzyINS_4CMaxIflEEEENT_1TEPS4_PNS3_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss15partition_fuzzyINS_4CMinItlEEEENT_1TEPS4_PNS3_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss15partition_fuzzyINS_4CMaxItlEEEENT_1TEPS4_PNS3_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss15partition_fuzzyINS_4CMinItiEEEENT_1TEPS4_PNS3_2TIEmmmPm
Unexecuted instantiation: _ZN5faiss15partition_fuzzyINS_4CMaxItiEEEENT_1TEPS4_PNS3_2TIEmmmPm
764
765
// explicit template instanciations
766
767
template float partition_fuzzy<CMin<float, int64_t>>(
768
        float* vals,
769
        int64_t* ids,
770
        size_t n,
771
        size_t q_min,
772
        size_t q_max,
773
        size_t* q_out);
774
775
template float partition_fuzzy<CMax<float, int64_t>>(
776
        float* vals,
777
        int64_t* ids,
778
        size_t n,
779
        size_t q_min,
780
        size_t q_max,
781
        size_t* q_out);
782
783
template uint16_t partition_fuzzy<CMin<uint16_t, int64_t>>(
784
        uint16_t* vals,
785
        int64_t* ids,
786
        size_t n,
787
        size_t q_min,
788
        size_t q_max,
789
        size_t* q_out);
790
791
template uint16_t partition_fuzzy<CMax<uint16_t, int64_t>>(
792
        uint16_t* vals,
793
        int64_t* ids,
794
        size_t n,
795
        size_t q_min,
796
        size_t q_max,
797
        size_t* q_out);
798
799
template uint16_t partition_fuzzy<CMin<uint16_t, int>>(
800
        uint16_t* vals,
801
        int* ids,
802
        size_t n,
803
        size_t q_min,
804
        size_t q_max,
805
        size_t* q_out);
806
807
template uint16_t partition_fuzzy<CMax<uint16_t, int>>(
808
        uint16_t* vals,
809
        int* ids,
810
        size_t n,
811
        size_t q_min,
812
        size_t q_max,
813
        size_t* q_out);
814
815
/******************************************************************
816
 * Histogram subroutines
817
 ******************************************************************/
818
819
#if defined(__AVX2__) || defined(__aarch64__)
820
/// FIXME when MSB of uint16 is set
821
// this code does not compile properly with GCC 7.4.0
822
823
namespace {
824
825
/************************************************************
826
 * 8 bins
827
 ************************************************************/
828
829
0
simd32uint8 accu4to8(simd16uint16 a4) {
830
0
    simd16uint16 mask4(0x0f0f);
831
832
0
    simd16uint16 a8_0 = a4 & mask4;
833
0
    simd16uint16 a8_1 = (a4 >> 4) & mask4;
834
835
0
    return simd32uint8(hadd(a8_0, a8_1));
836
0
}
837
838
0
simd16uint16 accu8to16(simd32uint8 a8) {
839
0
    simd16uint16 mask8(0x00ff);
840
841
0
    simd16uint16 a8_0 = simd16uint16(a8) & mask8;
842
0
    simd16uint16 a8_1 = (simd16uint16(a8) >> 8) & mask8;
843
844
0
    return hadd(a8_0, a8_1);
845
0
}
846
847
static const simd32uint8 shifts = simd32uint8::create<
848
        1,
849
        16,
850
        0,
851
        0,
852
        4,
853
        64,
854
        0,
855
        0,
856
        0,
857
        0,
858
        1,
859
        16,
860
        0,
861
        0,
862
        4,
863
        64,
864
        1,
865
        16,
866
        0,
867
        0,
868
        4,
869
        64,
870
        0,
871
        0,
872
        0,
873
        0,
874
        1,
875
        16,
876
        0,
877
        0,
878
        4,
879
        64>();
880
881
// 2-bit accumulator: we can add only up to 3 elements
882
// on output we return 2*4-bit results
883
// preproc returns either an index in 0..7 or 0xffff
884
// that yields a 0 when used in the table look-up
885
template <int N, class Preproc>
886
void compute_accu2(
887
        const uint16_t*& data,
888
        Preproc& pp,
889
        simd16uint16& a4lo,
890
0
        simd16uint16& a4hi) {
891
0
    simd16uint16 mask2(0x3333);
892
0
    simd16uint16 a2((uint16_t)0); // 2-bit accu
893
0
    for (int j = 0; j < N; j++) {
894
0
        simd16uint16 v(data);
895
0
        data += 16;
896
0
        v = pp(v);
897
        // 0x800 -> force second half of table
898
0
        simd16uint16 idx = v | (v << 8) | simd16uint16(0x800);
899
0
        a2 += simd16uint16(shifts.lookup_2_lanes(simd32uint8(idx)));
900
0
    }
901
0
    a4lo += a2 & mask2;
902
0
    a4hi += (a2 >> 2) & mask2;
903
0
}
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_10PreprocNOPEEEvRPKtRT0_RNS_12simd16uint16ES9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_10PreprocNOPEEEvRPKtRT0_RNS_12simd16uint16ES9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_10PreprocNOPEEEvRPKtRT0_RNS_12simd16uint16ES9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi0ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi0ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi0ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi1ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi1ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi1ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi2ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi2ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi2ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi3ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi3ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi3ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi4ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi4ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi4ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi5ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi5ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi5ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi6ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi6ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi6ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi7ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi7ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi7ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi8ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi8ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi8ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi9ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi9ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi9ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi10ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi10ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi10ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi11ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi11ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi11ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi12ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi12ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi12ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi3ENS0_15PreprocMinShiftILi13ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi2ENS0_15PreprocMinShiftILi13ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_113compute_accu2ILi1ENS0_15PreprocMinShiftILi13ELi8EEEEEvRPKtRT0_RNS_12simd16uint16ESA_
904
905
template <class Preproc>
906
0
simd16uint16 histogram_8(const uint16_t* data, Preproc pp, size_t n_in) {
907
0
    assert(n_in % 16 == 0);
908
0
    int n = n_in / 16;
909
910
0
    simd32uint8 a8lo(0);
911
0
    simd32uint8 a8hi(0);
912
913
0
    for (int i0 = 0; i0 < n; i0 += 15) {
914
0
        simd16uint16 a4lo(0); // 4-bit accus
915
0
        simd16uint16 a4hi(0);
916
917
0
        int i1 = std::min(i0 + 15, n);
918
0
        int i;
919
0
        for (i = i0; i + 2 < i1; i += 3) {
920
0
            compute_accu2<3>(data, pp, a4lo, a4hi); // adds 3 max
921
0
        }
922
0
        switch (i1 - i) {
923
0
            case 2:
924
0
                compute_accu2<2>(data, pp, a4lo, a4hi);
925
0
                break;
926
0
            case 1:
927
0
                compute_accu2<1>(data, pp, a4lo, a4hi);
928
0
                break;
929
0
        }
930
931
0
        a8lo += accu4to8(a4lo);
932
0
        a8hi += accu4to8(a4hi);
933
0
    }
934
935
    // move to 16-bit accu
936
0
    simd16uint16 a16lo = accu8to16(a8lo);
937
0
    simd16uint16 a16hi = accu8to16(a8hi);
938
939
0
    simd16uint16 a16 = hadd(a16lo, a16hi);
940
941
    // the 2 lanes must still be combined
942
0
    return a16;
943
0
}
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_10PreprocNOPEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi0ELi8EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi1ELi8EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi2ELi8EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi3ELi8EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi4ELi8EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi5ELi8EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi6ELi8EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi7ELi8EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi8ELi8EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi9ELi8EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi10ELi8EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi11ELi8EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi12ELi8EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_111histogram_8INS0_15PreprocMinShiftILi13ELi8EEEEENS_12simd16uint16EPKtT_m
944
945
/************************************************************
946
 * 16 bins
947
 ************************************************************/
948
949
static const simd32uint8 shifts2 = simd32uint8::create<
950
        1,
951
        2,
952
        4,
953
        8,
954
        16,
955
        32,
956
        64,
957
        128,
958
        1,
959
        2,
960
        4,
961
        8,
962
        16,
963
        32,
964
        64,
965
        128,
966
        1,
967
        2,
968
        4,
969
        8,
970
        16,
971
        32,
972
        64,
973
        128,
974
        1,
975
        2,
976
        4,
977
        8,
978
        16,
979
        32,
980
        64,
981
        128>();
982
983
0
simd32uint8 shiftr_16(simd32uint8 x, int n) {
984
0
    return simd32uint8(simd16uint16(x) >> n);
985
0
}
986
987
// 2-bit accumulator: we can add only up to 3 elements
988
// on output we return 2*4-bit results
989
template <int N, class Preproc>
990
void compute_accu2_16(
991
        const uint16_t*& data,
992
        Preproc pp,
993
        simd32uint8& a4_0,
994
        simd32uint8& a4_1,
995
        simd32uint8& a4_2,
996
0
        simd32uint8& a4_3) {
997
0
    simd32uint8 mask1(0x55);
998
0
    simd32uint8 a2_0; // 2-bit accu
999
0
    simd32uint8 a2_1; // 2-bit accu
1000
0
    a2_0.clear();
1001
0
    a2_1.clear();
1002
1003
0
    for (int j = 0; j < N; j++) {
1004
0
        simd16uint16 v(data);
1005
0
        data += 16;
1006
0
        v = pp(v);
1007
1008
0
        simd16uint16 idx = v | (v << 8);
1009
0
        simd32uint8 a1 = shifts2.lookup_2_lanes(simd32uint8(idx));
1010
        // contains 0s for out-of-bounds elements
1011
1012
0
        simd16uint16 lt8 = (v >> 3) == simd16uint16(0);
1013
0
        lt8 = lt8 ^ simd16uint16(0xff00);
1014
1015
0
        a1 = a1 & lt8;
1016
1017
0
        a2_0 += a1 & mask1;
1018
0
        a2_1 += shiftr_16(a1, 1) & mask1;
1019
0
    }
1020
0
    simd32uint8 mask2(0x33);
1021
1022
0
    a4_0 += a2_0 & mask2;
1023
0
    a4_1 += a2_1 & mask2;
1024
0
    a4_2 += shiftr_16(a2_0, 2) & mask2;
1025
0
    a4_3 += shiftr_16(a2_1, 2) & mask2;
1026
0
}
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_10PreprocNOPEEEvRPKtT0_RNS_11simd32uint8ES8_S8_S8_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_10PreprocNOPEEEvRPKtT0_RNS_11simd32uint8ES8_S8_S8_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_10PreprocNOPEEEvRPKtT0_RNS_11simd32uint8ES8_S8_S8_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi0ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi0ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi0ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi1ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi1ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi1ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi2ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi2ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi2ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi3ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi3ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi3ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi4ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi4ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi4ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi5ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi5ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi5ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi6ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi6ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi6ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi7ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi7ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi7ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi8ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi8ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi8ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi9ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi9ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi9ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi10ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi10ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi10ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi11ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi11ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi11ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi3ENS0_15PreprocMinShiftILi12ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi2ENS0_15PreprocMinShiftILi12ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_116compute_accu2_16ILi1ENS0_15PreprocMinShiftILi12ELi16EEEEEvRPKtT0_RNS_11simd32uint8ES9_S9_S9_
1027
1028
0
simd32uint8 accu4to8_2(simd32uint8 a4_0, simd32uint8 a4_1) {
1029
0
    simd32uint8 mask4(0x0f);
1030
1031
0
    simd16uint16 a8_0 = combine2x2(
1032
0
            (simd16uint16)(a4_0 & mask4),
1033
0
            (simd16uint16)(shiftr_16(a4_0, 4) & mask4));
1034
1035
0
    simd16uint16 a8_1 = combine2x2(
1036
0
            (simd16uint16)(a4_1 & mask4),
1037
0
            (simd16uint16)(shiftr_16(a4_1, 4) & mask4));
1038
1039
0
    return simd32uint8(hadd(a8_0, a8_1));
1040
0
}
1041
1042
template <class Preproc>
1043
0
simd16uint16 histogram_16(const uint16_t* data, Preproc pp, size_t n_in) {
1044
0
    assert(n_in % 16 == 0);
1045
0
    int n = n_in / 16;
1046
1047
0
    simd32uint8 a8lo((uint8_t)0);
1048
0
    simd32uint8 a8hi((uint8_t)0);
1049
1050
0
    for (int i0 = 0; i0 < n; i0 += 7) {
1051
0
        simd32uint8 a4_0(0); // 0, 4, 8, 12
1052
0
        simd32uint8 a4_1(0); // 1, 5, 9, 13
1053
0
        simd32uint8 a4_2(0); // 2, 6, 10, 14
1054
0
        simd32uint8 a4_3(0); // 3, 7, 11, 15
1055
1056
0
        int i1 = std::min(i0 + 7, n);
1057
0
        int i;
1058
0
        for (i = i0; i + 2 < i1; i += 3) {
1059
0
            compute_accu2_16<3>(data, pp, a4_0, a4_1, a4_2, a4_3);
1060
0
        }
1061
0
        switch (i1 - i) {
1062
0
            case 2:
1063
0
                compute_accu2_16<2>(data, pp, a4_0, a4_1, a4_2, a4_3);
1064
0
                break;
1065
0
            case 1:
1066
0
                compute_accu2_16<1>(data, pp, a4_0, a4_1, a4_2, a4_3);
1067
0
                break;
1068
0
        }
1069
1070
0
        a8lo += accu4to8_2(a4_0, a4_1);
1071
0
        a8hi += accu4to8_2(a4_2, a4_3);
1072
0
    }
1073
1074
    // move to 16-bit accu
1075
0
    simd16uint16 a16lo = accu8to16(a8lo);
1076
0
    simd16uint16 a16hi = accu8to16(a8hi);
1077
1078
0
    simd16uint16 a16 = hadd(a16lo, a16hi);
1079
1080
0
    a16 = simd16uint16{simd8uint32{a16}.unzip()};
1081
1082
0
    return a16;
1083
0
}
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_10PreprocNOPEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi0ELi16EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi1ELi16EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi2ELi16EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi3ELi16EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi4ELi16EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi5ELi16EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi6ELi16EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi7ELi16EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi8ELi16EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi9ELi16EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi10ELi16EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi11ELi16EEEEENS_12simd16uint16EPKtT_m
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_112histogram_16INS0_15PreprocMinShiftILi12ELi16EEEEENS_12simd16uint16EPKtT_m
1084
1085
struct PreprocNOP {
1086
0
    simd16uint16 operator()(simd16uint16 x) {
1087
0
        return x;
1088
0
    }
1089
};
1090
1091
template <int shift, int nbin>
1092
struct PreprocMinShift {
1093
    simd16uint16 min16;
1094
    simd16uint16 max16;
1095
1096
0
    explicit PreprocMinShift(uint16_t min) {
1097
0
        min16.set1(min);
1098
0
        int vmax0 = std::min((nbin << shift) + min, 65536);
1099
0
        uint16_t vmax = uint16_t(vmax0 - 1 - min);
1100
0
        max16.set1(vmax); // vmax inclusive
1101
0
    }
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi0ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi1ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi2ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi3ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi4ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi5ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi6ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi7ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi8ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi9ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi10ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi11ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi12ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi13ELi8EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi0ELi16EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi1ELi16EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi2ELi16EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi3ELi16EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi4ELi16EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi5ELi16EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi6ELi16EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi7ELi16EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi8ELi16EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi9ELi16EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi10ELi16EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi11ELi16EEC2Et
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi12ELi16EEC2Et
1102
1103
0
    simd16uint16 operator()(simd16uint16 x) {
1104
0
        x = x - min16;
1105
0
        simd16uint16 mask = (x == max(x, max16)) - (x == max16);
1106
0
        return (x >> shift) | mask;
1107
0
    }
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi0ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi1ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi2ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi3ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi4ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi5ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi6ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi7ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi8ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi9ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi10ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi11ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi12ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi13ELi8EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi0ELi16EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi1ELi16EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi2ELi16EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi3ELi16EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi4ELi16EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi5ELi16EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi6ELi16EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi7ELi16EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi8ELi16EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi9ELi16EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi10ELi16EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi11ELi16EEclENS_12simd16uint16E
Unexecuted instantiation: partitioning.cpp:_ZN5faiss12_GLOBAL__N_115PreprocMinShiftILi12ELi16EEclENS_12simd16uint16E
1108
};
1109
1110
/* unbounded versions of the functions */
1111
1112
0
void simd_histogram_8_unbounded(const uint16_t* data, int n, int* hist) {
1113
0
    PreprocNOP pp;
1114
0
    simd16uint16 a16 = histogram_8(data, pp, (n & ~15));
1115
1116
0
    ALIGNED(32) uint16_t a16_tab[16];
1117
0
    a16.store(a16_tab);
1118
1119
0
    for (int i = 0; i < 8; i++) {
1120
0
        hist[i] = a16_tab[i] + a16_tab[i + 8];
1121
0
    }
1122
1123
0
    for (int i = (n & ~15); i < n; i++) {
1124
0
        hist[data[i]]++;
1125
0
    }
1126
0
}
1127
1128
0
void simd_histogram_16_unbounded(const uint16_t* data, int n, int* hist) {
1129
0
    simd16uint16 a16 = histogram_16(data, PreprocNOP(), (n & ~15));
1130
1131
0
    ALIGNED(32) uint16_t a16_tab[16];
1132
0
    a16.store(a16_tab);
1133
1134
0
    for (int i = 0; i < 16; i++) {
1135
0
        hist[i] = a16_tab[i];
1136
0
    }
1137
1138
0
    for (int i = (n & ~15); i < n; i++) {
1139
0
        hist[data[i]]++;
1140
0
    }
1141
0
}
1142
1143
} // anonymous namespace
1144
1145
/************************************************************
1146
 * Driver routines
1147
 ************************************************************/
1148
1149
void simd_histogram_8(
1150
        const uint16_t* data,
1151
        int n,
1152
        uint16_t min,
1153
        int shift,
1154
0
        int* hist) {
1155
0
    if (shift < 0) {
1156
0
        simd_histogram_8_unbounded(data, n, hist);
1157
0
        return;
1158
0
    }
1159
1160
0
    simd16uint16 a16;
1161
1162
0
#define DISPATCH(s)                                                     \
1163
0
    case s:                                                             \
1164
0
        a16 = histogram_8(data, PreprocMinShift<s, 8>(min), (n & ~15)); \
1165
0
        break
1166
1167
0
    switch (shift) {
1168
0
        DISPATCH(0);
1169
0
        DISPATCH(1);
1170
0
        DISPATCH(2);
1171
0
        DISPATCH(3);
1172
0
        DISPATCH(4);
1173
0
        DISPATCH(5);
1174
0
        DISPATCH(6);
1175
0
        DISPATCH(7);
1176
0
        DISPATCH(8);
1177
0
        DISPATCH(9);
1178
0
        DISPATCH(10);
1179
0
        DISPATCH(11);
1180
0
        DISPATCH(12);
1181
0
        DISPATCH(13);
1182
0
        default:
1183
0
            FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift);
1184
0
    }
1185
0
#undef DISPATCH
1186
1187
0
    ALIGNED(32) uint16_t a16_tab[16];
1188
0
    a16.store(a16_tab);
1189
1190
0
    for (int i = 0; i < 8; i++) {
1191
0
        hist[i] = a16_tab[i] + a16_tab[i + 8];
1192
0
    }
1193
1194
    // complete with remaining bins
1195
0
    for (int i = (n & ~15); i < n; i++) {
1196
0
        if (data[i] < min)
1197
0
            continue;
1198
0
        uint16_t v = data[i] - min;
1199
0
        v >>= shift;
1200
0
        if (v < 8)
1201
0
            hist[v]++;
1202
0
    }
1203
0
}
1204
1205
void simd_histogram_16(
1206
        const uint16_t* data,
1207
        int n,
1208
        uint16_t min,
1209
        int shift,
1210
0
        int* hist) {
1211
0
    if (shift < 0) {
1212
0
        simd_histogram_16_unbounded(data, n, hist);
1213
0
        return;
1214
0
    }
1215
1216
0
    simd16uint16 a16;
1217
1218
0
#define DISPATCH(s)                                                       \
1219
0
    case s:                                                               \
1220
0
        a16 = histogram_16(data, PreprocMinShift<s, 16>(min), (n & ~15)); \
1221
0
        break
1222
1223
0
    switch (shift) {
1224
0
        DISPATCH(0);
1225
0
        DISPATCH(1);
1226
0
        DISPATCH(2);
1227
0
        DISPATCH(3);
1228
0
        DISPATCH(4);
1229
0
        DISPATCH(5);
1230
0
        DISPATCH(6);
1231
0
        DISPATCH(7);
1232
0
        DISPATCH(8);
1233
0
        DISPATCH(9);
1234
0
        DISPATCH(10);
1235
0
        DISPATCH(11);
1236
0
        DISPATCH(12);
1237
0
        default:
1238
0
            FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift);
1239
0
    }
1240
0
#undef DISPATCH
1241
1242
0
    ALIGNED(32) uint16_t a16_tab[16];
1243
0
    a16.store(a16_tab);
1244
1245
0
    for (int i = 0; i < 16; i++) {
1246
0
        hist[i] = a16_tab[i];
1247
0
    }
1248
1249
0
    for (int i = (n & ~15); i < n; i++) {
1250
0
        if (data[i] < min)
1251
0
            continue;
1252
0
        uint16_t v = data[i] - min;
1253
0
        v >>= shift;
1254
0
        if (v < 16)
1255
0
            hist[v]++;
1256
0
    }
1257
0
}
1258
1259
// no AVX2
1260
#else
1261
1262
void simd_histogram_16(
1263
        const uint16_t* data,
1264
        int n,
1265
        uint16_t min,
1266
        int shift,
1267
        int* hist) {
1268
    memset(hist, 0, sizeof(*hist) * 16);
1269
    if (shift < 0) {
1270
        for (size_t i = 0; i < n; i++) {
1271
            hist[data[i]]++;
1272
        }
1273
    } else {
1274
        int vmax0 = std::min((16 << shift) + min, 65536);
1275
        uint16_t vmax = uint16_t(vmax0 - 1 - min);
1276
1277
        for (size_t i = 0; i < n; i++) {
1278
            uint16_t v = data[i];
1279
            v -= min;
1280
            if (!(v <= vmax))
1281
                continue;
1282
            v >>= shift;
1283
            hist[v]++;
1284
1285
            /*
1286
            if (data[i] < min) continue;
1287
            uint16_t v = data[i] - min;
1288
            v >>= shift;
1289
            if (v < 16) hist[v]++;
1290
            */
1291
        }
1292
    }
1293
}
1294
1295
void simd_histogram_8(
1296
        const uint16_t* data,
1297
        int n,
1298
        uint16_t min,
1299
        int shift,
1300
        int* hist) {
1301
    memset(hist, 0, sizeof(*hist) * 8);
1302
    if (shift < 0) {
1303
        for (size_t i = 0; i < n; i++) {
1304
            hist[data[i]]++;
1305
        }
1306
    } else {
1307
        for (size_t i = 0; i < n; i++) {
1308
            if (data[i] < min)
1309
                continue;
1310
            uint16_t v = data[i] - min;
1311
            v >>= shift;
1312
            if (v < 8)
1313
                hist[v]++;
1314
        }
1315
    }
1316
}
1317
1318
#endif
1319
1320
1
void PartitionStats::reset() {
1321
1
    memset(this, 0, sizeof(*this));
1322
1
}
1323
1324
PartitionStats partition_stats;
1325
1326
} // namespace faiss