Coverage Report

Created: 2026-03-19 14:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
contrib/faiss/faiss/utils/distances_fused/simdlib_based.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) Meta Platforms, Inc. and affiliates.
3
 *
4
 * This source code is licensed under the MIT license found in the
5
 * LICENSE file in the root directory of this source tree.
6
 */
7
8
// -*- c++ -*-
9
10
#include <faiss/utils/distances_fused/simdlib_based.h>
11
12
#if defined(__AVX2__) || defined(__aarch64__)
13
14
#include <faiss/utils/simdlib.h>
15
16
#if defined(__AVX2__)
17
#include <immintrin.h>
18
#endif
19
20
namespace faiss {
21
22
namespace {
23
24
// It makes sense to like to overload certain cases because the further
25
// kernels are in need of registers. So, let's tell compiler
26
// not to waste registers on a bit faster code, if needed.
27
template <size_t DIM>
28
0
float l2_sqr(const float* const x) {
29
    // compiler should be smart enough to handle that
30
0
    float output = x[0] * x[0];
31
0
    for (size_t i = 1; i < DIM; i++) {
32
0
        output += x[i] * x[i];
33
0
    }
34
35
0
    return output;
36
0
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm1EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm2EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm3EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm4EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm5EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm6EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm7EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm8EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm9EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm10EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm11EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm12EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm13EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm14EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm15EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm16EEEfPKf
37
38
template <size_t DIM>
39
float dot_product(
40
        const float* const __restrict x,
41
0
        const float* const __restrict y) {
42
    // compiler should be smart enough to handle that
43
0
    float output = x[0] * y[0];
44
0
    for (size_t i = 1; i < DIM; i++) {
45
0
        output += x[i] * y[i];
46
0
    }
47
48
0
    return output;
49
0
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm1EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm2EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm3EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm4EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm5EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm6EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm7EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm8EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm9EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm10EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm11EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm12EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm13EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm14EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm15EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm16EEEfPKfS3_
50
51
// The kernel for low dimensionality vectors.
52
// Finds the closest one from y for every given NX_POINTS_PER_LOOP points from x
53
//
54
// DIM is the dimensionality of the data
55
// NX_POINTS_PER_LOOP is the number of x points that get processed
56
//   simultaneously.
57
// NY_POINTS_PER_LOOP is the number of y points that get processed
58
//   simultaneously.
59
template <size_t DIM, size_t NX_POINTS_PER_LOOP, size_t NY_POINTS_PER_LOOP>
60
void kernel(
61
        const float* const __restrict x,
62
        const float* const __restrict y,
63
        const float* const __restrict y_transposed,
64
        const size_t ny,
65
        Top1BlockResultHandler<CMax<float, int64_t>>& res,
66
        const float* __restrict y_norms,
67
0
        const size_t i) {
68
0
    const size_t ny_p =
69
0
            (ny / (8 * NY_POINTS_PER_LOOP)) * (8 * NY_POINTS_PER_LOOP);
70
71
    // compute
72
0
    const float* const __restrict xd_0 = x + i * DIM;
73
74
    // prefetch the next point
75
0
#if defined(__AVX2__)
76
0
    _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
77
0
#endif
78
79
    // load a single point from x
80
    // load -2 * value
81
0
    simd8float32 x_i[NX_POINTS_PER_LOOP][DIM];
82
0
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
83
0
        for (size_t dd = 0; dd < DIM; dd++) {
84
0
            x_i[nx_k][dd] = simd8float32(-2 * *(xd_0 + nx_k * DIM + dd));
85
0
        }
86
0
    }
87
88
    // compute x_norm
89
0
    float x_norm_i[NX_POINTS_PER_LOOP];
90
0
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
91
0
        x_norm_i[nx_k] = l2_sqr<DIM>(xd_0 + nx_k * DIM);
92
0
    }
93
94
    // distances and indices
95
0
    simd8float32 min_distances_i[NX_POINTS_PER_LOOP];
96
0
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
97
0
        min_distances_i[nx_k] =
98
0
                simd8float32(res.dis_tab[i + nx_k] - x_norm_i[nx_k]);
99
0
    }
100
101
0
    simd8uint32 min_indices_i[NX_POINTS_PER_LOOP];
102
0
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
103
0
        min_indices_i[nx_k] = simd8uint32((uint32_t)0);
104
0
    }
105
106
    //
107
0
    simd8uint32 current_indices = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7);
108
0
    const simd8uint32 indices_delta = simd8uint32(8);
109
110
    // main loop
111
0
    size_t j = 0;
112
0
    for (; j < ny_p; j += NY_POINTS_PER_LOOP * 8) {
113
        // compute dot products for NX_POINTS from x and NY_POINTS from y
114
        // technically, we're multiplying -2x and y
115
0
        simd8float32 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP];
116
117
        // DIM 0 that uses MUL
118
0
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
119
0
            simd8float32 y_i =
120
0
                    simd8float32(y_transposed + j + ny_k * 8 + ny * 0);
121
0
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
122
0
                dp_i[nx_k][ny_k] = x_i[nx_k][0] * y_i;
123
0
            }
124
0
        }
125
126
        // other DIMs that use FMA
127
0
        for (size_t dd = 1; dd < DIM; dd++) {
128
0
            for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
129
0
                simd8float32 y_i =
130
0
                        simd8float32(y_transposed + j + ny_k * 8 + ny * dd);
131
132
0
                for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
133
0
                    dp_i[nx_k][ny_k] =
134
0
                            fmadd(x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]);
135
0
                }
136
0
            }
137
0
        }
138
139
        // compute y^2 + (-2x,y)
140
0
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
141
0
            simd8float32 y_l2_sqr = simd8float32(y_norms + j + ny_k * 8);
142
143
0
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
144
0
                dp_i[nx_k][ny_k] = dp_i[nx_k][ny_k] + y_l2_sqr;
145
0
            }
146
0
        }
147
148
        // do the comparisons and alter the min indices
149
0
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
150
0
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
151
                // cmpps
152
0
                cmplt_and_blend_inplace(
153
0
                        dp_i[nx_k][ny_k],
154
0
                        current_indices,
155
0
                        min_distances_i[nx_k],
156
0
                        min_indices_i[nx_k]);
157
0
            }
158
159
0
            current_indices = current_indices + indices_delta;
160
0
        }
161
0
    }
162
163
    // dump values and find the minimum distance / minimum index
164
0
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
165
0
        float min_distances_scalar[8];
166
0
        uint32_t min_indices_scalar[8];
167
168
0
        min_distances_i[nx_k].storeu(min_distances_scalar);
169
0
        min_indices_i[nx_k].storeu(min_indices_scalar);
170
171
0
        float current_min_distance = res.dis_tab[i + nx_k];
172
0
        uint32_t current_min_index = res.ids_tab[i + nx_k];
173
174
        // This unusual comparison is needed to maintain the behavior
175
        // of the original implementation: if two indices are
176
        // represented with equal distance values, then
177
        // the index with the min value is returned.
178
0
        for (size_t jv = 0; jv < 8; jv++) {
179
            // add missing x_norms[i]
180
0
            float distance_candidate =
181
0
                    min_distances_scalar[jv] + x_norm_i[nx_k];
182
183
            // negative values can occur for identical vectors
184
            //    due to roundoff errors.
185
0
            if (distance_candidate < 0) {
186
0
                distance_candidate = 0;
187
0
            }
188
189
0
            const int64_t index_candidate = min_indices_scalar[jv];
190
191
0
            if (current_min_distance > distance_candidate) {
192
0
                current_min_distance = distance_candidate;
193
0
                current_min_index = index_candidate;
194
0
            } else if (
195
0
                    current_min_distance == distance_candidate &&
196
0
                    current_min_index > index_candidate) {
197
0
                current_min_index = index_candidate;
198
0
            }
199
0
        }
200
201
        // process leftovers
202
0
        for (size_t j0 = j; j0 < ny; j0++) {
203
0
            const float dp =
204
0
                    dot_product<DIM>(x + (i + nx_k) * DIM, y + j0 * DIM);
205
0
            float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp;
206
            // negative values can occur for identical vectors
207
            //    due to roundoff errors.
208
0
            if (dis < 0) {
209
0
                dis = 0;
210
0
            }
211
212
0
            if (current_min_distance > dis) {
213
0
                current_min_distance = dis;
214
0
                current_min_index = j0;
215
0
            }
216
0
        }
217
218
        // done
219
0
        res.add_result(i + nx_k, current_min_distance, current_min_index);
220
0
    }
221
0
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm1ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm1ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm2ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm2ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm3ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm3ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm4ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm4ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm5ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm5ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm6ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm6ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm7ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm7ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm8ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm8ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm9ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm9ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm10ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm10ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm11ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm11ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm12ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm12ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm13ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm13ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm14ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm14ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm15ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm15ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm16ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm16ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
222
223
template <size_t DIM, size_t NX_POINTS_PER_LOOP, size_t NY_POINTS_PER_LOOP>
224
void exhaustive_L2sqr_fused_cmax(
225
        const float* const __restrict x,
226
        const float* const __restrict y,
227
        size_t nx,
228
        size_t ny,
229
        Top1BlockResultHandler<CMax<float, int64_t>>& res,
230
0
        const float* __restrict y_norms) {
231
    // BLAS does not like empty matrices
232
0
    if (nx == 0 || ny == 0) {
233
0
        return;
234
0
    }
235
236
    // compute norms for y
237
0
    std::unique_ptr<float[]> del2;
238
0
    if (!y_norms) {
239
0
        float* y_norms2 = new float[ny];
240
0
        del2.reset(y_norms2);
241
242
0
        for (size_t i = 0; i < ny; i++) {
243
0
            y_norms2[i] = l2_sqr<DIM>(y + i * DIM);
244
0
        }
245
246
0
        y_norms = y_norms2;
247
0
    }
248
249
    // initialize res
250
0
    res.begin_multiple(0, nx);
251
252
    // transpose y
253
0
    std::vector<float> y_transposed(DIM * ny);
254
0
    for (size_t j = 0; j < DIM; j++) {
255
0
        for (size_t i = 0; i < ny; i++) {
256
0
            y_transposed[j * ny + i] = y[j + i * DIM];
257
0
        }
258
0
    }
259
260
0
    const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP;
261
    // the main loop.
262
0
#pragma omp parallel for schedule(dynamic)
263
0
    for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) {
264
0
        kernel<DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP>(
265
0
                x, y, y_transposed.data(), ny, res, y_norms, i);
266
0
    }
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm1ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm2ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm3ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm4ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm5ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm6ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm7ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm8ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm9ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm10ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm11ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm12ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm13ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm14ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm15ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm16ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
267
268
0
    for (size_t i = nx_p; i < nx; i++) {
269
0
        kernel<DIM, 1, NY_POINTS_PER_LOOP>(
270
0
                x, y, y_transposed.data(), ny, res, y_norms, i);
271
0
    }
272
273
    // Does nothing for Top1BlockResultHandler, but
274
    // keeping the call for the consistency.
275
0
    res.end_multiple();
276
0
    InterruptCallback::check();
277
0
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm1ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm2ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm3ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm4ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm5ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm6ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm7ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm8ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm9ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm10ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm11ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm12ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm13ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm14ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm15ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm16ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
278
279
} // namespace
280
281
bool exhaustive_L2sqr_fused_cmax_simdlib(
282
        const float* x,
283
        const float* y,
284
        size_t d,
285
        size_t nx,
286
        size_t ny,
287
        Top1BlockResultHandler<CMax<float, int64_t>>& res,
288
157
        const float* y_norms) {
289
    // Process only cases with certain dimensionalities.
290
    // An acceptable dimensionality value is limited by the number of
291
    // available registers.
292
293
157
#define DISPATCH(DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP)    \
294
157
    case DIM: {                                                  \
295
0
        exhaustive_L2sqr_fused_cmax<                             \
296
0
                DIM,                                             \
297
0
                NX_POINTS_PER_LOOP,                              \
298
0
                NY_POINTS_PER_LOOP>(x, y, nx, ny, res, y_norms); \
299
0
        return true;                                             \
300
0
    }
301
302
    // faiss/benchs/bench_quantizer.py was used for benchmarking
303
    // and tuning 2nd and 3rd parameters values.
304
    // Basically, the larger the values for 2nd and 3rd parameters are,
305
    // the faster the execution is, but the more SIMD registers are needed.
306
    // This can be compensated with L1 cache, this is why this
307
    // code might operate with more registers than available
308
    // because of concurrent ports operations for ALU and LOAD/STORE.
309
310
157
#if defined(__AVX2__)
311
    // It was possible to tweak these parameters on x64 machine.
312
157
    switch (d) {
313
0
        DISPATCH(1, 6, 1)
314
0
        DISPATCH(2, 6, 1)
315
0
        DISPATCH(3, 6, 1)
316
0
        DISPATCH(4, 8, 1)
317
0
        DISPATCH(5, 8, 1)
318
0
        DISPATCH(6, 8, 1)
319
0
        DISPATCH(7, 8, 1)
320
0
        DISPATCH(8, 8, 1)
321
0
        DISPATCH(9, 8, 1)
322
0
        DISPATCH(10, 8, 1)
323
0
        DISPATCH(11, 8, 1)
324
0
        DISPATCH(12, 8, 1)
325
0
        DISPATCH(13, 6, 1)
326
0
        DISPATCH(14, 6, 1)
327
0
        DISPATCH(15, 6, 1)
328
157
        DISPATCH(16, 6, 1)
329
157
    }
330
#else
331
    // Please feel free to alter 2nd and 3rd parameters if you have access
332
    // to ARM-based machine so that you are able to benchmark this code.
333
    // Or to enable other dimensions.
334
    switch (d) {
335
        DISPATCH(1, 4, 2)
336
        DISPATCH(2, 2, 2)
337
        DISPATCH(3, 2, 2)
338
        DISPATCH(4, 2, 1)
339
        DISPATCH(5, 1, 1)
340
        DISPATCH(6, 1, 1)
341
        DISPATCH(7, 1, 1)
342
        DISPATCH(8, 1, 1)
343
    }
344
#endif
345
346
157
    return false;
347
157
#undef DISPATCH
348
157
}
349
350
} // namespace faiss
351
352
#endif