Coverage Report

Created: 2026-05-14 22:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
contrib/faiss/faiss/utils/distances_fused/simdlib_based.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) Meta Platforms, Inc. and affiliates.
3
 *
4
 * This source code is licensed under the MIT license found in the
5
 * LICENSE file in the root directory of this source tree.
6
 */
7
8
// -*- c++ -*-
9
10
#include <faiss/utils/distances_fused/simdlib_based.h>
11
12
#if defined(__AVX2__) || defined(__aarch64__)
13
14
#include <faiss/utils/simdlib.h>
15
16
#if defined(__AVX2__)
17
#include <immintrin.h>
18
#endif
19
20
namespace faiss {
21
22
namespace {
23
24
// It makes sense to like to overload certain cases because the further
25
// kernels are in need of registers. So, let's tell compiler
26
// not to waste registers on a bit faster code, if needed.
27
template <size_t DIM>
28
102M
float l2_sqr(const float* const x) {
29
    // compiler should be smart enough to handle that
30
102M
    float output = x[0] * x[0];
31
816M
    for (size_t i = 1; i < DIM; i++) {
32
714M
        output += x[i] * x[i];
33
714M
    }
34
35
102M
    return output;
36
102M
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm1EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm2EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm3EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm4EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm5EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm6EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm7EEEfPKf
simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm8EEEfPKf
Line
Count
Source
28
102M
float l2_sqr(const float* const x) {
29
    // compiler should be smart enough to handle that
30
102M
    float output = x[0] * x[0];
31
816M
    for (size_t i = 1; i < DIM; i++) {
32
714M
        output += x[i] * x[i];
33
714M
    }
34
35
102M
    return output;
36
102M
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm9EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm10EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm11EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm12EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm13EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm14EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm15EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm16EEEfPKf
37
38
template <size_t DIM>
39
float dot_product(
40
        const float* const __restrict x,
41
0
        const float* const __restrict y) {
42
    // compiler should be smart enough to handle that
43
0
    float output = x[0] * y[0];
44
0
    for (size_t i = 1; i < DIM; i++) {
45
0
        output += x[i] * y[i];
46
0
    }
47
48
0
    return output;
49
0
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm1EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm2EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm3EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm4EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm5EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm6EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm7EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm8EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm9EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm10EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm11EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm12EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm13EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm14EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm15EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm16EEEfPKfS3_
50
51
// The kernel for low dimensionality vectors.
52
// Finds the closest one from y for every given NX_POINTS_PER_LOOP points from x
53
//
54
// DIM is the dimensionality of the data
55
// NX_POINTS_PER_LOOP is the number of x points that get processed
56
//   simultaneously.
57
// NY_POINTS_PER_LOOP is the number of y points that get processed
58
//   simultaneously.
59
template <size_t DIM, size_t NX_POINTS_PER_LOOP, size_t NY_POINTS_PER_LOOP>
60
void kernel(
61
        const float* const __restrict x,
62
        const float* const __restrict y,
63
        const float* const __restrict y_transposed,
64
        const size_t ny,
65
        Top1BlockResultHandler<CMax<float, int64_t>>& res,
66
        const float* __restrict y_norms,
67
12.6M
        const size_t i) {
68
12.6M
    const size_t ny_p =
69
12.6M
            (ny / (8 * NY_POINTS_PER_LOOP)) * (8 * NY_POINTS_PER_LOOP);
70
71
    // compute
72
12.6M
    const float* const __restrict xd_0 = x + i * DIM;
73
74
    // prefetch the next point
75
12.6M
#if defined(__AVX2__)
76
12.6M
    _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
77
12.6M
#endif
78
79
    // load a single point from x
80
    // load -2 * value
81
12.6M
    simd8float32 x_i[NX_POINTS_PER_LOOP][DIM];
82
113M
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
83
883M
        for (size_t dd = 0; dd < DIM; dd++) {
84
782M
            x_i[nx_k][dd] = simd8float32(-2 * *(xd_0 + nx_k * DIM + dd));
85
782M
        }
86
100M
    }
87
88
    // compute x_norm
89
12.6M
    float x_norm_i[NX_POINTS_PER_LOOP];
90
113M
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
91
101M
        x_norm_i[nx_k] = l2_sqr<DIM>(xd_0 + nx_k * DIM);
92
101M
    }
93
94
    // distances and indices
95
12.6M
    simd8float32 min_distances_i[NX_POINTS_PER_LOOP];
96
113M
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
97
101M
        min_distances_i[nx_k] =
98
101M
                simd8float32(res.dis_tab[i + nx_k] - x_norm_i[nx_k]);
99
101M
    }
100
101
12.6M
    simd8uint32 min_indices_i[NX_POINTS_PER_LOOP];
102
113M
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
103
101M
        min_indices_i[nx_k] = simd8uint32((uint32_t)0);
104
101M
    }
105
106
    //
107
12.6M
    simd8uint32 current_indices = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7);
108
12.6M
    const simd8uint32 indices_delta = simd8uint32(8);
109
110
    // main loop
111
12.6M
    size_t j = 0;
112
413M
    for (; j < ny_p; j += NY_POINTS_PER_LOOP * 8) {
113
        // compute dot products for NX_POINTS from x and NY_POINTS from y
114
        // technically, we're multiplying -2x and y
115
401M
        simd8float32 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP];
116
117
        // DIM 0 that uses MUL
118
795M
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
119
394M
            simd8float32 y_i =
120
394M
                    simd8float32(y_transposed + j + ny_k * 8 + ny * 0);
121
3.34G
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
122
2.95G
                dp_i[nx_k][ny_k] = x_i[nx_k][0] * y_i;
123
2.95G
            }
124
394M
        }
125
126
        // other DIMs that use FMA
127
2.86G
        for (size_t dd = 1; dd < DIM; dd++) {
128
4.92G
            for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
129
2.46G
                simd8float32 y_i =
130
2.46G
                        simd8float32(y_transposed + j + ny_k * 8 + ny * dd);
131
132
13.1G
                for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
133
10.6G
                    dp_i[nx_k][ny_k] =
134
10.6G
                            fmadd(x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]);
135
10.6G
                }
136
2.46G
            }
137
2.45G
        }
138
139
        // compute y^2 + (-2x,y)
140
798M
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
141
397M
            simd8float32 y_l2_sqr = simd8float32(y_norms + j + ny_k * 8);
142
143
3.34G
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
144
2.94G
                dp_i[nx_k][ny_k] = dp_i[nx_k][ny_k] + y_l2_sqr;
145
2.94G
            }
146
397M
        }
147
148
        // do the comparisons and alter the min indices
149
800M
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
150
3.53G
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
151
                // cmpps
152
3.13G
                cmplt_and_blend_inplace(
153
3.13G
                        dp_i[nx_k][ny_k],
154
3.13G
                        current_indices,
155
3.13G
                        min_distances_i[nx_k],
156
3.13G
                        min_indices_i[nx_k]);
157
3.13G
            }
158
159
399M
            current_indices = current_indices + indices_delta;
160
399M
        }
161
401M
    }
162
163
    // dump values and find the minimum distance / minimum index
164
114M
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
165
101M
        float min_distances_scalar[8];
166
101M
        uint32_t min_indices_scalar[8];
167
168
101M
        min_distances_i[nx_k].storeu(min_distances_scalar);
169
101M
        min_indices_i[nx_k].storeu(min_indices_scalar);
170
171
101M
        float current_min_distance = res.dis_tab[i + nx_k];
172
101M
        uint32_t current_min_index = res.ids_tab[i + nx_k];
173
174
        // This unusual comparison is needed to maintain the behavior
175
        // of the original implementation: if two indices are
176
        // represented with equal distance values, then
177
        // the index with the min value is returned.
178
911M
        for (size_t jv = 0; jv < 8; jv++) {
179
            // add missing x_norms[i]
180
810M
            float distance_candidate =
181
810M
                    min_distances_scalar[jv] + x_norm_i[nx_k];
182
183
            // negative values can occur for identical vectors
184
            //    due to roundoff errors.
185
810M
            if (distance_candidate < 0) {
186
2.25M
                distance_candidate = 0;
187
2.25M
            }
188
189
810M
            const int64_t index_candidate = min_indices_scalar[jv];
190
191
810M
            if (current_min_distance > distance_candidate) {
192
267M
                current_min_distance = distance_candidate;
193
267M
                current_min_index = index_candidate;
194
542M
            } else if (
195
542M
                    current_min_distance == distance_candidate &&
196
542M
                    current_min_index > index_candidate) {
197
74.3k
                current_min_index = index_candidate;
198
74.3k
            }
199
810M
        }
200
201
        // process leftovers
202
101M
        for (size_t j0 = j; j0 < ny; j0++) {
203
0
            const float dp =
204
0
                    dot_product<DIM>(x + (i + nx_k) * DIM, y + j0 * DIM);
205
0
            float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp;
206
            // negative values can occur for identical vectors
207
            //    due to roundoff errors.
208
0
            if (dis < 0) {
209
0
                dis = 0;
210
0
            }
211
212
0
            if (current_min_distance > dis) {
213
0
                current_min_distance = dis;
214
0
                current_min_index = j0;
215
0
            }
216
0
        }
217
218
        // done
219
101M
        res.add_result(i + nx_k, current_min_distance, current_min_index);
220
101M
    }
221
12.6M
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm1ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm1ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm2ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm2ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm3ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm3ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm4ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm4ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm5ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm5ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm6ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm6ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm7ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm7ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm8ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Line
Count
Source
67
12.6M
        const size_t i) {
68
12.6M
    const size_t ny_p =
69
12.6M
            (ny / (8 * NY_POINTS_PER_LOOP)) * (8 * NY_POINTS_PER_LOOP);
70
71
    // compute
72
12.6M
    const float* const __restrict xd_0 = x + i * DIM;
73
74
    // prefetch the next point
75
12.6M
#if defined(__AVX2__)
76
12.6M
    _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
77
12.6M
#endif
78
79
    // load a single point from x
80
    // load -2 * value
81
12.6M
    simd8float32 x_i[NX_POINTS_PER_LOOP][DIM];
82
113M
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
83
883M
        for (size_t dd = 0; dd < DIM; dd++) {
84
782M
            x_i[nx_k][dd] = simd8float32(-2 * *(xd_0 + nx_k * DIM + dd));
85
782M
        }
86
100M
    }
87
88
    // compute x_norm
89
12.6M
    float x_norm_i[NX_POINTS_PER_LOOP];
90
113M
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
91
101M
        x_norm_i[nx_k] = l2_sqr<DIM>(xd_0 + nx_k * DIM);
92
101M
    }
93
94
    // distances and indices
95
12.6M
    simd8float32 min_distances_i[NX_POINTS_PER_LOOP];
96
113M
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
97
101M
        min_distances_i[nx_k] =
98
101M
                simd8float32(res.dis_tab[i + nx_k] - x_norm_i[nx_k]);
99
101M
    }
100
101
12.6M
    simd8uint32 min_indices_i[NX_POINTS_PER_LOOP];
102
113M
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
103
101M
        min_indices_i[nx_k] = simd8uint32((uint32_t)0);
104
101M
    }
105
106
    //
107
12.6M
    simd8uint32 current_indices = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7);
108
12.6M
    const simd8uint32 indices_delta = simd8uint32(8);
109
110
    // main loop
111
12.6M
    size_t j = 0;
112
413M
    for (; j < ny_p; j += NY_POINTS_PER_LOOP * 8) {
113
        // compute dot products for NX_POINTS from x and NY_POINTS from y
114
        // technically, we're multiplying -2x and y
115
401M
        simd8float32 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP];
116
117
        // DIM 0 that uses MUL
118
795M
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
119
394M
            simd8float32 y_i =
120
394M
                    simd8float32(y_transposed + j + ny_k * 8 + ny * 0);
121
3.34G
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
122
2.95G
                dp_i[nx_k][ny_k] = x_i[nx_k][0] * y_i;
123
2.95G
            }
124
394M
        }
125
126
        // other DIMs that use FMA
127
2.86G
        for (size_t dd = 1; dd < DIM; dd++) {
128
4.92G
            for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
129
2.46G
                simd8float32 y_i =
130
2.46G
                        simd8float32(y_transposed + j + ny_k * 8 + ny * dd);
131
132
13.1G
                for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
133
10.6G
                    dp_i[nx_k][ny_k] =
134
10.6G
                            fmadd(x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]);
135
10.6G
                }
136
2.46G
            }
137
2.45G
        }
138
139
        // compute y^2 + (-2x,y)
140
798M
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
141
397M
            simd8float32 y_l2_sqr = simd8float32(y_norms + j + ny_k * 8);
142
143
3.34G
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
144
2.94G
                dp_i[nx_k][ny_k] = dp_i[nx_k][ny_k] + y_l2_sqr;
145
2.94G
            }
146
397M
        }
147
148
        // do the comparisons and alter the min indices
149
800M
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
150
3.53G
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
151
                // cmpps
152
3.13G
                cmplt_and_blend_inplace(
153
3.13G
                        dp_i[nx_k][ny_k],
154
3.13G
                        current_indices,
155
3.13G
                        min_distances_i[nx_k],
156
3.13G
                        min_indices_i[nx_k]);
157
3.13G
            }
158
159
399M
            current_indices = current_indices + indices_delta;
160
399M
        }
161
401M
    }
162
163
    // dump values and find the minimum distance / minimum index
164
114M
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
165
101M
        float min_distances_scalar[8];
166
101M
        uint32_t min_indices_scalar[8];
167
168
101M
        min_distances_i[nx_k].storeu(min_distances_scalar);
169
101M
        min_indices_i[nx_k].storeu(min_indices_scalar);
170
171
101M
        float current_min_distance = res.dis_tab[i + nx_k];
172
101M
        uint32_t current_min_index = res.ids_tab[i + nx_k];
173
174
        // This unusual comparison is needed to maintain the behavior
175
        // of the original implementation: if two indices are
176
        // represented with equal distance values, then
177
        // the index with the min value is returned.
178
911M
        for (size_t jv = 0; jv < 8; jv++) {
179
            // add missing x_norms[i]
180
810M
            float distance_candidate =
181
810M
                    min_distances_scalar[jv] + x_norm_i[nx_k];
182
183
            // negative values can occur for identical vectors
184
            //    due to roundoff errors.
185
810M
            if (distance_candidate < 0) {
186
2.25M
                distance_candidate = 0;
187
2.25M
            }
188
189
810M
            const int64_t index_candidate = min_indices_scalar[jv];
190
191
810M
            if (current_min_distance > distance_candidate) {
192
267M
                current_min_distance = distance_candidate;
193
267M
                current_min_index = index_candidate;
194
542M
            } else if (
195
542M
                    current_min_distance == distance_candidate &&
196
542M
                    current_min_index > index_candidate) {
197
74.3k
                current_min_index = index_candidate;
198
74.3k
            }
199
810M
        }
200
201
        // process leftovers
202
101M
        for (size_t j0 = j; j0 < ny; j0++) {
203
0
            const float dp =
204
0
                    dot_product<DIM>(x + (i + nx_k) * DIM, y + j0 * DIM);
205
0
            float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp;
206
            // negative values can occur for identical vectors
207
            //    due to roundoff errors.
208
0
            if (dis < 0) {
209
0
                dis = 0;
210
0
            }
211
212
0
            if (current_min_distance > dis) {
213
0
                current_min_distance = dis;
214
0
                current_min_index = j0;
215
0
            }
216
0
        }
217
218
        // done
219
101M
        res.add_result(i + nx_k, current_min_distance, current_min_index);
220
101M
    }
221
12.6M
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm8ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm9ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm9ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm10ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm10ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm11ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm11ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm12ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm12ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm13ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm13ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm14ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm14ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm15ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm15ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm16ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm16ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
222
223
template <size_t DIM, size_t NX_POINTS_PER_LOOP, size_t NY_POINTS_PER_LOOP>
224
void exhaustive_L2sqr_fused_cmax(
225
        const float* const __restrict x,
226
        const float* const __restrict y,
227
        size_t nx,
228
        size_t ny,
229
        Top1BlockResultHandler<CMax<float, int64_t>>& res,
230
3.40k
        const float* __restrict y_norms) {
231
    // BLAS does not like empty matrices
232
3.40k
    if (nx == 0 || ny == 0) {
233
0
        return;
234
0
    }
235
236
    // compute norms for y
237
3.40k
    std::unique_ptr<float[]> del2;
238
3.40k
    if (!y_norms) {
239
3.40k
        float* y_norms2 = new float[ny];
240
3.40k
        del2.reset(y_norms2);
241
242
825k
        for (size_t i = 0; i < ny; i++) {
243
822k
            y_norms2[i] = l2_sqr<DIM>(y + i * DIM);
244
822k
        }
245
246
3.40k
        y_norms = y_norms2;
247
3.40k
    }
248
249
    // initialize res
250
3.40k
    res.begin_multiple(0, nx);
251
252
    // transpose y
253
3.40k
    std::vector<float> y_transposed(DIM * ny);
254
30.6k
    for (size_t j = 0; j < DIM; j++) {
255
6.60M
        for (size_t i = 0; i < ny; i++) {
256
6.57M
            y_transposed[j * ny + i] = y[j + i * DIM];
257
6.57M
        }
258
27.2k
    }
259
260
3.40k
    const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP;
261
    // the main loop.
262
3.40k
#pragma omp parallel for schedule(dynamic)
263
17.7k
    for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) {
264
8.86k
        kernel<DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP>(
265
8.86k
                x, y, y_transposed.data(), ny, res, y_norms, i);
266
8.86k
    }
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm1ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm2ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm3ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm4ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm5ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm6ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm7ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm8ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Line
Count
Source
263
17.7k
    for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) {
264
8.86k
        kernel<DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP>(
265
8.86k
                x, y, y_transposed.data(), ny, res, y_norms, i);
266
8.86k
    }
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm9ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm10ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm11ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm12ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm13ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm14ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm15ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm16ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
267
268
3.40k
    for (size_t i = nx_p; i < nx; i++) {
269
0
        kernel<DIM, 1, NY_POINTS_PER_LOOP>(
270
0
                x, y, y_transposed.data(), ny, res, y_norms, i);
271
0
    }
272
273
    // Does nothing for Top1BlockResultHandler, but
274
    // keeping the call for the consistency.
275
3.40k
    res.end_multiple();
276
3.40k
    InterruptCallback::check();
277
3.40k
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm1ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm2ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm3ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm4ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm5ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm6ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm7ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm8ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Line
Count
Source
230
3.40k
        const float* __restrict y_norms) {
231
    // BLAS does not like empty matrices
232
3.40k
    if (nx == 0 || ny == 0) {
233
0
        return;
234
0
    }
235
236
    // compute norms for y
237
3.40k
    std::unique_ptr<float[]> del2;
238
3.40k
    if (!y_norms) {
239
3.40k
        float* y_norms2 = new float[ny];
240
3.40k
        del2.reset(y_norms2);
241
242
825k
        for (size_t i = 0; i < ny; i++) {
243
822k
            y_norms2[i] = l2_sqr<DIM>(y + i * DIM);
244
822k
        }
245
246
3.40k
        y_norms = y_norms2;
247
3.40k
    }
248
249
    // initialize res
250
3.40k
    res.begin_multiple(0, nx);
251
252
    // transpose y
253
3.40k
    std::vector<float> y_transposed(DIM * ny);
254
30.6k
    for (size_t j = 0; j < DIM; j++) {
255
6.60M
        for (size_t i = 0; i < ny; i++) {
256
6.57M
            y_transposed[j * ny + i] = y[j + i * DIM];
257
6.57M
        }
258
27.2k
    }
259
260
3.40k
    const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP;
261
    // the main loop.
262
3.40k
#pragma omp parallel for schedule(dynamic)
263
3.40k
    for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) {
264
3.40k
        kernel<DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP>(
265
3.40k
                x, y, y_transposed.data(), ny, res, y_norms, i);
266
3.40k
    }
267
268
3.40k
    for (size_t i = nx_p; i < nx; i++) {
269
0
        kernel<DIM, 1, NY_POINTS_PER_LOOP>(
270
0
                x, y, y_transposed.data(), ny, res, y_norms, i);
271
0
    }
272
273
    // Does nothing for Top1BlockResultHandler, but
274
    // keeping the call for the consistency.
275
3.40k
    res.end_multiple();
276
3.40k
    InterruptCallback::check();
277
3.40k
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm9ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm10ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm11ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm12ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm13ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm14ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm15ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm16ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
278
279
} // namespace
280
281
bool exhaustive_L2sqr_fused_cmax_simdlib(
282
        const float* x,
283
        const float* y,
284
        size_t d,
285
        size_t nx,
286
        size_t ny,
287
        Top1BlockResultHandler<CMax<float, int64_t>>& res,
288
3.57k
        const float* y_norms) {
289
    // Process only cases with certain dimensionalities.
290
    // An acceptable dimensionality value is limited by the number of
291
    // available registers.
292
293
3.57k
#define DISPATCH(DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP)    \
294
3.57k
    case DIM: {                                                  \
295
3.40k
        exhaustive_L2sqr_fused_cmax<                             \
296
3.40k
                DIM,                                             \
297
3.40k
                NX_POINTS_PER_LOOP,                              \
298
3.40k
                NY_POINTS_PER_LOOP>(x, y, nx, ny, res, y_norms); \
299
3.40k
        return true;                                             \
300
3.40k
    }
301
302
    // faiss/benchs/bench_quantizer.py was used for benchmarking
303
    // and tuning 2nd and 3rd parameters values.
304
    // Basically, the larger the values for 2nd and 3rd parameters are,
305
    // the faster the execution is, but the more SIMD registers are needed.
306
    // This can be compensated with L1 cache, this is why this
307
    // code might operate with more registers than available
308
    // because of concurrent ports operations for ALU and LOAD/STORE.
309
310
3.57k
#if defined(__AVX2__)
311
    // It was possible to tweak these parameters on x64 machine.
312
3.57k
    switch (d) {
313
0
        DISPATCH(1, 6, 1)
314
0
        DISPATCH(2, 6, 1)
315
0
        DISPATCH(3, 6, 1)
316
0
        DISPATCH(4, 8, 1)
317
0
        DISPATCH(5, 8, 1)
318
0
        DISPATCH(6, 8, 1)
319
0
        DISPATCH(7, 8, 1)
320
3.40k
        DISPATCH(8, 8, 1)
321
0
        DISPATCH(9, 8, 1)
322
0
        DISPATCH(10, 8, 1)
323
0
        DISPATCH(11, 8, 1)
324
0
        DISPATCH(12, 8, 1)
325
0
        DISPATCH(13, 6, 1)
326
0
        DISPATCH(14, 6, 1)
327
0
        DISPATCH(15, 6, 1)
328
3.57k
        DISPATCH(16, 6, 1)
329
3.57k
    }
330
#else
331
    // Please feel free to alter 2nd and 3rd parameters if you have access
332
    // to ARM-based machine so that you are able to benchmark this code.
333
    // Or to enable other dimensions.
334
    switch (d) {
335
        DISPATCH(1, 4, 2)
336
        DISPATCH(2, 2, 2)
337
        DISPATCH(3, 2, 2)
338
        DISPATCH(4, 2, 1)
339
        DISPATCH(5, 1, 1)
340
        DISPATCH(6, 1, 1)
341
        DISPATCH(7, 1, 1)
342
        DISPATCH(8, 1, 1)
343
    }
344
#endif
345
346
179
    return false;
347
3.57k
#undef DISPATCH
348
3.57k
}
349
350
} // namespace faiss
351
352
#endif