Coverage Report

Created: 2026-06-29 18:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
contrib/faiss/faiss/utils/distances_fused/simdlib_based.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) Meta Platforms, Inc. and affiliates.
3
 *
4
 * This source code is licensed under the MIT license found in the
5
 * LICENSE file in the root directory of this source tree.
6
 */
7
8
// -*- c++ -*-
9
10
#include <faiss/utils/distances_fused/simdlib_based.h>
11
12
#if defined(__AVX2__) || defined(__aarch64__)
13
14
#include <faiss/utils/simdlib.h>
15
16
#if defined(__AVX2__)
17
#include <immintrin.h>
18
#endif
19
20
namespace faiss {
21
22
namespace {
23
24
// It makes sense to like to overload certain cases because the further
25
// kernels are in need of registers. So, let's tell compiler
26
// not to waste registers on a bit faster code, if needed.
27
template <size_t DIM>
28
2.06k
float l2_sqr(const float* const x) {
29
    // compiler should be smart enough to handle that
30
2.06k
    float output = x[0] * x[0];
31
29.4k
    for (size_t i = 1; i < DIM; i++) {
32
27.3k
        output += x[i] * x[i];
33
27.3k
    }
34
35
2.06k
    return output;
36
2.06k
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm1EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm2EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm3EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm4EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm5EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm6EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm7EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm8EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm9EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm10EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm11EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm12EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm13EEEfPKf
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm14EEEfPKf
simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm15EEEfPKf
Line
Count
Source
28
2.06k
float l2_sqr(const float* const x) {
29
    // compiler should be smart enough to handle that
30
2.06k
    float output = x[0] * x[0];
31
29.4k
    for (size_t i = 1; i < DIM; i++) {
32
27.3k
        output += x[i] * x[i];
33
27.3k
    }
34
35
2.06k
    return output;
36
2.06k
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm16EEEfPKf
37
38
template <size_t DIM>
39
float dot_product(
40
        const float* const __restrict x,
41
6.87k
        const float* const __restrict y) {
42
    // compiler should be smart enough to handle that
43
6.87k
    float output = x[0] * y[0];
44
93.1k
    for (size_t i = 1; i < DIM; i++) {
45
86.2k
        output += x[i] * y[i];
46
86.2k
    }
47
48
6.87k
    return output;
49
6.87k
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm1EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm2EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm3EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm4EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm5EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm6EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm7EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm8EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm9EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm10EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm11EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm12EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm13EEEfPKfS3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm14EEEfPKfS3_
simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm15EEEfPKfS3_
Line
Count
Source
41
6.87k
        const float* const __restrict y) {
42
    // compiler should be smart enough to handle that
43
6.87k
    float output = x[0] * y[0];
44
93.1k
    for (size_t i = 1; i < DIM; i++) {
45
86.2k
        output += x[i] * y[i];
46
86.2k
    }
47
48
6.87k
    return output;
49
6.87k
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm16EEEfPKfS3_
50
51
// The kernel for low dimensionality vectors.
52
// Finds the closest one from y for every given NX_POINTS_PER_LOOP points from x
53
//
54
// DIM is the dimensionality of the data
55
// NX_POINTS_PER_LOOP is the number of x points that get processed
56
//   simultaneously.
57
// NY_POINTS_PER_LOOP is the number of y points that get processed
58
//   simultaneously.
59
template <size_t DIM, size_t NX_POINTS_PER_LOOP, size_t NY_POINTS_PER_LOOP>
60
void kernel(
61
        const float* const __restrict x,
62
        const float* const __restrict y,
63
        const float* const __restrict y_transposed,
64
        const size_t ny,
65
        Top1BlockResultHandler<CMax<float, int64_t>>& res,
66
        const float* __restrict y_norms,
67
416
        const size_t i) {
68
416
    const size_t ny_p =
69
416
            (ny / (8 * NY_POINTS_PER_LOOP)) * (8 * NY_POINTS_PER_LOOP);
70
71
    // compute
72
416
    const float* const __restrict xd_0 = x + i * DIM;
73
74
    // prefetch the next point
75
416
#if defined(__AVX2__)
76
416
    _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
77
416
#endif
78
79
    // load a single point from x
80
    // load -2 * value
81
416
    simd8float32 x_i[NX_POINTS_PER_LOOP][DIM];
82
2.37k
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
83
23.3k
        for (size_t dd = 0; dd < DIM; dd++) {
84
21.4k
            x_i[nx_k][dd] = simd8float32(-2 * *(xd_0 + nx_k * DIM + dd));
85
21.4k
        }
86
1.96k
    }
87
88
    // compute x_norm
89
416
    float x_norm_i[NX_POINTS_PER_LOOP];
90
2.40k
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
91
1.98k
        x_norm_i[nx_k] = l2_sqr<DIM>(xd_0 + nx_k * DIM);
92
1.98k
    }
93
94
    // distances and indices
95
416
    simd8float32 min_distances_i[NX_POINTS_PER_LOOP];
96
2.36k
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
97
1.94k
        min_distances_i[nx_k] =
98
1.94k
                simd8float32(res.dis_tab[i + nx_k] - x_norm_i[nx_k]);
99
1.94k
    }
100
101
416
    simd8uint32 min_indices_i[NX_POINTS_PER_LOOP];
102
2.37k
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
103
1.96k
        min_indices_i[nx_k] = simd8uint32((uint32_t)0);
104
1.96k
    }
105
106
    //
107
416
    simd8uint32 current_indices = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7);
108
416
    const simd8uint32 indices_delta = simd8uint32(8);
109
110
    // main loop
111
416
    size_t j = 0;
112
416
    for (; j < ny_p; j += NY_POINTS_PER_LOOP * 8) {
113
        // compute dot products for NX_POINTS from x and NY_POINTS from y
114
        // technically, we're multiplying -2x and y
115
0
        simd8float32 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP];
116
117
        // DIM 0 that uses MUL
118
0
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
119
0
            simd8float32 y_i =
120
0
                    simd8float32(y_transposed + j + ny_k * 8 + ny * 0);
121
0
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
122
0
                dp_i[nx_k][ny_k] = x_i[nx_k][0] * y_i;
123
0
            }
124
0
        }
125
126
        // other DIMs that use FMA
127
0
        for (size_t dd = 1; dd < DIM; dd++) {
128
0
            for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
129
0
                simd8float32 y_i =
130
0
                        simd8float32(y_transposed + j + ny_k * 8 + ny * dd);
131
132
0
                for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
133
0
                    dp_i[nx_k][ny_k] =
134
0
                            fmadd(x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]);
135
0
                }
136
0
            }
137
0
        }
138
139
        // compute y^2 + (-2x,y)
140
0
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
141
0
            simd8float32 y_l2_sqr = simd8float32(y_norms + j + ny_k * 8);
142
143
0
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
144
0
                dp_i[nx_k][ny_k] = dp_i[nx_k][ny_k] + y_l2_sqr;
145
0
            }
146
0
        }
147
148
        // do the comparisons and alter the min indices
149
0
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
150
0
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
151
                // cmpps
152
0
                cmplt_and_blend_inplace(
153
0
                        dp_i[nx_k][ny_k],
154
0
                        current_indices,
155
0
                        min_distances_i[nx_k],
156
0
                        min_indices_i[nx_k]);
157
0
            }
158
159
0
            current_indices = current_indices + indices_delta;
160
0
        }
161
0
    }
162
163
    // dump values and find the minimum distance / minimum index
164
2.35k
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
165
1.93k
        float min_distances_scalar[8];
166
1.93k
        uint32_t min_indices_scalar[8];
167
168
1.93k
        min_distances_i[nx_k].storeu(min_distances_scalar);
169
1.93k
        min_indices_i[nx_k].storeu(min_indices_scalar);
170
171
1.93k
        float current_min_distance = res.dis_tab[i + nx_k];
172
1.93k
        uint32_t current_min_index = res.ids_tab[i + nx_k];
173
174
        // This unusual comparison is needed to maintain the behavior
175
        // of the original implementation: if two indices are
176
        // represented with equal distance values, then
177
        // the index with the min value is returned.
178
15.9k
        for (size_t jv = 0; jv < 8; jv++) {
179
            // add missing x_norms[i]
180
14.0k
            float distance_candidate =
181
14.0k
                    min_distances_scalar[jv] + x_norm_i[nx_k];
182
183
            // negative values can occur for identical vectors
184
            //    due to roundoff errors.
185
14.0k
            if (distance_candidate < 0) {
186
0
                distance_candidate = 0;
187
0
            }
188
189
14.0k
            const int64_t index_candidate = min_indices_scalar[jv];
190
191
14.0k
            if (current_min_distance > distance_candidate) {
192
0
                current_min_distance = distance_candidate;
193
0
                current_min_index = index_candidate;
194
14.0k
            } else if (
195
14.0k
                    current_min_distance == distance_candidate &&
196
14.0k
                    current_min_index > index_candidate) {
197
1.53k
                current_min_index = index_candidate;
198
1.53k
            }
199
14.0k
        }
200
201
        // process leftovers
202
9.05k
        for (size_t j0 = j; j0 < ny; j0++) {
203
7.12k
            const float dp =
204
7.12k
                    dot_product<DIM>(x + (i + nx_k) * DIM, y + j0 * DIM);
205
7.12k
            float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp;
206
            // negative values can occur for identical vectors
207
            //    due to roundoff errors.
208
7.12k
            if (dis < 0) {
209
0
                dis = 0;
210
0
            }
211
212
7.12k
            if (current_min_distance > dis) {
213
3.96k
                current_min_distance = dis;
214
3.96k
                current_min_index = j0;
215
3.96k
            }
216
7.12k
        }
217
218
        // done
219
1.93k
        res.add_result(i + nx_k, current_min_distance, current_min_index);
220
1.93k
    }
221
416
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm1ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm1ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm2ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm2ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm3ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm3ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm4ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm4ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm5ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm5ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm6ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm6ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm7ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm7ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm8ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm8ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm9ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm9ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm10ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm10ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm11ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm11ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm12ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm12ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm13ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm13ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm14ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm14ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm15ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Line
Count
Source
67
328
        const size_t i) {
68
328
    const size_t ny_p =
69
328
            (ny / (8 * NY_POINTS_PER_LOOP)) * (8 * NY_POINTS_PER_LOOP);
70
71
    // compute
72
328
    const float* const __restrict xd_0 = x + i * DIM;
73
74
    // prefetch the next point
75
328
#if defined(__AVX2__)
76
328
    _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
77
328
#endif
78
79
    // load a single point from x
80
    // load -2 * value
81
328
    simd8float32 x_i[NX_POINTS_PER_LOOP][DIM];
82
2.20k
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
83
21.9k
        for (size_t dd = 0; dd < DIM; dd++) {
84
20.0k
            x_i[nx_k][dd] = simd8float32(-2 * *(xd_0 + nx_k * DIM + dd));
85
20.0k
        }
86
1.87k
    }
87
88
    // compute x_norm
89
328
    float x_norm_i[NX_POINTS_PER_LOOP];
90
2.22k
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
91
1.89k
        x_norm_i[nx_k] = l2_sqr<DIM>(xd_0 + nx_k * DIM);
92
1.89k
    }
93
94
    // distances and indices
95
328
    simd8float32 min_distances_i[NX_POINTS_PER_LOOP];
96
2.18k
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
97
1.85k
        min_distances_i[nx_k] =
98
1.85k
                simd8float32(res.dis_tab[i + nx_k] - x_norm_i[nx_k]);
99
1.85k
    }
100
101
328
    simd8uint32 min_indices_i[NX_POINTS_PER_LOOP];
102
2.20k
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
103
1.87k
        min_indices_i[nx_k] = simd8uint32((uint32_t)0);
104
1.87k
    }
105
106
    //
107
328
    simd8uint32 current_indices = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7);
108
328
    const simd8uint32 indices_delta = simd8uint32(8);
109
110
    // main loop
111
328
    size_t j = 0;
112
328
    for (; j < ny_p; j += NY_POINTS_PER_LOOP * 8) {
113
        // compute dot products for NX_POINTS from x and NY_POINTS from y
114
        // technically, we're multiplying -2x and y
115
0
        simd8float32 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP];
116
117
        // DIM 0 that uses MUL
118
0
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
119
0
            simd8float32 y_i =
120
0
                    simd8float32(y_transposed + j + ny_k * 8 + ny * 0);
121
0
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
122
0
                dp_i[nx_k][ny_k] = x_i[nx_k][0] * y_i;
123
0
            }
124
0
        }
125
126
        // other DIMs that use FMA
127
0
        for (size_t dd = 1; dd < DIM; dd++) {
128
0
            for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
129
0
                simd8float32 y_i =
130
0
                        simd8float32(y_transposed + j + ny_k * 8 + ny * dd);
131
132
0
                for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
133
0
                    dp_i[nx_k][ny_k] =
134
0
                            fmadd(x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]);
135
0
                }
136
0
            }
137
0
        }
138
139
        // compute y^2 + (-2x,y)
140
0
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
141
0
            simd8float32 y_l2_sqr = simd8float32(y_norms + j + ny_k * 8);
142
143
0
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
144
0
                dp_i[nx_k][ny_k] = dp_i[nx_k][ny_k] + y_l2_sqr;
145
0
            }
146
0
        }
147
148
        // do the comparisons and alter the min indices
149
0
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
150
0
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
151
                // cmpps
152
0
                cmplt_and_blend_inplace(
153
0
                        dp_i[nx_k][ny_k],
154
0
                        current_indices,
155
0
                        min_distances_i[nx_k],
156
0
                        min_indices_i[nx_k]);
157
0
            }
158
159
0
            current_indices = current_indices + indices_delta;
160
0
        }
161
0
    }
162
163
    // dump values and find the minimum distance / minimum index
164
2.17k
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
165
1.84k
        float min_distances_scalar[8];
166
1.84k
        uint32_t min_indices_scalar[8];
167
168
1.84k
        min_distances_i[nx_k].storeu(min_distances_scalar);
169
1.84k
        min_indices_i[nx_k].storeu(min_indices_scalar);
170
171
1.84k
        float current_min_distance = res.dis_tab[i + nx_k];
172
1.84k
        uint32_t current_min_index = res.ids_tab[i + nx_k];
173
174
        // This unusual comparison is needed to maintain the behavior
175
        // of the original implementation: if two indices are
176
        // represented with equal distance values, then
177
        // the index with the min value is returned.
178
15.1k
        for (size_t jv = 0; jv < 8; jv++) {
179
            // add missing x_norms[i]
180
13.3k
            float distance_candidate =
181
13.3k
                    min_distances_scalar[jv] + x_norm_i[nx_k];
182
183
            // negative values can occur for identical vectors
184
            //    due to roundoff errors.
185
13.3k
            if (distance_candidate < 0) {
186
0
                distance_candidate = 0;
187
0
            }
188
189
13.3k
            const int64_t index_candidate = min_indices_scalar[jv];
190
191
13.3k
            if (current_min_distance > distance_candidate) {
192
0
                current_min_distance = distance_candidate;
193
0
                current_min_index = index_candidate;
194
13.3k
            } else if (
195
13.3k
                    current_min_distance == distance_candidate &&
196
13.3k
                    current_min_index > index_candidate) {
197
1.48k
                current_min_index = index_candidate;
198
1.48k
            }
199
13.3k
        }
200
201
        // process leftovers
202
8.61k
        for (size_t j0 = j; j0 < ny; j0++) {
203
6.77k
            const float dp =
204
6.77k
                    dot_product<DIM>(x + (i + nx_k) * DIM, y + j0 * DIM);
205
6.77k
            float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp;
206
            // negative values can occur for identical vectors
207
            //    due to roundoff errors.
208
6.77k
            if (dis < 0) {
209
0
                dis = 0;
210
0
            }
211
212
6.77k
            if (current_min_distance > dis) {
213
3.81k
                current_min_distance = dis;
214
3.81k
                current_min_index = j0;
215
3.81k
            }
216
6.77k
        }
217
218
        // done
219
1.84k
        res.add_result(i + nx_k, current_min_distance, current_min_index);
220
1.84k
    }
221
328
}
simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm15ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Line
Count
Source
67
88
        const size_t i) {
68
88
    const size_t ny_p =
69
88
            (ny / (8 * NY_POINTS_PER_LOOP)) * (8 * NY_POINTS_PER_LOOP);
70
71
    // compute
72
88
    const float* const __restrict xd_0 = x + i * DIM;
73
74
    // prefetch the next point
75
88
#if defined(__AVX2__)
76
88
    _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
77
88
#endif
78
79
    // load a single point from x
80
    // load -2 * value
81
88
    simd8float32 x_i[NX_POINTS_PER_LOOP][DIM];
82
176
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
83
1.40k
        for (size_t dd = 0; dd < DIM; dd++) {
84
1.32k
            x_i[nx_k][dd] = simd8float32(-2 * *(xd_0 + nx_k * DIM + dd));
85
1.32k
        }
86
88
    }
87
88
    // compute x_norm
89
88
    float x_norm_i[NX_POINTS_PER_LOOP];
90
176
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
91
88
        x_norm_i[nx_k] = l2_sqr<DIM>(xd_0 + nx_k * DIM);
92
88
    }
93
94
    // distances and indices
95
88
    simd8float32 min_distances_i[NX_POINTS_PER_LOOP];
96
176
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
97
88
        min_distances_i[nx_k] =
98
88
                simd8float32(res.dis_tab[i + nx_k] - x_norm_i[nx_k]);
99
88
    }
100
101
88
    simd8uint32 min_indices_i[NX_POINTS_PER_LOOP];
102
176
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
103
88
        min_indices_i[nx_k] = simd8uint32((uint32_t)0);
104
88
    }
105
106
    //
107
88
    simd8uint32 current_indices = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7);
108
88
    const simd8uint32 indices_delta = simd8uint32(8);
109
110
    // main loop
111
88
    size_t j = 0;
112
88
    for (; j < ny_p; j += NY_POINTS_PER_LOOP * 8) {
113
        // compute dot products for NX_POINTS from x and NY_POINTS from y
114
        // technically, we're multiplying -2x and y
115
0
        simd8float32 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP];
116
117
        // DIM 0 that uses MUL
118
0
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
119
0
            simd8float32 y_i =
120
0
                    simd8float32(y_transposed + j + ny_k * 8 + ny * 0);
121
0
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
122
0
                dp_i[nx_k][ny_k] = x_i[nx_k][0] * y_i;
123
0
            }
124
0
        }
125
126
        // other DIMs that use FMA
127
0
        for (size_t dd = 1; dd < DIM; dd++) {
128
0
            for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
129
0
                simd8float32 y_i =
130
0
                        simd8float32(y_transposed + j + ny_k * 8 + ny * dd);
131
132
0
                for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
133
0
                    dp_i[nx_k][ny_k] =
134
0
                            fmadd(x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]);
135
0
                }
136
0
            }
137
0
        }
138
139
        // compute y^2 + (-2x,y)
140
0
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
141
0
            simd8float32 y_l2_sqr = simd8float32(y_norms + j + ny_k * 8);
142
143
0
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
144
0
                dp_i[nx_k][ny_k] = dp_i[nx_k][ny_k] + y_l2_sqr;
145
0
            }
146
0
        }
147
148
        // do the comparisons and alter the min indices
149
0
        for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
150
0
            for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
151
                // cmpps
152
0
                cmplt_and_blend_inplace(
153
0
                        dp_i[nx_k][ny_k],
154
0
                        current_indices,
155
0
                        min_distances_i[nx_k],
156
0
                        min_indices_i[nx_k]);
157
0
            }
158
159
0
            current_indices = current_indices + indices_delta;
160
0
        }
161
0
    }
162
163
    // dump values and find the minimum distance / minimum index
164
176
    for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
165
88
        float min_distances_scalar[8];
166
88
        uint32_t min_indices_scalar[8];
167
168
88
        min_distances_i[nx_k].storeu(min_distances_scalar);
169
88
        min_indices_i[nx_k].storeu(min_indices_scalar);
170
171
88
        float current_min_distance = res.dis_tab[i + nx_k];
172
88
        uint32_t current_min_index = res.ids_tab[i + nx_k];
173
174
        // This unusual comparison is needed to maintain the behavior
175
        // of the original implementation: if two indices are
176
        // represented with equal distance values, then
177
        // the index with the min value is returned.
178
792
        for (size_t jv = 0; jv < 8; jv++) {
179
            // add missing x_norms[i]
180
704
            float distance_candidate =
181
704
                    min_distances_scalar[jv] + x_norm_i[nx_k];
182
183
            // negative values can occur for identical vectors
184
            //    due to roundoff errors.
185
704
            if (distance_candidate < 0) {
186
0
                distance_candidate = 0;
187
0
            }
188
189
704
            const int64_t index_candidate = min_indices_scalar[jv];
190
191
704
            if (current_min_distance > distance_candidate) {
192
0
                current_min_distance = distance_candidate;
193
0
                current_min_index = index_candidate;
194
704
            } else if (
195
704
                    current_min_distance == distance_candidate &&
196
704
                    current_min_index > index_candidate) {
197
48
                current_min_index = index_candidate;
198
48
            }
199
704
        }
200
201
        // process leftovers
202
440
        for (size_t j0 = j; j0 < ny; j0++) {
203
352
            const float dp =
204
352
                    dot_product<DIM>(x + (i + nx_k) * DIM, y + j0 * DIM);
205
352
            float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp;
206
            // negative values can occur for identical vectors
207
            //    due to roundoff errors.
208
352
            if (dis < 0) {
209
0
                dis = 0;
210
0
            }
211
212
352
            if (current_min_distance > dis) {
213
146
                current_min_distance = dis;
214
146
                current_min_index = j0;
215
146
            }
216
352
        }
217
218
        // done
219
88
        res.add_result(i + nx_k, current_min_distance, current_min_index);
220
88
    }
221
88
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm16ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm16ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m
222
223
template <size_t DIM, size_t NX_POINTS_PER_LOOP, size_t NY_POINTS_PER_LOOP>
224
void exhaustive_L2sqr_fused_cmax(
225
        const float* const __restrict x,
226
        const float* const __restrict y,
227
        size_t nx,
228
        size_t ny,
229
        Top1BlockResultHandler<CMax<float, int64_t>>& res,
230
22
        const float* __restrict y_norms) {
231
    // BLAS does not like empty matrices
232
22
    if (nx == 0 || ny == 0) {
233
0
        return;
234
0
    }
235
236
    // compute norms for y
237
22
    std::unique_ptr<float[]> del2;
238
22
    if (!y_norms) {
239
22
        float* y_norms2 = new float[ny];
240
22
        del2.reset(y_norms2);
241
242
110
        for (size_t i = 0; i < ny; i++) {
243
88
            y_norms2[i] = l2_sqr<DIM>(y + i * DIM);
244
88
        }
245
246
22
        y_norms = y_norms2;
247
22
    }
248
249
    // initialize res
250
22
    res.begin_multiple(0, nx);
251
252
    // transpose y
253
22
    std::vector<float> y_transposed(DIM * ny);
254
352
    for (size_t j = 0; j < DIM; j++) {
255
1.65k
        for (size_t i = 0; i < ny; i++) {
256
1.32k
            y_transposed[j * ny + i] = y[j + i * DIM];
257
1.32k
        }
258
330
    }
259
260
22
    const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP;
261
    // the main loop.
262
22
#pragma omp parallel for schedule(dynamic)
263
98
    for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) {
264
49
        kernel<DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP>(
265
49
                x, y, y_transposed.data(), ny, res, y_norms, i);
266
49
    }
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm1ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm2ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm3ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm4ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm5ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm6ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm7ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm8ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm9ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm10ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm11ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm12ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm13ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm14ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm15ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
Line
Count
Source
263
98
    for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) {
264
49
        kernel<DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP>(
265
49
                x, y, y_transposed.data(), ny, res, y_norms, i);
266
49
    }
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm16ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__
267
268
110
    for (size_t i = nx_p; i < nx; i++) {
269
88
        kernel<DIM, 1, NY_POINTS_PER_LOOP>(
270
88
                x, y, y_transposed.data(), ny, res, y_norms, i);
271
88
    }
272
273
    // Does nothing for Top1BlockResultHandler, but
274
    // keeping the call for the consistency.
275
22
    res.end_multiple();
276
22
    InterruptCallback::check();
277
22
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm1ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm2ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm3ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm4ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm5ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm6ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm7ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm8ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm9ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm10ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm11ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm12ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm13ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm14ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm15ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
Line
Count
Source
230
22
        const float* __restrict y_norms) {
231
    // BLAS does not like empty matrices
232
22
    if (nx == 0 || ny == 0) {
233
0
        return;
234
0
    }
235
236
    // compute norms for y
237
22
    std::unique_ptr<float[]> del2;
238
22
    if (!y_norms) {
239
22
        float* y_norms2 = new float[ny];
240
22
        del2.reset(y_norms2);
241
242
110
        for (size_t i = 0; i < ny; i++) {
243
88
            y_norms2[i] = l2_sqr<DIM>(y + i * DIM);
244
88
        }
245
246
22
        y_norms = y_norms2;
247
22
    }
248
249
    // initialize res
250
22
    res.begin_multiple(0, nx);
251
252
    // transpose y
253
22
    std::vector<float> y_transposed(DIM * ny);
254
352
    for (size_t j = 0; j < DIM; j++) {
255
1.65k
        for (size_t i = 0; i < ny; i++) {
256
1.32k
            y_transposed[j * ny + i] = y[j + i * DIM];
257
1.32k
        }
258
330
    }
259
260
22
    const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP;
261
    // the main loop.
262
22
#pragma omp parallel for schedule(dynamic)
263
22
    for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) {
264
22
        kernel<DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP>(
265
22
                x, y, y_transposed.data(), ny, res, y_norms, i);
266
22
    }
267
268
110
    for (size_t i = nx_p; i < nx; i++) {
269
88
        kernel<DIM, 1, NY_POINTS_PER_LOOP>(
270
88
                x, y, y_transposed.data(), ny, res, y_norms, i);
271
88
    }
272
273
    // Does nothing for Top1BlockResultHandler, but
274
    // keeping the call for the consistency.
275
22
    res.end_multiple();
276
22
    InterruptCallback::check();
277
22
}
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm16ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_
278
279
} // namespace
280
281
bool exhaustive_L2sqr_fused_cmax_simdlib(
282
        const float* x,
283
        const float* y,
284
        size_t d,
285
        size_t nx,
286
        size_t ny,
287
        Top1BlockResultHandler<CMax<float, int64_t>>& res,
288
146
        const float* y_norms) {
289
    // Process only cases with certain dimensionalities.
290
    // An acceptable dimensionality value is limited by the number of
291
    // available registers.
292
293
146
#define DISPATCH(DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP)    \
294
146
    case DIM: {                                                  \
295
22
        exhaustive_L2sqr_fused_cmax<                             \
296
22
                DIM,                                             \
297
22
                NX_POINTS_PER_LOOP,                              \
298
22
                NY_POINTS_PER_LOOP>(x, y, nx, ny, res, y_norms); \
299
22
        return true;                                             \
300
22
    }
301
302
    // faiss/benchs/bench_quantizer.py was used for benchmarking
303
    // and tuning 2nd and 3rd parameters values.
304
    // Basically, the larger the values for 2nd and 3rd parameters are,
305
    // the faster the execution is, but the more SIMD registers are needed.
306
    // This can be compensated with L1 cache, this is why this
307
    // code might operate with more registers than available
308
    // because of concurrent ports operations for ALU and LOAD/STORE.
309
310
146
#if defined(__AVX2__)
311
    // It was possible to tweak these parameters on x64 machine.
312
146
    switch (d) {
313
0
        DISPATCH(1, 6, 1)
314
0
        DISPATCH(2, 6, 1)
315
0
        DISPATCH(3, 6, 1)
316
0
        DISPATCH(4, 8, 1)
317
0
        DISPATCH(5, 8, 1)
318
0
        DISPATCH(6, 8, 1)
319
0
        DISPATCH(7, 8, 1)
320
0
        DISPATCH(8, 8, 1)
321
0
        DISPATCH(9, 8, 1)
322
0
        DISPATCH(10, 8, 1)
323
0
        DISPATCH(11, 8, 1)
324
0
        DISPATCH(12, 8, 1)
325
0
        DISPATCH(13, 6, 1)
326
0
        DISPATCH(14, 6, 1)
327
22
        DISPATCH(15, 6, 1)
328
146
        DISPATCH(16, 6, 1)
329
146
    }
330
#else
331
    // Please feel free to alter 2nd and 3rd parameters if you have access
332
    // to ARM-based machine so that you are able to benchmark this code.
333
    // Or to enable other dimensions.
334
    switch (d) {
335
        DISPATCH(1, 4, 2)
336
        DISPATCH(2, 2, 2)
337
        DISPATCH(3, 2, 2)
338
        DISPATCH(4, 2, 1)
339
        DISPATCH(5, 1, 1)
340
        DISPATCH(6, 1, 1)
341
        DISPATCH(7, 1, 1)
342
        DISPATCH(8, 1, 1)
343
    }
344
#endif
345
346
124
    return false;
347
146
#undef DISPATCH
348
146
}
349
350
} // namespace faiss
351
352
#endif