contrib/faiss/faiss/utils/distances_fused/simdlib_based.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | // -*- c++ -*- |
9 | | |
10 | | #include <faiss/utils/distances_fused/simdlib_based.h> |
11 | | |
12 | | #if defined(__AVX2__) || defined(__aarch64__) |
13 | | |
14 | | #include <faiss/utils/simdlib.h> |
15 | | |
16 | | #if defined(__AVX2__) |
17 | | #include <immintrin.h> |
18 | | #endif |
19 | | |
20 | | namespace faiss { |
21 | | |
22 | | namespace { |
23 | | |
24 | | // It makes sense to like to overload certain cases because the further |
25 | | // kernels are in need of registers. So, let's tell compiler |
26 | | // not to waste registers on a bit faster code, if needed. |
27 | | template <size_t DIM> |
28 | 102M | float l2_sqr(const float* const x) { |
29 | | // compiler should be smart enough to handle that |
30 | 102M | float output = x[0] * x[0]; |
31 | 816M | for (size_t i = 1; i < DIM; i++) { |
32 | 714M | output += x[i] * x[i]; |
33 | 714M | } |
34 | | |
35 | 102M | return output; |
36 | 102M | } Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm1EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm2EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm3EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm4EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm5EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm6EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm7EEEfPKf simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm8EEEfPKf Line | Count | Source | 28 | 102M | float l2_sqr(const float* const x) { | 29 | | // compiler should be smart enough to handle that | 30 | 102M | float output = x[0] * x[0]; | 31 | 816M | for (size_t i = 1; i < DIM; i++) { | 32 | 714M | output += x[i] * x[i]; | 33 | 714M | } | 34 | | | 35 | 102M | return output; | 36 | 102M | } |
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm9EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm10EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm11EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm12EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm13EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm14EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm15EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm16EEEfPKf |
37 | | |
38 | | template <size_t DIM> |
39 | | float dot_product( |
40 | | const float* const __restrict x, |
41 | 0 | const float* const __restrict y) { |
42 | | // compiler should be smart enough to handle that |
43 | 0 | float output = x[0] * y[0]; |
44 | 0 | for (size_t i = 1; i < DIM; i++) { |
45 | 0 | output += x[i] * y[i]; |
46 | 0 | } |
47 | |
|
48 | 0 | return output; |
49 | 0 | } Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm1EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm2EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm3EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm4EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm5EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm6EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm7EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm8EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm9EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm10EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm11EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm12EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm13EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm14EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm15EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm16EEEfPKfS3_ |
50 | | |
51 | | // The kernel for low dimensionality vectors. |
52 | | // Finds the closest one from y for every given NX_POINTS_PER_LOOP points from x |
53 | | // |
54 | | // DIM is the dimensionality of the data |
55 | | // NX_POINTS_PER_LOOP is the number of x points that get processed |
56 | | // simultaneously. |
57 | | // NY_POINTS_PER_LOOP is the number of y points that get processed |
58 | | // simultaneously. |
59 | | template <size_t DIM, size_t NX_POINTS_PER_LOOP, size_t NY_POINTS_PER_LOOP> |
60 | | void kernel( |
61 | | const float* const __restrict x, |
62 | | const float* const __restrict y, |
63 | | const float* const __restrict y_transposed, |
64 | | const size_t ny, |
65 | | Top1BlockResultHandler<CMax<float, int64_t>>& res, |
66 | | const float* __restrict y_norms, |
67 | 12.6M | const size_t i) { |
68 | 12.6M | const size_t ny_p = |
69 | 12.6M | (ny / (8 * NY_POINTS_PER_LOOP)) * (8 * NY_POINTS_PER_LOOP); |
70 | | |
71 | | // compute |
72 | 12.6M | const float* const __restrict xd_0 = x + i * DIM; |
73 | | |
74 | | // prefetch the next point |
75 | 12.6M | #if defined(__AVX2__) |
76 | 12.6M | _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA); |
77 | 12.6M | #endif |
78 | | |
79 | | // load a single point from x |
80 | | // load -2 * value |
81 | 12.6M | simd8float32 x_i[NX_POINTS_PER_LOOP][DIM]; |
82 | 113M | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
83 | 883M | for (size_t dd = 0; dd < DIM; dd++) { |
84 | 782M | x_i[nx_k][dd] = simd8float32(-2 * *(xd_0 + nx_k * DIM + dd)); |
85 | 782M | } |
86 | 100M | } |
87 | | |
88 | | // compute x_norm |
89 | 12.6M | float x_norm_i[NX_POINTS_PER_LOOP]; |
90 | 113M | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
91 | 101M | x_norm_i[nx_k] = l2_sqr<DIM>(xd_0 + nx_k * DIM); |
92 | 101M | } |
93 | | |
94 | | // distances and indices |
95 | 12.6M | simd8float32 min_distances_i[NX_POINTS_PER_LOOP]; |
96 | 113M | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
97 | 101M | min_distances_i[nx_k] = |
98 | 101M | simd8float32(res.dis_tab[i + nx_k] - x_norm_i[nx_k]); |
99 | 101M | } |
100 | | |
101 | 12.6M | simd8uint32 min_indices_i[NX_POINTS_PER_LOOP]; |
102 | 113M | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
103 | 101M | min_indices_i[nx_k] = simd8uint32((uint32_t)0); |
104 | 101M | } |
105 | | |
106 | | // |
107 | 12.6M | simd8uint32 current_indices = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7); |
108 | 12.6M | const simd8uint32 indices_delta = simd8uint32(8); |
109 | | |
110 | | // main loop |
111 | 12.6M | size_t j = 0; |
112 | 413M | for (; j < ny_p; j += NY_POINTS_PER_LOOP * 8) { |
113 | | // compute dot products for NX_POINTS from x and NY_POINTS from y |
114 | | // technically, we're multiplying -2x and y |
115 | 401M | simd8float32 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP]; |
116 | | |
117 | | // DIM 0 that uses MUL |
118 | 795M | for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { |
119 | 394M | simd8float32 y_i = |
120 | 394M | simd8float32(y_transposed + j + ny_k * 8 + ny * 0); |
121 | 3.34G | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
122 | 2.95G | dp_i[nx_k][ny_k] = x_i[nx_k][0] * y_i; |
123 | 2.95G | } |
124 | 394M | } |
125 | | |
126 | | // other DIMs that use FMA |
127 | 2.86G | for (size_t dd = 1; dd < DIM; dd++) { |
128 | 4.92G | for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { |
129 | 2.46G | simd8float32 y_i = |
130 | 2.46G | simd8float32(y_transposed + j + ny_k * 8 + ny * dd); |
131 | | |
132 | 13.1G | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
133 | 10.6G | dp_i[nx_k][ny_k] = |
134 | 10.6G | fmadd(x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]); |
135 | 10.6G | } |
136 | 2.46G | } |
137 | 2.45G | } |
138 | | |
139 | | // compute y^2 + (-2x,y) |
140 | 798M | for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { |
141 | 397M | simd8float32 y_l2_sqr = simd8float32(y_norms + j + ny_k * 8); |
142 | | |
143 | 3.34G | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
144 | 2.94G | dp_i[nx_k][ny_k] = dp_i[nx_k][ny_k] + y_l2_sqr; |
145 | 2.94G | } |
146 | 397M | } |
147 | | |
148 | | // do the comparisons and alter the min indices |
149 | 800M | for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { |
150 | 3.53G | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
151 | | // cmpps |
152 | 3.13G | cmplt_and_blend_inplace( |
153 | 3.13G | dp_i[nx_k][ny_k], |
154 | 3.13G | current_indices, |
155 | 3.13G | min_distances_i[nx_k], |
156 | 3.13G | min_indices_i[nx_k]); |
157 | 3.13G | } |
158 | | |
159 | 399M | current_indices = current_indices + indices_delta; |
160 | 399M | } |
161 | 401M | } |
162 | | |
163 | | // dump values and find the minimum distance / minimum index |
164 | 114M | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
165 | 101M | float min_distances_scalar[8]; |
166 | 101M | uint32_t min_indices_scalar[8]; |
167 | | |
168 | 101M | min_distances_i[nx_k].storeu(min_distances_scalar); |
169 | 101M | min_indices_i[nx_k].storeu(min_indices_scalar); |
170 | | |
171 | 101M | float current_min_distance = res.dis_tab[i + nx_k]; |
172 | 101M | uint32_t current_min_index = res.ids_tab[i + nx_k]; |
173 | | |
174 | | // This unusual comparison is needed to maintain the behavior |
175 | | // of the original implementation: if two indices are |
176 | | // represented with equal distance values, then |
177 | | // the index with the min value is returned. |
178 | 911M | for (size_t jv = 0; jv < 8; jv++) { |
179 | | // add missing x_norms[i] |
180 | 810M | float distance_candidate = |
181 | 810M | min_distances_scalar[jv] + x_norm_i[nx_k]; |
182 | | |
183 | | // negative values can occur for identical vectors |
184 | | // due to roundoff errors. |
185 | 810M | if (distance_candidate < 0) { |
186 | 2.25M | distance_candidate = 0; |
187 | 2.25M | } |
188 | | |
189 | 810M | const int64_t index_candidate = min_indices_scalar[jv]; |
190 | | |
191 | 810M | if (current_min_distance > distance_candidate) { |
192 | 267M | current_min_distance = distance_candidate; |
193 | 267M | current_min_index = index_candidate; |
194 | 542M | } else if ( |
195 | 542M | current_min_distance == distance_candidate && |
196 | 542M | current_min_index > index_candidate) { |
197 | 74.3k | current_min_index = index_candidate; |
198 | 74.3k | } |
199 | 810M | } |
200 | | |
201 | | // process leftovers |
202 | 101M | for (size_t j0 = j; j0 < ny; j0++) { |
203 | 0 | const float dp = |
204 | 0 | dot_product<DIM>(x + (i + nx_k) * DIM, y + j0 * DIM); |
205 | 0 | float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp; |
206 | | // negative values can occur for identical vectors |
207 | | // due to roundoff errors. |
208 | 0 | if (dis < 0) { |
209 | 0 | dis = 0; |
210 | 0 | } |
211 | |
|
212 | 0 | if (current_min_distance > dis) { |
213 | 0 | current_min_distance = dis; |
214 | 0 | current_min_index = j0; |
215 | 0 | } |
216 | 0 | } |
217 | | |
218 | | // done |
219 | 101M | res.add_result(i + nx_k, current_min_distance, current_min_index); |
220 | 101M | } |
221 | 12.6M | } Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm1ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm1ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm2ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm2ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm3ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm3ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm4ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm4ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm5ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm5ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm6ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm6ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm7ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm7ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm8ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Line | Count | Source | 67 | 12.6M | const size_t i) { | 68 | 12.6M | const size_t ny_p = | 69 | 12.6M | (ny / (8 * NY_POINTS_PER_LOOP)) * (8 * NY_POINTS_PER_LOOP); | 70 | | | 71 | | // compute | 72 | 12.6M | const float* const __restrict xd_0 = x + i * DIM; | 73 | | | 74 | | // prefetch the next point | 75 | 12.6M | #if defined(__AVX2__) | 76 | 12.6M | _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA); | 77 | 12.6M | #endif | 78 | | | 79 | | // load a single point from x | 80 | | // load -2 * value | 81 | 12.6M | simd8float32 x_i[NX_POINTS_PER_LOOP][DIM]; | 82 | 113M | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { | 83 | 883M | for (size_t dd = 0; dd < DIM; dd++) { | 84 | 782M | x_i[nx_k][dd] = simd8float32(-2 * *(xd_0 + nx_k * DIM + dd)); | 85 | 782M | } | 86 | 100M | } | 87 | | | 88 | | // compute x_norm | 89 | 12.6M | float x_norm_i[NX_POINTS_PER_LOOP]; | 90 | 113M | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { | 91 | 101M | x_norm_i[nx_k] = l2_sqr<DIM>(xd_0 + nx_k * DIM); | 92 | 101M | } | 93 | | | 94 | | // distances and indices | 95 | 12.6M | simd8float32 min_distances_i[NX_POINTS_PER_LOOP]; | 96 | 113M | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { | 97 | 101M | min_distances_i[nx_k] = | 98 | 101M | simd8float32(res.dis_tab[i + nx_k] - x_norm_i[nx_k]); | 99 | 101M | } | 100 | | | 101 | 12.6M | simd8uint32 min_indices_i[NX_POINTS_PER_LOOP]; | 102 | 113M | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { | 103 | 101M | min_indices_i[nx_k] = simd8uint32((uint32_t)0); | 104 | 101M | } | 105 | | | 106 | | // | 107 | 12.6M | simd8uint32 current_indices = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7); | 108 | 12.6M | const simd8uint32 indices_delta = simd8uint32(8); | 109 | | | 110 | | // main loop | 111 | 12.6M | size_t j = 0; | 112 | 413M | for (; j < ny_p; j += NY_POINTS_PER_LOOP * 8) { | 113 | | // compute dot products for NX_POINTS from x and NY_POINTS from y | 114 | | // technically, we're multiplying -2x and y | 115 | 401M | simd8float32 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP]; | 116 | | | 117 | | // DIM 0 that uses MUL | 118 | 795M | for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { | 119 | 394M | simd8float32 y_i = | 120 | 394M | simd8float32(y_transposed + j + ny_k * 8 + ny * 0); | 121 | 3.34G | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { | 122 | 2.95G | dp_i[nx_k][ny_k] = x_i[nx_k][0] * y_i; | 123 | 2.95G | } | 124 | 394M | } | 125 | | | 126 | | // other DIMs that use FMA | 127 | 2.86G | for (size_t dd = 1; dd < DIM; dd++) { | 128 | 4.92G | for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { | 129 | 2.46G | simd8float32 y_i = | 130 | 2.46G | simd8float32(y_transposed + j + ny_k * 8 + ny * dd); | 131 | | | 132 | 13.1G | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { | 133 | 10.6G | dp_i[nx_k][ny_k] = | 134 | 10.6G | fmadd(x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]); | 135 | 10.6G | } | 136 | 2.46G | } | 137 | 2.45G | } | 138 | | | 139 | | // compute y^2 + (-2x,y) | 140 | 798M | for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { | 141 | 397M | simd8float32 y_l2_sqr = simd8float32(y_norms + j + ny_k * 8); | 142 | | | 143 | 3.34G | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { | 144 | 2.94G | dp_i[nx_k][ny_k] = dp_i[nx_k][ny_k] + y_l2_sqr; | 145 | 2.94G | } | 146 | 397M | } | 147 | | | 148 | | // do the comparisons and alter the min indices | 149 | 800M | for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { | 150 | 3.53G | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { | 151 | | // cmpps | 152 | 3.13G | cmplt_and_blend_inplace( | 153 | 3.13G | dp_i[nx_k][ny_k], | 154 | 3.13G | current_indices, | 155 | 3.13G | min_distances_i[nx_k], | 156 | 3.13G | min_indices_i[nx_k]); | 157 | 3.13G | } | 158 | | | 159 | 399M | current_indices = current_indices + indices_delta; | 160 | 399M | } | 161 | 401M | } | 162 | | | 163 | | // dump values and find the minimum distance / minimum index | 164 | 114M | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { | 165 | 101M | float min_distances_scalar[8]; | 166 | 101M | uint32_t min_indices_scalar[8]; | 167 | | | 168 | 101M | min_distances_i[nx_k].storeu(min_distances_scalar); | 169 | 101M | min_indices_i[nx_k].storeu(min_indices_scalar); | 170 | | | 171 | 101M | float current_min_distance = res.dis_tab[i + nx_k]; | 172 | 101M | uint32_t current_min_index = res.ids_tab[i + nx_k]; | 173 | | | 174 | | // This unusual comparison is needed to maintain the behavior | 175 | | // of the original implementation: if two indices are | 176 | | // represented with equal distance values, then | 177 | | // the index with the min value is returned. | 178 | 911M | for (size_t jv = 0; jv < 8; jv++) { | 179 | | // add missing x_norms[i] | 180 | 810M | float distance_candidate = | 181 | 810M | min_distances_scalar[jv] + x_norm_i[nx_k]; | 182 | | | 183 | | // negative values can occur for identical vectors | 184 | | // due to roundoff errors. | 185 | 810M | if (distance_candidate < 0) { | 186 | 2.25M | distance_candidate = 0; | 187 | 2.25M | } | 188 | | | 189 | 810M | const int64_t index_candidate = min_indices_scalar[jv]; | 190 | | | 191 | 810M | if (current_min_distance > distance_candidate) { | 192 | 267M | current_min_distance = distance_candidate; | 193 | 267M | current_min_index = index_candidate; | 194 | 542M | } else if ( | 195 | 542M | current_min_distance == distance_candidate && | 196 | 542M | current_min_index > index_candidate) { | 197 | 74.3k | current_min_index = index_candidate; | 198 | 74.3k | } | 199 | 810M | } | 200 | | | 201 | | // process leftovers | 202 | 101M | for (size_t j0 = j; j0 < ny; j0++) { | 203 | 0 | const float dp = | 204 | 0 | dot_product<DIM>(x + (i + nx_k) * DIM, y + j0 * DIM); | 205 | 0 | float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp; | 206 | | // negative values can occur for identical vectors | 207 | | // due to roundoff errors. | 208 | 0 | if (dis < 0) { | 209 | 0 | dis = 0; | 210 | 0 | } | 211 | |
| 212 | 0 | if (current_min_distance > dis) { | 213 | 0 | current_min_distance = dis; | 214 | 0 | current_min_index = j0; | 215 | 0 | } | 216 | 0 | } | 217 | | | 218 | | // done | 219 | 101M | res.add_result(i + nx_k, current_min_distance, current_min_index); | 220 | 101M | } | 221 | 12.6M | } |
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm8ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm9ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm9ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm10ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm10ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm11ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm11ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm12ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm12ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm13ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm13ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm14ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm14ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm15ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm15ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm16ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm16ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m |
222 | | |
223 | | template <size_t DIM, size_t NX_POINTS_PER_LOOP, size_t NY_POINTS_PER_LOOP> |
224 | | void exhaustive_L2sqr_fused_cmax( |
225 | | const float* const __restrict x, |
226 | | const float* const __restrict y, |
227 | | size_t nx, |
228 | | size_t ny, |
229 | | Top1BlockResultHandler<CMax<float, int64_t>>& res, |
230 | 3.40k | const float* __restrict y_norms) { |
231 | | // BLAS does not like empty matrices |
232 | 3.40k | if (nx == 0 || ny == 0) { |
233 | 0 | return; |
234 | 0 | } |
235 | | |
236 | | // compute norms for y |
237 | 3.40k | std::unique_ptr<float[]> del2; |
238 | 3.40k | if (!y_norms) { |
239 | 3.40k | float* y_norms2 = new float[ny]; |
240 | 3.40k | del2.reset(y_norms2); |
241 | | |
242 | 825k | for (size_t i = 0; i < ny; i++) { |
243 | 822k | y_norms2[i] = l2_sqr<DIM>(y + i * DIM); |
244 | 822k | } |
245 | | |
246 | 3.40k | y_norms = y_norms2; |
247 | 3.40k | } |
248 | | |
249 | | // initialize res |
250 | 3.40k | res.begin_multiple(0, nx); |
251 | | |
252 | | // transpose y |
253 | 3.40k | std::vector<float> y_transposed(DIM * ny); |
254 | 30.6k | for (size_t j = 0; j < DIM; j++) { |
255 | 6.60M | for (size_t i = 0; i < ny; i++) { |
256 | 6.57M | y_transposed[j * ny + i] = y[j + i * DIM]; |
257 | 6.57M | } |
258 | 27.2k | } |
259 | | |
260 | 3.40k | const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP; |
261 | | // the main loop. |
262 | 3.40k | #pragma omp parallel for schedule(dynamic) |
263 | 17.7k | for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) { |
264 | 8.86k | kernel<DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP>( |
265 | 8.86k | x, y, y_transposed.data(), ny, res, y_norms, i); |
266 | 8.86k | } Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm1ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm2ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm3ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm4ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm5ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm6ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm7ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm8ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Line | Count | Source | 263 | 17.7k | for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) { | 264 | 8.86k | kernel<DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP>( | 265 | 8.86k | x, y, y_transposed.data(), ny, res, y_norms, i); | 266 | 8.86k | } |
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm9ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm10ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm11ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm12ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm13ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm14ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm15ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm16ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ |
267 | | |
268 | 3.40k | for (size_t i = nx_p; i < nx; i++) { |
269 | 0 | kernel<DIM, 1, NY_POINTS_PER_LOOP>( |
270 | 0 | x, y, y_transposed.data(), ny, res, y_norms, i); |
271 | 0 | } |
272 | | |
273 | | // Does nothing for Top1BlockResultHandler, but |
274 | | // keeping the call for the consistency. |
275 | 3.40k | res.end_multiple(); |
276 | 3.40k | InterruptCallback::check(); |
277 | 3.40k | } Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm1ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm2ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm3ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm4ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm5ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm6ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm7ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm8ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Line | Count | Source | 230 | 3.40k | const float* __restrict y_norms) { | 231 | | // BLAS does not like empty matrices | 232 | 3.40k | if (nx == 0 || ny == 0) { | 233 | 0 | return; | 234 | 0 | } | 235 | | | 236 | | // compute norms for y | 237 | 3.40k | std::unique_ptr<float[]> del2; | 238 | 3.40k | if (!y_norms) { | 239 | 3.40k | float* y_norms2 = new float[ny]; | 240 | 3.40k | del2.reset(y_norms2); | 241 | | | 242 | 825k | for (size_t i = 0; i < ny; i++) { | 243 | 822k | y_norms2[i] = l2_sqr<DIM>(y + i * DIM); | 244 | 822k | } | 245 | | | 246 | 3.40k | y_norms = y_norms2; | 247 | 3.40k | } | 248 | | | 249 | | // initialize res | 250 | 3.40k | res.begin_multiple(0, nx); | 251 | | | 252 | | // transpose y | 253 | 3.40k | std::vector<float> y_transposed(DIM * ny); | 254 | 30.6k | for (size_t j = 0; j < DIM; j++) { | 255 | 6.60M | for (size_t i = 0; i < ny; i++) { | 256 | 6.57M | y_transposed[j * ny + i] = y[j + i * DIM]; | 257 | 6.57M | } | 258 | 27.2k | } | 259 | | | 260 | 3.40k | const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP; | 261 | | // the main loop. | 262 | 3.40k | #pragma omp parallel for schedule(dynamic) | 263 | 3.40k | for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) { | 264 | 3.40k | kernel<DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP>( | 265 | 3.40k | x, y, y_transposed.data(), ny, res, y_norms, i); | 266 | 3.40k | } | 267 | | | 268 | 3.40k | for (size_t i = nx_p; i < nx; i++) { | 269 | 0 | kernel<DIM, 1, NY_POINTS_PER_LOOP>( | 270 | 0 | x, y, y_transposed.data(), ny, res, y_norms, i); | 271 | 0 | } | 272 | | | 273 | | // Does nothing for Top1BlockResultHandler, but | 274 | | // keeping the call for the consistency. | 275 | 3.40k | res.end_multiple(); | 276 | 3.40k | InterruptCallback::check(); | 277 | 3.40k | } |
Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm9ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm10ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm11ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm12ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm13ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm14ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm15ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm16ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ |
278 | | |
279 | | } // namespace |
280 | | |
281 | | bool exhaustive_L2sqr_fused_cmax_simdlib( |
282 | | const float* x, |
283 | | const float* y, |
284 | | size_t d, |
285 | | size_t nx, |
286 | | size_t ny, |
287 | | Top1BlockResultHandler<CMax<float, int64_t>>& res, |
288 | 3.57k | const float* y_norms) { |
289 | | // Process only cases with certain dimensionalities. |
290 | | // An acceptable dimensionality value is limited by the number of |
291 | | // available registers. |
292 | | |
293 | 3.57k | #define DISPATCH(DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP) \ |
294 | 3.57k | case DIM: { \ |
295 | 3.40k | exhaustive_L2sqr_fused_cmax< \ |
296 | 3.40k | DIM, \ |
297 | 3.40k | NX_POINTS_PER_LOOP, \ |
298 | 3.40k | NY_POINTS_PER_LOOP>(x, y, nx, ny, res, y_norms); \ |
299 | 3.40k | return true; \ |
300 | 3.40k | } |
301 | | |
302 | | // faiss/benchs/bench_quantizer.py was used for benchmarking |
303 | | // and tuning 2nd and 3rd parameters values. |
304 | | // Basically, the larger the values for 2nd and 3rd parameters are, |
305 | | // the faster the execution is, but the more SIMD registers are needed. |
306 | | // This can be compensated with L1 cache, this is why this |
307 | | // code might operate with more registers than available |
308 | | // because of concurrent ports operations for ALU and LOAD/STORE. |
309 | | |
310 | 3.57k | #if defined(__AVX2__) |
311 | | // It was possible to tweak these parameters on x64 machine. |
312 | 3.57k | switch (d) { |
313 | 0 | DISPATCH(1, 6, 1) |
314 | 0 | DISPATCH(2, 6, 1) |
315 | 0 | DISPATCH(3, 6, 1) |
316 | 0 | DISPATCH(4, 8, 1) |
317 | 0 | DISPATCH(5, 8, 1) |
318 | 0 | DISPATCH(6, 8, 1) |
319 | 0 | DISPATCH(7, 8, 1) |
320 | 3.40k | DISPATCH(8, 8, 1) |
321 | 0 | DISPATCH(9, 8, 1) |
322 | 0 | DISPATCH(10, 8, 1) |
323 | 0 | DISPATCH(11, 8, 1) |
324 | 0 | DISPATCH(12, 8, 1) |
325 | 0 | DISPATCH(13, 6, 1) |
326 | 0 | DISPATCH(14, 6, 1) |
327 | 0 | DISPATCH(15, 6, 1) |
328 | 3.57k | DISPATCH(16, 6, 1) |
329 | 3.57k | } |
330 | | #else |
331 | | // Please feel free to alter 2nd and 3rd parameters if you have access |
332 | | // to ARM-based machine so that you are able to benchmark this code. |
333 | | // Or to enable other dimensions. |
334 | | switch (d) { |
335 | | DISPATCH(1, 4, 2) |
336 | | DISPATCH(2, 2, 2) |
337 | | DISPATCH(3, 2, 2) |
338 | | DISPATCH(4, 2, 1) |
339 | | DISPATCH(5, 1, 1) |
340 | | DISPATCH(6, 1, 1) |
341 | | DISPATCH(7, 1, 1) |
342 | | DISPATCH(8, 1, 1) |
343 | | } |
344 | | #endif |
345 | | |
346 | 179 | return false; |
347 | 3.57k | #undef DISPATCH |
348 | 3.57k | } |
349 | | |
350 | | } // namespace faiss |
351 | | |
352 | | #endif |