contrib/faiss/faiss/utils/distances_fused/simdlib_based.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | // -*- c++ -*- |
9 | | |
10 | | #include <faiss/utils/distances_fused/simdlib_based.h> |
11 | | |
12 | | #if defined(__AVX2__) || defined(__aarch64__) |
13 | | |
14 | | #include <faiss/utils/simdlib.h> |
15 | | |
16 | | #if defined(__AVX2__) |
17 | | #include <immintrin.h> |
18 | | #endif |
19 | | |
20 | | namespace faiss { |
21 | | |
22 | | namespace { |
23 | | |
24 | | // It makes sense to like to overload certain cases because the further |
25 | | // kernels are in need of registers. So, let's tell compiler |
26 | | // not to waste registers on a bit faster code, if needed. |
27 | | template <size_t DIM> |
28 | 0 | float l2_sqr(const float* const x) { |
29 | | // compiler should be smart enough to handle that |
30 | 0 | float output = x[0] * x[0]; |
31 | 0 | for (size_t i = 1; i < DIM; i++) { |
32 | 0 | output += x[i] * x[i]; |
33 | 0 | } |
34 | |
|
35 | 0 | return output; |
36 | 0 | } Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm1EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm2EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm3EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm4EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm5EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm6EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm7EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm8EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm9EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm10EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm11EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm12EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm13EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm14EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm15EEEfPKf Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16l2_sqrILm16EEEfPKf |
37 | | |
38 | | template <size_t DIM> |
39 | | float dot_product( |
40 | | const float* const __restrict x, |
41 | 0 | const float* const __restrict y) { |
42 | | // compiler should be smart enough to handle that |
43 | 0 | float output = x[0] * y[0]; |
44 | 0 | for (size_t i = 1; i < DIM; i++) { |
45 | 0 | output += x[i] * y[i]; |
46 | 0 | } |
47 | |
|
48 | 0 | return output; |
49 | 0 | } Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm1EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm2EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm3EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm4EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm5EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm6EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm7EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm8EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm9EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm10EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm11EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm12EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm13EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm14EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm15EEEfPKfS3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_111dot_productILm16EEEfPKfS3_ |
50 | | |
51 | | // The kernel for low dimensionality vectors. |
52 | | // Finds the closest one from y for every given NX_POINTS_PER_LOOP points from x |
53 | | // |
54 | | // DIM is the dimensionality of the data |
55 | | // NX_POINTS_PER_LOOP is the number of x points that get processed |
56 | | // simultaneously. |
57 | | // NY_POINTS_PER_LOOP is the number of y points that get processed |
58 | | // simultaneously. |
59 | | template <size_t DIM, size_t NX_POINTS_PER_LOOP, size_t NY_POINTS_PER_LOOP> |
60 | | void kernel( |
61 | | const float* const __restrict x, |
62 | | const float* const __restrict y, |
63 | | const float* const __restrict y_transposed, |
64 | | const size_t ny, |
65 | | Top1BlockResultHandler<CMax<float, int64_t>>& res, |
66 | | const float* __restrict y_norms, |
67 | 0 | const size_t i) { |
68 | 0 | const size_t ny_p = |
69 | 0 | (ny / (8 * NY_POINTS_PER_LOOP)) * (8 * NY_POINTS_PER_LOOP); |
70 | | |
71 | | // compute |
72 | 0 | const float* const __restrict xd_0 = x + i * DIM; |
73 | | |
74 | | // prefetch the next point |
75 | 0 | #if defined(__AVX2__) |
76 | 0 | _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA); |
77 | 0 | #endif |
78 | | |
79 | | // load a single point from x |
80 | | // load -2 * value |
81 | 0 | simd8float32 x_i[NX_POINTS_PER_LOOP][DIM]; |
82 | 0 | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
83 | 0 | for (size_t dd = 0; dd < DIM; dd++) { |
84 | 0 | x_i[nx_k][dd] = simd8float32(-2 * *(xd_0 + nx_k * DIM + dd)); |
85 | 0 | } |
86 | 0 | } |
87 | | |
88 | | // compute x_norm |
89 | 0 | float x_norm_i[NX_POINTS_PER_LOOP]; |
90 | 0 | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
91 | 0 | x_norm_i[nx_k] = l2_sqr<DIM>(xd_0 + nx_k * DIM); |
92 | 0 | } |
93 | | |
94 | | // distances and indices |
95 | 0 | simd8float32 min_distances_i[NX_POINTS_PER_LOOP]; |
96 | 0 | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
97 | 0 | min_distances_i[nx_k] = |
98 | 0 | simd8float32(res.dis_tab[i + nx_k] - x_norm_i[nx_k]); |
99 | 0 | } |
100 | |
|
101 | 0 | simd8uint32 min_indices_i[NX_POINTS_PER_LOOP]; |
102 | 0 | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
103 | 0 | min_indices_i[nx_k] = simd8uint32((uint32_t)0); |
104 | 0 | } |
105 | | |
106 | | // |
107 | 0 | simd8uint32 current_indices = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7); |
108 | 0 | const simd8uint32 indices_delta = simd8uint32(8); |
109 | | |
110 | | // main loop |
111 | 0 | size_t j = 0; |
112 | 0 | for (; j < ny_p; j += NY_POINTS_PER_LOOP * 8) { |
113 | | // compute dot products for NX_POINTS from x and NY_POINTS from y |
114 | | // technically, we're multiplying -2x and y |
115 | 0 | simd8float32 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP]; |
116 | | |
117 | | // DIM 0 that uses MUL |
118 | 0 | for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { |
119 | 0 | simd8float32 y_i = |
120 | 0 | simd8float32(y_transposed + j + ny_k * 8 + ny * 0); |
121 | 0 | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
122 | 0 | dp_i[nx_k][ny_k] = x_i[nx_k][0] * y_i; |
123 | 0 | } |
124 | 0 | } |
125 | | |
126 | | // other DIMs that use FMA |
127 | 0 | for (size_t dd = 1; dd < DIM; dd++) { |
128 | 0 | for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { |
129 | 0 | simd8float32 y_i = |
130 | 0 | simd8float32(y_transposed + j + ny_k * 8 + ny * dd); |
131 | |
|
132 | 0 | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
133 | 0 | dp_i[nx_k][ny_k] = |
134 | 0 | fmadd(x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]); |
135 | 0 | } |
136 | 0 | } |
137 | 0 | } |
138 | | |
139 | | // compute y^2 + (-2x,y) |
140 | 0 | for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { |
141 | 0 | simd8float32 y_l2_sqr = simd8float32(y_norms + j + ny_k * 8); |
142 | |
|
143 | 0 | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
144 | 0 | dp_i[nx_k][ny_k] = dp_i[nx_k][ny_k] + y_l2_sqr; |
145 | 0 | } |
146 | 0 | } |
147 | | |
148 | | // do the comparisons and alter the min indices |
149 | 0 | for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { |
150 | 0 | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
151 | | // cmpps |
152 | 0 | cmplt_and_blend_inplace( |
153 | 0 | dp_i[nx_k][ny_k], |
154 | 0 | current_indices, |
155 | 0 | min_distances_i[nx_k], |
156 | 0 | min_indices_i[nx_k]); |
157 | 0 | } |
158 | |
|
159 | 0 | current_indices = current_indices + indices_delta; |
160 | 0 | } |
161 | 0 | } |
162 | | |
163 | | // dump values and find the minimum distance / minimum index |
164 | 0 | for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { |
165 | 0 | float min_distances_scalar[8]; |
166 | 0 | uint32_t min_indices_scalar[8]; |
167 | |
|
168 | 0 | min_distances_i[nx_k].storeu(min_distances_scalar); |
169 | 0 | min_indices_i[nx_k].storeu(min_indices_scalar); |
170 | |
|
171 | 0 | float current_min_distance = res.dis_tab[i + nx_k]; |
172 | 0 | uint32_t current_min_index = res.ids_tab[i + nx_k]; |
173 | | |
174 | | // This unusual comparison is needed to maintain the behavior |
175 | | // of the original implementation: if two indices are |
176 | | // represented with equal distance values, then |
177 | | // the index with the min value is returned. |
178 | 0 | for (size_t jv = 0; jv < 8; jv++) { |
179 | | // add missing x_norms[i] |
180 | 0 | float distance_candidate = |
181 | 0 | min_distances_scalar[jv] + x_norm_i[nx_k]; |
182 | | |
183 | | // negative values can occur for identical vectors |
184 | | // due to roundoff errors. |
185 | 0 | if (distance_candidate < 0) { |
186 | 0 | distance_candidate = 0; |
187 | 0 | } |
188 | |
|
189 | 0 | const int64_t index_candidate = min_indices_scalar[jv]; |
190 | |
|
191 | 0 | if (current_min_distance > distance_candidate) { |
192 | 0 | current_min_distance = distance_candidate; |
193 | 0 | current_min_index = index_candidate; |
194 | 0 | } else if ( |
195 | 0 | current_min_distance == distance_candidate && |
196 | 0 | current_min_index > index_candidate) { |
197 | 0 | current_min_index = index_candidate; |
198 | 0 | } |
199 | 0 | } |
200 | | |
201 | | // process leftovers |
202 | 0 | for (size_t j0 = j; j0 < ny; j0++) { |
203 | 0 | const float dp = |
204 | 0 | dot_product<DIM>(x + (i + nx_k) * DIM, y + j0 * DIM); |
205 | 0 | float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp; |
206 | | // negative values can occur for identical vectors |
207 | | // due to roundoff errors. |
208 | 0 | if (dis < 0) { |
209 | 0 | dis = 0; |
210 | 0 | } |
211 | |
|
212 | 0 | if (current_min_distance > dis) { |
213 | 0 | current_min_distance = dis; |
214 | 0 | current_min_index = j0; |
215 | 0 | } |
216 | 0 | } |
217 | | |
218 | | // done |
219 | 0 | res.add_result(i + nx_k, current_min_distance, current_min_index); |
220 | 0 | } |
221 | 0 | } Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm1ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm1ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm2ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm2ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm3ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm3ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm4ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm4ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm5ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm5ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm6ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm6ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm7ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm7ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm8ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm8ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm9ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm9ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm10ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm10ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm11ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm11ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm12ELm8ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm12ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm13ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm13ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm14ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm14ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm15ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm15ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm16ELm6ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_16kernelILm16ELm1ELm1EEEvPKfS3_S3_mRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_m |
222 | | |
223 | | template <size_t DIM, size_t NX_POINTS_PER_LOOP, size_t NY_POINTS_PER_LOOP> |
224 | | void exhaustive_L2sqr_fused_cmax( |
225 | | const float* const __restrict x, |
226 | | const float* const __restrict y, |
227 | | size_t nx, |
228 | | size_t ny, |
229 | | Top1BlockResultHandler<CMax<float, int64_t>>& res, |
230 | 0 | const float* __restrict y_norms) { |
231 | | // BLAS does not like empty matrices |
232 | 0 | if (nx == 0 || ny == 0) { |
233 | 0 | return; |
234 | 0 | } |
235 | | |
236 | | // compute norms for y |
237 | 0 | std::unique_ptr<float[]> del2; |
238 | 0 | if (!y_norms) { |
239 | 0 | float* y_norms2 = new float[ny]; |
240 | 0 | del2.reset(y_norms2); |
241 | |
|
242 | 0 | for (size_t i = 0; i < ny; i++) { |
243 | 0 | y_norms2[i] = l2_sqr<DIM>(y + i * DIM); |
244 | 0 | } |
245 | |
|
246 | 0 | y_norms = y_norms2; |
247 | 0 | } |
248 | | |
249 | | // initialize res |
250 | 0 | res.begin_multiple(0, nx); |
251 | | |
252 | | // transpose y |
253 | 0 | std::vector<float> y_transposed(DIM * ny); |
254 | 0 | for (size_t j = 0; j < DIM; j++) { |
255 | 0 | for (size_t i = 0; i < ny; i++) { |
256 | 0 | y_transposed[j * ny + i] = y[j + i * DIM]; |
257 | 0 | } |
258 | 0 | } |
259 | |
|
260 | 0 | const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP; |
261 | | // the main loop. |
262 | 0 | #pragma omp parallel for schedule(dynamic) |
263 | 0 | for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) { |
264 | 0 | kernel<DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP>( |
265 | 0 | x, y, y_transposed.data(), ny, res, y_norms, i); |
266 | 0 | } Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm1ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm2ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm3ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm4ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm5ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm6ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm7ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm8ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm9ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm10ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm11ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm12ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm13ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm14ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm15ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm16ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_.omp_outlined_debug__ |
267 | |
|
268 | 0 | for (size_t i = nx_p; i < nx; i++) { |
269 | 0 | kernel<DIM, 1, NY_POINTS_PER_LOOP>( |
270 | 0 | x, y, y_transposed.data(), ny, res, y_norms, i); |
271 | 0 | } |
272 | | |
273 | | // Does nothing for Top1BlockResultHandler, but |
274 | | // keeping the call for the consistency. |
275 | 0 | res.end_multiple(); |
276 | 0 | InterruptCallback::check(); |
277 | 0 | } Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm1ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm2ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm3ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm4ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm5ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm6ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm7ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm8ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm9ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm10ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm11ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm12ELm8ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm13ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm14ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm15ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ Unexecuted instantiation: simdlib_based.cpp:_ZN5faiss12_GLOBAL__N_127exhaustive_L2sqr_fused_cmaxILm16ELm6ELm1EEEvPKfS3_mmRNS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEES3_ |
278 | | |
279 | | } // namespace |
280 | | |
281 | | bool exhaustive_L2sqr_fused_cmax_simdlib( |
282 | | const float* x, |
283 | | const float* y, |
284 | | size_t d, |
285 | | size_t nx, |
286 | | size_t ny, |
287 | | Top1BlockResultHandler<CMax<float, int64_t>>& res, |
288 | 157 | const float* y_norms) { |
289 | | // Process only cases with certain dimensionalities. |
290 | | // An acceptable dimensionality value is limited by the number of |
291 | | // available registers. |
292 | | |
293 | 157 | #define DISPATCH(DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP) \ |
294 | 157 | case DIM: { \ |
295 | 0 | exhaustive_L2sqr_fused_cmax< \ |
296 | 0 | DIM, \ |
297 | 0 | NX_POINTS_PER_LOOP, \ |
298 | 0 | NY_POINTS_PER_LOOP>(x, y, nx, ny, res, y_norms); \ |
299 | 0 | return true; \ |
300 | 0 | } |
301 | | |
302 | | // faiss/benchs/bench_quantizer.py was used for benchmarking |
303 | | // and tuning 2nd and 3rd parameters values. |
304 | | // Basically, the larger the values for 2nd and 3rd parameters are, |
305 | | // the faster the execution is, but the more SIMD registers are needed. |
306 | | // This can be compensated with L1 cache, this is why this |
307 | | // code might operate with more registers than available |
308 | | // because of concurrent ports operations for ALU and LOAD/STORE. |
309 | | |
310 | 157 | #if defined(__AVX2__) |
311 | | // It was possible to tweak these parameters on x64 machine. |
312 | 157 | switch (d) { |
313 | 0 | DISPATCH(1, 6, 1) |
314 | 0 | DISPATCH(2, 6, 1) |
315 | 0 | DISPATCH(3, 6, 1) |
316 | 0 | DISPATCH(4, 8, 1) |
317 | 0 | DISPATCH(5, 8, 1) |
318 | 0 | DISPATCH(6, 8, 1) |
319 | 0 | DISPATCH(7, 8, 1) |
320 | 0 | DISPATCH(8, 8, 1) |
321 | 0 | DISPATCH(9, 8, 1) |
322 | 0 | DISPATCH(10, 8, 1) |
323 | 0 | DISPATCH(11, 8, 1) |
324 | 0 | DISPATCH(12, 8, 1) |
325 | 0 | DISPATCH(13, 6, 1) |
326 | 0 | DISPATCH(14, 6, 1) |
327 | 0 | DISPATCH(15, 6, 1) |
328 | 157 | DISPATCH(16, 6, 1) |
329 | 157 | } |
330 | | #else |
331 | | // Please feel free to alter 2nd and 3rd parameters if you have access |
332 | | // to ARM-based machine so that you are able to benchmark this code. |
333 | | // Or to enable other dimensions. |
334 | | switch (d) { |
335 | | DISPATCH(1, 4, 2) |
336 | | DISPATCH(2, 2, 2) |
337 | | DISPATCH(3, 2, 2) |
338 | | DISPATCH(4, 2, 1) |
339 | | DISPATCH(5, 1, 1) |
340 | | DISPATCH(6, 1, 1) |
341 | | DISPATCH(7, 1, 1) |
342 | | DISPATCH(8, 1, 1) |
343 | | } |
344 | | #endif |
345 | | |
346 | 157 | return false; |
347 | 157 | #undef DISPATCH |
348 | 157 | } |
349 | | |
350 | | } // namespace faiss |
351 | | |
352 | | #endif |