/root/doris/contrib/faiss/faiss/utils/distances.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | #include <faiss/utils/distances.h> |
9 | | |
10 | | #include <algorithm> |
11 | | #include <cassert> |
12 | | #include <cmath> |
13 | | #include <cstddef> |
14 | | #include <cstdio> |
15 | | #include <cstring> |
16 | | |
17 | | #include <omp.h> |
18 | | |
19 | | #ifdef __AVX2__ |
20 | | #include <immintrin.h> |
21 | | #elif defined(__ARM_FEATURE_SVE) |
22 | | #include <arm_sve.h> |
23 | | #endif |
24 | | |
25 | | #include <faiss/impl/AuxIndexStructures.h> |
26 | | #include <faiss/impl/FaissAssert.h> |
27 | | #include <faiss/impl/IDSelector.h> |
28 | | #include <faiss/impl/ResultHandler.h> |
29 | | |
30 | | #include <faiss/utils/distances_fused/distances_fused.h> |
31 | | |
32 | | #ifndef FINTEGER |
33 | | #define FINTEGER long |
34 | | #endif |
35 | | |
36 | | extern "C" { |
37 | | |
38 | | /* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */ |
39 | | |
40 | | int sgemm_( |
41 | | const char* transa, |
42 | | const char* transb, |
43 | | FINTEGER* m, |
44 | | FINTEGER* n, |
45 | | FINTEGER* k, |
46 | | const float* alpha, |
47 | | const float* a, |
48 | | FINTEGER* lda, |
49 | | const float* b, |
50 | | FINTEGER* ldb, |
51 | | float* beta, |
52 | | float* c, |
53 | | FINTEGER* ldc); |
54 | | } |
55 | | |
56 | | namespace faiss { |
57 | | |
58 | | /*************************************************************************** |
59 | | * Matrix/vector ops |
60 | | ***************************************************************************/ |
61 | | |
62 | | /* Compute the L2 norm of a set of nx vectors */ |
63 | | void fvec_norms_L2( |
64 | | float* __restrict nr, |
65 | | const float* __restrict x, |
66 | | size_t d, |
67 | 0 | size_t nx) { |
68 | 0 | #pragma omp parallel for if (nx > 10000) |
69 | 0 | for (int64_t i = 0; i < nx; i++) { |
70 | 0 | nr[i] = sqrtf(fvec_norm_L2sqr(x + i * d, d)); |
71 | 0 | } |
72 | 0 | } |
73 | | |
74 | | void fvec_norms_L2sqr( |
75 | | float* __restrict nr, |
76 | | const float* __restrict x, |
77 | | size_t d, |
78 | 0 | size_t nx) { |
79 | 0 | #pragma omp parallel for if (nx > 10000) |
80 | 0 | for (int64_t i = 0; i < nx; i++) |
81 | 0 | nr[i] = fvec_norm_L2sqr(x + i * d, d); |
82 | 0 | } |
83 | | |
84 | | // The following is a workaround to a problem |
85 | | // in OpenMP in fbcode. The crash occurs |
86 | | // inside OMP when IndexIVFSpectralHash::set_query() |
87 | | // calls fvec_renorm_L2. set_query() is always |
88 | | // calling this function with nx == 1, so even |
89 | | // the omp version should run single threaded, |
90 | | // as per the if condition of the omp pragma. |
91 | | // Instead, the omp version crashes inside OMP. |
92 | | // The workaround below is explicitly branching |
93 | | // off to a codepath without omp. |
94 | | |
95 | | #define FVEC_RENORM_L2_IMPL \ |
96 | 0 | float* __restrict xi = x + i * d; \ |
97 | 0 | \ |
98 | 0 | float nr = fvec_norm_L2sqr(xi, d); \ |
99 | 0 | \ |
100 | 0 | if (nr > 0) { \ |
101 | 0 | size_t j; \ |
102 | 0 | const float inv_nr = 1.0 / sqrtf(nr); \ |
103 | 0 | for (j = 0; j < d; j++) \ |
104 | 0 | xi[j] *= inv_nr; \ |
105 | 0 | } |
106 | | |
107 | 0 | void fvec_renorm_L2_noomp(size_t d, size_t nx, float* __restrict x) { |
108 | 0 | for (int64_t i = 0; i < nx; i++) { |
109 | 0 | FVEC_RENORM_L2_IMPL |
110 | 0 | } |
111 | 0 | } |
112 | | |
113 | 0 | void fvec_renorm_L2_omp(size_t d, size_t nx, float* __restrict x) { |
114 | 0 | #pragma omp parallel for if (nx > 10000) |
115 | 0 | for (int64_t i = 0; i < nx; i++) { |
116 | 0 | FVEC_RENORM_L2_IMPL |
117 | 0 | } |
118 | 0 | } |
119 | | |
120 | 0 | void fvec_renorm_L2(size_t d, size_t nx, float* __restrict x) { |
121 | 0 | if (nx <= 10000) { |
122 | 0 | fvec_renorm_L2_noomp(d, nx, x); |
123 | 0 | } else { |
124 | 0 | fvec_renorm_L2_omp(d, nx, x); |
125 | 0 | } |
126 | 0 | } |
127 | | |
128 | | /*************************************************************************** |
129 | | * KNN functions |
130 | | ***************************************************************************/ |
131 | | |
132 | | namespace { |
133 | | |
134 | | /* Find the nearest neighbors for nx queries in a set of ny vectors */ |
135 | | template <class BlockResultHandler> |
136 | | void exhaustive_inner_product_seq( |
137 | | const float* x, |
138 | | const float* y, |
139 | | size_t d, |
140 | | size_t nx, |
141 | | size_t ny, |
142 | 3 | BlockResultHandler& res) { |
143 | 3 | using SingleResultHandler = |
144 | 3 | typename BlockResultHandler::SingleResultHandler; |
145 | 3 | [[maybe_unused]] int nt = std::min(int(nx), omp_get_max_threads()); |
146 | | |
147 | 3 | #pragma omp parallel num_threads(nt) |
148 | 1.50k | { |
149 | 1.50k | SingleResultHandler resi(res); |
150 | 1.50k | #pragma omp for |
151 | 1.50k | for (int64_t i = 0; i < nx; i++) { |
152 | 1.50k | const float* x_i = x + i * d; |
153 | 1.50k | const float* y_j = y; |
154 | | |
155 | 1.50k | resi.begin(i); |
156 | | |
157 | 1.50k | for (size_t j = 0; j < ny; j++, y_j += d) { |
158 | 1.50k | if (!res.is_in_selection(j)) { |
159 | 1.50k | continue; |
160 | 1.50k | } |
161 | 1.50k | float ip = fvec_inner_product(x_i, y_j, d); |
162 | 1.50k | resi.add_result(ip, j); |
163 | 1.50k | } |
164 | 1.50k | resi.end(); |
165 | 1.50k | } |
166 | 1.50k | } Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Line | Count | Source | 148 | 1.50k | { | 149 | 1.50k | SingleResultHandler resi(res); | 150 | 1.50k | #pragma omp for | 151 | 1.50k | for (int64_t i = 0; i < nx; i++) { | 152 | 1.50k | const float* x_i = x + i * d; | 153 | 1.50k | const float* y_j = y; | 154 | | | 155 | 1.50k | resi.begin(i); | 156 | | | 157 | 1.50k | for (size_t j = 0; j < ny; j++, y_j += d) { | 158 | 1.50k | if (!res.is_in_selection(j)) { | 159 | 1.50k | continue; | 160 | 1.50k | } | 161 | 1.50k | float ip = fvec_inner_product(x_i, y_j, d); | 162 | 1.50k | resi.add_result(ip, j); | 163 | 1.50k | } | 164 | 1.50k | resi.end(); | 165 | 1.50k | } | 166 | 1.50k | } |
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ |
167 | 3 | } Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_ distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_ Line | Count | Source | 142 | 3 | BlockResultHandler& res) { | 143 | 3 | using SingleResultHandler = | 144 | 3 | typename BlockResultHandler::SingleResultHandler; | 145 | 3 | [[maybe_unused]] int nt = std::min(int(nx), omp_get_max_threads()); | 146 | | | 147 | 3 | #pragma omp parallel num_threads(nt) | 148 | 3 | { | 149 | 3 | SingleResultHandler resi(res); | 150 | 3 | #pragma omp for | 151 | 3 | for (int64_t i = 0; i < nx; i++) { | 152 | 3 | const float* x_i = x + i * d; | 153 | 3 | const float* y_j = y; | 154 | | | 155 | 3 | resi.begin(i); | 156 | | | 157 | 3 | for (size_t j = 0; j < ny; j++, y_j += d) { | 158 | 3 | if (!res.is_in_selection(j)) { | 159 | 3 | continue; | 160 | 3 | } | 161 | 3 | float ip = fvec_inner_product(x_i, y_j, d); | 162 | 3 | resi.add_result(ip, j); | 163 | 3 | } | 164 | 3 | resi.end(); | 165 | 3 | } | 166 | 3 | } | 167 | 3 | } |
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_ |
168 | | |
169 | | template <class BlockResultHandler> |
170 | | void exhaustive_L2sqr_seq( |
171 | | const float* x, |
172 | | const float* y, |
173 | | size_t d, |
174 | | size_t nx, |
175 | | size_t ny, |
176 | 0 | BlockResultHandler& res) { |
177 | 0 | using SingleResultHandler = |
178 | 0 | typename BlockResultHandler::SingleResultHandler; |
179 | 0 | [[maybe_unused]] int nt = std::min(int(nx), omp_get_max_threads()); |
180 | |
|
181 | 0 | #pragma omp parallel num_threads(nt) |
182 | 0 | { |
183 | 0 | SingleResultHandler resi(res); |
184 | 0 | #pragma omp for |
185 | 0 | for (int64_t i = 0; i < nx; i++) { |
186 | 0 | const float* x_i = x + i * d; |
187 | 0 | const float* y_j = y; |
188 | 0 | resi.begin(i); |
189 | 0 | for (size_t j = 0; j < ny; j++, y_j += d) { |
190 | 0 | if (!res.is_in_selection(j)) { |
191 | 0 | continue; |
192 | 0 | } |
193 | 0 | float disij = fvec_L2sqr(x_i, y_j, d); |
194 | 0 | resi.add_result(disij, j); |
195 | 0 | } |
196 | 0 | resi.end(); |
197 | 0 | } |
198 | 0 | } Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__ |
199 | 0 | } Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_ |
200 | | |
201 | | /** Find the nearest neighbors for nx queries in a set of ny vectors */ |
202 | | template <class BlockResultHandler> |
203 | | void exhaustive_inner_product_blas( |
204 | | const float* x, |
205 | | const float* y, |
206 | | size_t d, |
207 | | size_t nx, |
208 | | size_t ny, |
209 | 0 | BlockResultHandler& res) { |
210 | | // BLAS does not like empty matrices |
211 | 0 | if (nx == 0 || ny == 0) |
212 | 0 | return; |
213 | | |
214 | | /* block sizes */ |
215 | 0 | const size_t bs_x = distance_compute_blas_query_bs; |
216 | 0 | const size_t bs_y = distance_compute_blas_database_bs; |
217 | 0 | std::unique_ptr<float[]> ip_block(new float[bs_x * bs_y]); |
218 | |
|
219 | 0 | for (size_t i0 = 0; i0 < nx; i0 += bs_x) { |
220 | 0 | size_t i1 = i0 + bs_x; |
221 | 0 | if (i1 > nx) |
222 | 0 | i1 = nx; |
223 | |
|
224 | 0 | res.begin_multiple(i0, i1); |
225 | |
|
226 | 0 | for (size_t j0 = 0; j0 < ny; j0 += bs_y) { |
227 | 0 | size_t j1 = j0 + bs_y; |
228 | 0 | if (j1 > ny) |
229 | 0 | j1 = ny; |
230 | | /* compute the actual dot products */ |
231 | 0 | { |
232 | 0 | float one = 1, zero = 0; |
233 | 0 | FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d; |
234 | 0 | sgemm_("Transpose", |
235 | 0 | "Not transpose", |
236 | 0 | &nyi, |
237 | 0 | &nxi, |
238 | 0 | &di, |
239 | 0 | &one, |
240 | 0 | y + j0 * d, |
241 | 0 | &di, |
242 | 0 | x + i0 * d, |
243 | 0 | &di, |
244 | 0 | &zero, |
245 | 0 | ip_block.get(), |
246 | 0 | &nyi); |
247 | 0 | } |
248 | |
|
249 | 0 | res.add_results(j0, j1, ip_block.get()); |
250 | 0 | } |
251 | 0 | res.end_multiple(); |
252 | 0 | InterruptCallback::check(); |
253 | 0 | } |
254 | 0 | } Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_ |
255 | | |
256 | | // distance correction is an operator that can be applied to transform |
257 | | // the distances |
258 | | template <class BlockResultHandler> |
259 | | void exhaustive_L2sqr_blas_default_impl( |
260 | | const float* x, |
261 | | const float* y, |
262 | | size_t d, |
263 | | size_t nx, |
264 | | size_t ny, |
265 | | BlockResultHandler& res, |
266 | 0 | const float* y_norms = nullptr) { |
267 | | // BLAS does not like empty matrices |
268 | 0 | if (nx == 0 || ny == 0) |
269 | 0 | return; |
270 | | |
271 | | /* block sizes */ |
272 | 0 | const size_t bs_x = distance_compute_blas_query_bs; |
273 | 0 | const size_t bs_y = distance_compute_blas_database_bs; |
274 | | // const size_t bs_x = 16, bs_y = 16; |
275 | 0 | std::unique_ptr<float[]> ip_block(new float[bs_x * bs_y]); |
276 | 0 | std::unique_ptr<float[]> x_norms(new float[nx]); |
277 | 0 | std::unique_ptr<float[]> del2; |
278 | |
|
279 | 0 | fvec_norms_L2sqr(x_norms.get(), x, d, nx); |
280 | |
|
281 | 0 | if (!y_norms) { |
282 | 0 | float* y_norms2 = new float[ny]; |
283 | 0 | del2.reset(y_norms2); |
284 | 0 | fvec_norms_L2sqr(y_norms2, y, d, ny); |
285 | 0 | y_norms = y_norms2; |
286 | 0 | } |
287 | |
|
288 | 0 | for (size_t i0 = 0; i0 < nx; i0 += bs_x) { |
289 | 0 | size_t i1 = i0 + bs_x; |
290 | 0 | if (i1 > nx) |
291 | 0 | i1 = nx; |
292 | |
|
293 | 0 | res.begin_multiple(i0, i1); |
294 | |
|
295 | 0 | for (size_t j0 = 0; j0 < ny; j0 += bs_y) { |
296 | 0 | size_t j1 = j0 + bs_y; |
297 | 0 | if (j1 > ny) |
298 | 0 | j1 = ny; |
299 | | /* compute the actual dot products */ |
300 | 0 | { |
301 | 0 | float one = 1, zero = 0; |
302 | 0 | FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d; |
303 | 0 | sgemm_("Transpose", |
304 | 0 | "Not transpose", |
305 | 0 | &nyi, |
306 | 0 | &nxi, |
307 | 0 | &di, |
308 | 0 | &one, |
309 | 0 | y + j0 * d, |
310 | 0 | &di, |
311 | 0 | x + i0 * d, |
312 | 0 | &di, |
313 | 0 | &zero, |
314 | 0 | ip_block.get(), |
315 | 0 | &nyi); |
316 | 0 | } |
317 | 0 | #pragma omp parallel for |
318 | 0 | for (int64_t i = i0; i < i1; i++) { |
319 | 0 | float* ip_line = ip_block.get() + (i - i0) * (j1 - j0); |
320 | |
|
321 | 0 | for (size_t j = j0; j < j1; j++) { |
322 | 0 | float ip = *ip_line; |
323 | 0 | float dis = x_norms[i] + y_norms[j] - 2 * ip; |
324 | |
|
325 | 0 | if (!res.is_in_selection(j)) { |
326 | 0 | dis = HUGE_VALF; |
327 | 0 | } |
328 | | // negative values can occur for identical vectors |
329 | | // due to roundoff errors |
330 | 0 | if (dis < 0) |
331 | 0 | dis = 0; |
332 | |
|
333 | 0 | *ip_line = dis; |
334 | 0 | ip_line++; |
335 | 0 | } |
336 | 0 | } Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__ |
337 | 0 | res.add_results(j0, j1, ip_block.get()); |
338 | 0 | } |
339 | 0 | res.end_multiple(); |
340 | 0 | InterruptCallback::check(); |
341 | 0 | } |
342 | 0 | } Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_ |
343 | | |
344 | | template <class BlockResultHandler> |
345 | | void exhaustive_L2sqr_blas( |
346 | | const float* x, |
347 | | const float* y, |
348 | | size_t d, |
349 | | size_t nx, |
350 | | size_t ny, |
351 | | BlockResultHandler& res, |
352 | 0 | const float* y_norms = nullptr) { |
353 | 0 | exhaustive_L2sqr_blas_default_impl(x, y, d, nx, ny, res); |
354 | 0 | } Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_ |
355 | | |
356 | | #ifdef __AVX2__ |
357 | | void exhaustive_L2sqr_blas_cmax_avx2( |
358 | | const float* x, |
359 | | const float* y, |
360 | | size_t d, |
361 | | size_t nx, |
362 | | size_t ny, |
363 | | Top1BlockResultHandler<CMax<float, int64_t>>& res, |
364 | 0 | const float* y_norms) { |
365 | | // BLAS does not like empty matrices |
366 | 0 | if (nx == 0 || ny == 0) |
367 | 0 | return; |
368 | | |
369 | | /* block sizes */ |
370 | 0 | const size_t bs_x = distance_compute_blas_query_bs; |
371 | 0 | const size_t bs_y = distance_compute_blas_database_bs; |
372 | | // const size_t bs_x = 16, bs_y = 16; |
373 | 0 | std::unique_ptr<float[]> ip_block(new float[bs_x * bs_y]); |
374 | 0 | std::unique_ptr<float[]> x_norms(new float[nx]); |
375 | 0 | std::unique_ptr<float[]> del2; |
376 | |
|
377 | 0 | fvec_norms_L2sqr(x_norms.get(), x, d, nx); |
378 | |
|
379 | 0 | if (!y_norms) { |
380 | 0 | float* y_norms2 = new float[ny]; |
381 | 0 | del2.reset(y_norms2); |
382 | 0 | fvec_norms_L2sqr(y_norms2, y, d, ny); |
383 | 0 | y_norms = y_norms2; |
384 | 0 | } |
385 | |
|
386 | 0 | for (size_t i0 = 0; i0 < nx; i0 += bs_x) { |
387 | 0 | size_t i1 = i0 + bs_x; |
388 | 0 | if (i1 > nx) |
389 | 0 | i1 = nx; |
390 | |
|
391 | 0 | res.begin_multiple(i0, i1); |
392 | |
|
393 | 0 | for (size_t j0 = 0; j0 < ny; j0 += bs_y) { |
394 | 0 | size_t j1 = j0 + bs_y; |
395 | 0 | if (j1 > ny) |
396 | 0 | j1 = ny; |
397 | | /* compute the actual dot products */ |
398 | 0 | { |
399 | 0 | float one = 1, zero = 0; |
400 | 0 | FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d; |
401 | 0 | sgemm_("Transpose", |
402 | 0 | "Not transpose", |
403 | 0 | &nyi, |
404 | 0 | &nxi, |
405 | 0 | &di, |
406 | 0 | &one, |
407 | 0 | y + j0 * d, |
408 | 0 | &di, |
409 | 0 | x + i0 * d, |
410 | 0 | &di, |
411 | 0 | &zero, |
412 | 0 | ip_block.get(), |
413 | 0 | &nyi); |
414 | 0 | } |
415 | 0 | #pragma omp parallel for |
416 | 0 | for (int64_t i = i0; i < i1; i++) { |
417 | 0 | float* ip_line = ip_block.get() + (i - i0) * (j1 - j0); |
418 | |
|
419 | 0 | _mm_prefetch((const char*)ip_line, _MM_HINT_NTA); |
420 | 0 | _mm_prefetch((const char*)(ip_line + 16), _MM_HINT_NTA); |
421 | | |
422 | | // constant |
423 | 0 | const __m256 mul_minus2 = _mm256_set1_ps(-2); |
424 | | |
425 | | // Track 8 min distances + 8 min indices. |
426 | | // All the distances tracked do not take x_norms[i] |
427 | | // into account in order to get rid of extra |
428 | | // _mm256_add_ps(x_norms[i], ...) instructions |
429 | | // is distance computations. |
430 | 0 | __m256 min_distances = |
431 | 0 | _mm256_set1_ps(res.dis_tab[i] - x_norms[i]); |
432 | | |
433 | | // these indices are local and are relative to j0. |
434 | | // so, value 0 means j0. |
435 | 0 | __m256i min_indices = _mm256_set1_epi32(0); |
436 | |
|
437 | 0 | __m256i current_indices = |
438 | 0 | _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); |
439 | 0 | const __m256i indices_delta = _mm256_set1_epi32(8); |
440 | | |
441 | | // current j index |
442 | 0 | size_t idx_j = 0; |
443 | 0 | size_t count = j1 - j0; |
444 | | |
445 | | // process 16 elements per loop |
446 | 0 | for (; idx_j < (count / 16) * 16; idx_j += 16, ip_line += 16) { |
447 | 0 | _mm_prefetch((const char*)(ip_line + 32), _MM_HINT_NTA); |
448 | 0 | _mm_prefetch((const char*)(ip_line + 48), _MM_HINT_NTA); |
449 | | |
450 | | // load values for norms |
451 | 0 | const __m256 y_norm_0 = |
452 | 0 | _mm256_loadu_ps(y_norms + idx_j + j0 + 0); |
453 | 0 | const __m256 y_norm_1 = |
454 | 0 | _mm256_loadu_ps(y_norms + idx_j + j0 + 8); |
455 | | |
456 | | // load values for dot products |
457 | 0 | const __m256 ip_0 = _mm256_loadu_ps(ip_line + 0); |
458 | 0 | const __m256 ip_1 = _mm256_loadu_ps(ip_line + 8); |
459 | | |
460 | | // compute dis = y_norm[j] - 2 * dot(x_norm[i], y_norm[j]). |
461 | | // x_norm[i] was dropped off because it is a constant for a |
462 | | // given i. We'll deal with it later. |
463 | 0 | __m256 distances_0 = |
464 | 0 | _mm256_fmadd_ps(ip_0, mul_minus2, y_norm_0); |
465 | 0 | __m256 distances_1 = |
466 | 0 | _mm256_fmadd_ps(ip_1, mul_minus2, y_norm_1); |
467 | | |
468 | | // compare the new distances to the min distances |
469 | | // for each of the first group of 8 AVX2 components. |
470 | 0 | const __m256 comparison_0 = _mm256_cmp_ps( |
471 | 0 | min_distances, distances_0, _CMP_LE_OS); |
472 | | |
473 | | // update min distances and indices with closest vectors if |
474 | | // needed. |
475 | 0 | min_distances = _mm256_blendv_ps( |
476 | 0 | distances_0, min_distances, comparison_0); |
477 | 0 | min_indices = _mm256_castps_si256(_mm256_blendv_ps( |
478 | 0 | _mm256_castsi256_ps(current_indices), |
479 | 0 | _mm256_castsi256_ps(min_indices), |
480 | 0 | comparison_0)); |
481 | 0 | current_indices = |
482 | 0 | _mm256_add_epi32(current_indices, indices_delta); |
483 | | |
484 | | // compare the new distances to the min distances |
485 | | // for each of the second group of 8 AVX2 components. |
486 | 0 | const __m256 comparison_1 = _mm256_cmp_ps( |
487 | 0 | min_distances, distances_1, _CMP_LE_OS); |
488 | | |
489 | | // update min distances and indices with closest vectors if |
490 | | // needed. |
491 | 0 | min_distances = _mm256_blendv_ps( |
492 | 0 | distances_1, min_distances, comparison_1); |
493 | 0 | min_indices = _mm256_castps_si256(_mm256_blendv_ps( |
494 | 0 | _mm256_castsi256_ps(current_indices), |
495 | 0 | _mm256_castsi256_ps(min_indices), |
496 | 0 | comparison_1)); |
497 | 0 | current_indices = |
498 | 0 | _mm256_add_epi32(current_indices, indices_delta); |
499 | 0 | } |
500 | | |
501 | | // dump values and find the minimum distance / minimum index |
502 | 0 | float min_distances_scalar[8]; |
503 | 0 | uint32_t min_indices_scalar[8]; |
504 | 0 | _mm256_storeu_ps(min_distances_scalar, min_distances); |
505 | 0 | _mm256_storeu_si256( |
506 | 0 | (__m256i*)(min_indices_scalar), min_indices); |
507 | |
|
508 | 0 | float current_min_distance = res.dis_tab[i]; |
509 | 0 | uint32_t current_min_index = res.ids_tab[i]; |
510 | | |
511 | | // This unusual comparison is needed to maintain the behavior |
512 | | // of the original implementation: if two indices are |
513 | | // represented with equal distance values, then |
514 | | // the index with the min value is returned. |
515 | 0 | for (size_t jv = 0; jv < 8; jv++) { |
516 | | // add missing x_norms[i] |
517 | 0 | float distance_candidate = |
518 | 0 | min_distances_scalar[jv] + x_norms[i]; |
519 | | |
520 | | // negative values can occur for identical vectors |
521 | | // due to roundoff errors. |
522 | 0 | if (distance_candidate < 0) |
523 | 0 | distance_candidate = 0; |
524 | |
|
525 | 0 | int64_t index_candidate = min_indices_scalar[jv] + j0; |
526 | |
|
527 | 0 | if (current_min_distance > distance_candidate) { |
528 | 0 | current_min_distance = distance_candidate; |
529 | 0 | current_min_index = index_candidate; |
530 | 0 | } else if ( |
531 | 0 | current_min_distance == distance_candidate && |
532 | 0 | current_min_index > index_candidate) { |
533 | 0 | current_min_index = index_candidate; |
534 | 0 | } |
535 | 0 | } |
536 | | |
537 | | // process leftovers |
538 | 0 | for (; idx_j < count; idx_j++, ip_line++) { |
539 | 0 | float ip = *ip_line; |
540 | 0 | float dis = x_norms[i] + y_norms[idx_j + j0] - 2 * ip; |
541 | | // negative values can occur for identical vectors |
542 | | // due to roundoff errors. |
543 | 0 | if (dis < 0) |
544 | 0 | dis = 0; |
545 | |
|
546 | 0 | if (current_min_distance > dis) { |
547 | 0 | current_min_distance = dis; |
548 | 0 | current_min_index = idx_j + j0; |
549 | 0 | } |
550 | 0 | } |
551 | | |
552 | | // |
553 | 0 | res.add_result(i, current_min_distance, current_min_index); |
554 | 0 | } |
555 | 0 | } |
556 | | // Does nothing for SingleBestResultHandler, but |
557 | | // keeping the call for the consistency. |
558 | 0 | res.end_multiple(); |
559 | 0 | InterruptCallback::check(); |
560 | 0 | } |
561 | 0 | } |
562 | | #elif defined(__ARM_FEATURE_SVE) |
563 | | void exhaustive_L2sqr_blas_cmax_sve( |
564 | | const float* x, |
565 | | const float* y, |
566 | | size_t d, |
567 | | size_t nx, |
568 | | size_t ny, |
569 | | Top1BlockResultHandler<CMax<float, int64_t>>& res, |
570 | | const float* y_norms) { |
571 | | // BLAS does not like empty matrices |
572 | | if (nx == 0 || ny == 0) |
573 | | return; |
574 | | |
575 | | /* block sizes */ |
576 | | const size_t bs_x = distance_compute_blas_query_bs; |
577 | | const size_t bs_y = distance_compute_blas_database_bs; |
578 | | // const size_t bs_x = 16, bs_y = 16; |
579 | | std::unique_ptr<float[]> ip_block(new float[bs_x * bs_y]); |
580 | | std::unique_ptr<float[]> x_norms(new float[nx]); |
581 | | std::unique_ptr<float[]> del2; |
582 | | |
583 | | fvec_norms_L2sqr(x_norms.get(), x, d, nx); |
584 | | |
585 | | const size_t lanes = svcntw(); |
586 | | |
587 | | if (!y_norms) { |
588 | | float* y_norms2 = new float[ny]; |
589 | | del2.reset(y_norms2); |
590 | | fvec_norms_L2sqr(y_norms2, y, d, ny); |
591 | | y_norms = y_norms2; |
592 | | } |
593 | | |
594 | | for (size_t i0 = 0; i0 < nx; i0 += bs_x) { |
595 | | size_t i1 = i0 + bs_x; |
596 | | if (i1 > nx) |
597 | | i1 = nx; |
598 | | |
599 | | res.begin_multiple(i0, i1); |
600 | | |
601 | | for (size_t j0 = 0; j0 < ny; j0 += bs_y) { |
602 | | size_t j1 = j0 + bs_y; |
603 | | if (j1 > ny) |
604 | | j1 = ny; |
605 | | /* compute the actual dot products */ |
606 | | { |
607 | | float one = 1, zero = 0; |
608 | | FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d; |
609 | | sgemm_("Transpose", |
610 | | "Not transpose", |
611 | | &nyi, |
612 | | &nxi, |
613 | | &di, |
614 | | &one, |
615 | | y + j0 * d, |
616 | | &di, |
617 | | x + i0 * d, |
618 | | &di, |
619 | | &zero, |
620 | | ip_block.get(), |
621 | | &nyi); |
622 | | } |
623 | | #pragma omp parallel for |
624 | | for (int64_t i = i0; i < i1; i++) { |
625 | | const size_t count = j1 - j0; |
626 | | float* ip_line = ip_block.get() + (i - i0) * count; |
627 | | |
628 | | svprfw(svwhilelt_b32_u64(0, count), ip_line, SV_PLDL1KEEP); |
629 | | svprfw(svwhilelt_b32_u64(lanes, count), |
630 | | ip_line + lanes, |
631 | | SV_PLDL1KEEP); |
632 | | |
633 | | // Track lanes min distances + lanes min indices. |
634 | | // All the distances tracked do not take x_norms[i] |
635 | | // into account in order to get rid of extra |
636 | | // vaddq_f32(x_norms[i], ...) instructions |
637 | | // is distance computations. |
638 | | auto min_distances = svdup_n_f32(res.dis_tab[i] - x_norms[i]); |
639 | | |
640 | | // these indices are local and are relative to j0. |
641 | | // so, value 0 means j0. |
642 | | auto min_indices = svdup_n_u32(0u); |
643 | | |
644 | | auto current_indices = svindex_u32(0u, 1u); |
645 | | |
646 | | // process lanes * 2 elements per loop |
647 | | for (size_t idx_j = 0; idx_j < count; |
648 | | idx_j += lanes * 2, ip_line += lanes * 2) { |
649 | | svprfw(svwhilelt_b32_u64(idx_j + lanes * 2, count), |
650 | | ip_line + lanes * 2, |
651 | | SV_PLDL1KEEP); |
652 | | svprfw(svwhilelt_b32_u64(idx_j + lanes * 3, count), |
653 | | ip_line + lanes * 3, |
654 | | SV_PLDL1KEEP); |
655 | | |
656 | | // mask |
657 | | const auto mask_0 = svwhilelt_b32_u64(idx_j, count); |
658 | | const auto mask_1 = svwhilelt_b32_u64(idx_j + lanes, count); |
659 | | |
660 | | // load values for norms |
661 | | const auto y_norm_0 = |
662 | | svld1_f32(mask_0, y_norms + idx_j + j0 + 0); |
663 | | const auto y_norm_1 = |
664 | | svld1_f32(mask_1, y_norms + idx_j + j0 + lanes); |
665 | | |
666 | | // load values for dot products |
667 | | const auto ip_0 = svld1_f32(mask_0, ip_line + 0); |
668 | | const auto ip_1 = svld1_f32(mask_1, ip_line + lanes); |
669 | | |
670 | | // compute dis = y_norm[j] - 2 * dot(x_norm[i], y_norm[j]). |
671 | | // x_norm[i] was dropped off because it is a constant for a |
672 | | // given i. We'll deal with it later. |
673 | | const auto distances_0 = |
674 | | svmla_n_f32_z(mask_0, y_norm_0, ip_0, -2.f); |
675 | | const auto distances_1 = |
676 | | svmla_n_f32_z(mask_1, y_norm_1, ip_1, -2.f); |
677 | | |
678 | | // compare the new distances to the min distances |
679 | | // for each of the first group of 4 ARM SIMD components. |
680 | | auto comparison = |
681 | | svcmpgt_f32(mask_0, min_distances, distances_0); |
682 | | |
683 | | // update min distances and indices with closest vectors if |
684 | | // needed. |
685 | | min_distances = |
686 | | svsel_f32(comparison, distances_0, min_distances); |
687 | | min_indices = |
688 | | svsel_u32(comparison, current_indices, min_indices); |
689 | | current_indices = svadd_n_u32_x( |
690 | | mask_0, |
691 | | current_indices, |
692 | | static_cast<uint32_t>(lanes)); |
693 | | |
694 | | // compare the new distances to the min distances |
695 | | // for each of the second group of 4 ARM SIMD components. |
696 | | comparison = |
697 | | svcmpgt_f32(mask_1, min_distances, distances_1); |
698 | | |
699 | | // update min distances and indices with closest vectors if |
700 | | // needed. |
701 | | min_distances = |
702 | | svsel_f32(comparison, distances_1, min_distances); |
703 | | min_indices = |
704 | | svsel_u32(comparison, current_indices, min_indices); |
705 | | current_indices = svadd_n_u32_x( |
706 | | mask_1, |
707 | | current_indices, |
708 | | static_cast<uint32_t>(lanes)); |
709 | | } |
710 | | |
711 | | // add missing x_norms[i] |
712 | | // negative values can occur for identical vectors |
713 | | // due to roundoff errors. |
714 | | auto mask = svwhilelt_b32_u64(0, count); |
715 | | min_distances = svadd_n_f32_z( |
716 | | svcmpge_n_f32(mask, min_distances, -x_norms[i]), |
717 | | min_distances, |
718 | | x_norms[i]); |
719 | | min_indices = svadd_n_u32_x( |
720 | | mask, min_indices, static_cast<uint32_t>(j0)); |
721 | | mask = svcmple_n_f32(mask, min_distances, res.dis_tab[i]); |
722 | | if (svcntp_b32(svptrue_b32(), mask) == 0) |
723 | | res.add_result(i, res.dis_tab[i], res.ids_tab[i]); |
724 | | else { |
725 | | const auto min_distance = svminv_f32(mask, min_distances); |
726 | | const auto min_index = svminv_u32( |
727 | | svcmpeq_n_f32(mask, min_distances, min_distance), |
728 | | min_indices); |
729 | | res.add_result(i, min_distance, min_index); |
730 | | } |
731 | | } |
732 | | } |
733 | | // Does nothing for SingleBestResultHandler, but |
734 | | // keeping the call for the consistency. |
735 | | res.end_multiple(); |
736 | | InterruptCallback::check(); |
737 | | } |
738 | | } |
739 | | #endif |
740 | | |
741 | | // an override if only a single closest point is needed |
742 | | template <> |
743 | | void exhaustive_L2sqr_blas<Top1BlockResultHandler<CMax<float, int64_t>>>( |
744 | | const float* x, |
745 | | const float* y, |
746 | | size_t d, |
747 | | size_t nx, |
748 | | size_t ny, |
749 | | Top1BlockResultHandler<CMax<float, int64_t>>& res, |
750 | 0 | const float* y_norms) { |
751 | 0 | #if defined(__AVX2__) |
752 | | // use a faster fused kernel if available |
753 | 0 | if (exhaustive_L2sqr_fused_cmax(x, y, d, nx, ny, res, y_norms)) { |
754 | | // the kernel is available and it is complete, we're done. |
755 | 0 | return; |
756 | 0 | } |
757 | | |
758 | | // run the specialized AVX2 implementation |
759 | 0 | exhaustive_L2sqr_blas_cmax_avx2(x, y, d, nx, ny, res, y_norms); |
760 | |
|
761 | | #elif defined(__ARM_FEATURE_SVE) |
762 | | // use a faster fused kernel if available |
763 | | if (exhaustive_L2sqr_fused_cmax(x, y, d, nx, ny, res, y_norms)) { |
764 | | // the kernel is available and it is complete, we're done. |
765 | | return; |
766 | | } |
767 | | |
768 | | // run the specialized SVE implementation |
769 | | exhaustive_L2sqr_blas_cmax_sve(x, y, d, nx, ny, res, y_norms); |
770 | | |
771 | | #elif defined(__aarch64__) |
772 | | // use a faster fused kernel if available |
773 | | if (exhaustive_L2sqr_fused_cmax(x, y, d, nx, ny, res, y_norms)) { |
774 | | // the kernel is available and it is complete, we're done. |
775 | | return; |
776 | | } |
777 | | |
778 | | // run the default implementation |
779 | | exhaustive_L2sqr_blas_default_impl< |
780 | | Top1BlockResultHandler<CMax<float, int64_t>>>( |
781 | | x, y, d, nx, ny, res, y_norms); |
782 | | #else |
783 | | // run the default implementation |
784 | | exhaustive_L2sqr_blas_default_impl< |
785 | | Top1BlockResultHandler<CMax<float, int64_t>>>( |
786 | | x, y, d, nx, ny, res, y_norms); |
787 | | #endif |
788 | 0 | } |
789 | | |
790 | | struct Run_search_inner_product { |
791 | | using T = void; |
792 | | template <class BlockResultHandler> |
793 | | void f(BlockResultHandler& res, |
794 | | const float* x, |
795 | | const float* y, |
796 | | size_t d, |
797 | | size_t nx, |
798 | 3 | size_t ny) { |
799 | 3 | if (res.sel || nx < distance_compute_blas_threshold) { |
800 | 3 | exhaustive_inner_product_seq(x, y, d, nx, ny, res); |
801 | 3 | } else { |
802 | 0 | exhaustive_inner_product_blas(x, y, d, nx, ny, res); |
803 | 0 | } |
804 | 3 | } Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmm Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmm Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmm Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmm Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmm Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmm Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmm Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmm Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmm Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmm Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmm Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmm Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmm distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmm Line | Count | Source | 798 | 3 | size_t ny) { | 799 | 3 | if (res.sel || nx < distance_compute_blas_threshold) { | 800 | 3 | exhaustive_inner_product_seq(x, y, d, nx, ny, res); | 801 | 3 | } else { | 802 | 0 | exhaustive_inner_product_blas(x, y, d, nx, ny, res); | 803 | 0 | } | 804 | 3 | } |
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmm Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmm |
805 | | }; |
806 | | |
807 | | struct Run_search_L2sqr { |
808 | | using T = void; |
809 | | template <class BlockResultHandler> |
810 | | void f(BlockResultHandler& res, |
811 | | const float* x, |
812 | | const float* y, |
813 | | size_t d, |
814 | | size_t nx, |
815 | | size_t ny, |
816 | 0 | const float* y_norm2) { |
817 | 0 | if (res.sel || nx < distance_compute_blas_threshold) { |
818 | 0 | exhaustive_L2sqr_seq(x, y, d, nx, ny, res); |
819 | 0 | } else { |
820 | 0 | exhaustive_L2sqr_blas(x, y, d, nx, ny, res, y_norm2); |
821 | 0 | } |
822 | 0 | } Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmmSA_ Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmmSA_ |
823 | | }; |
824 | | |
825 | | } // anonymous namespace |
826 | | |
827 | | /******************************************************* |
828 | | * KNN driver functions |
829 | | *******************************************************/ |
830 | | |
831 | | int distance_compute_blas_threshold = 20; |
832 | | int distance_compute_blas_query_bs = 4096; |
833 | | int distance_compute_blas_database_bs = 1024; |
834 | | int distance_compute_min_k_reservoir = 100; |
835 | | |
836 | | void knn_inner_product( |
837 | | const float* x, |
838 | | const float* y, |
839 | | size_t d, |
840 | | size_t nx, |
841 | | size_t ny, |
842 | | size_t k, |
843 | | float* vals, |
844 | | int64_t* ids, |
845 | 0 | const IDSelector* sel) { |
846 | 0 | int64_t imin = 0; |
847 | 0 | if (auto selr = dynamic_cast<const IDSelectorRange*>(sel)) { |
848 | 0 | imin = std::max(selr->imin, int64_t(0)); |
849 | 0 | int64_t imax = std::min(selr->imax, int64_t(ny)); |
850 | 0 | ny = imax - imin; |
851 | 0 | y += d * imin; |
852 | 0 | sel = nullptr; |
853 | 0 | } |
854 | 0 | if (auto sela = dynamic_cast<const IDSelectorArray*>(sel)) { |
855 | 0 | knn_inner_products_by_idx( |
856 | 0 | x, y, sela->ids, d, nx, ny, sela->n, k, vals, ids, 0); |
857 | 0 | return; |
858 | 0 | } |
859 | | |
860 | 0 | Run_search_inner_product r; |
861 | 0 | dispatch_knn_ResultHandler( |
862 | 0 | nx, vals, ids, k, METRIC_INNER_PRODUCT, sel, r, x, y, d, nx, ny); |
863 | |
|
864 | 0 | if (imin != 0) { |
865 | 0 | for (size_t i = 0; i < nx * k; i++) { |
866 | 0 | if (ids[i] >= 0) { |
867 | 0 | ids[i] += imin; |
868 | 0 | } |
869 | 0 | } |
870 | 0 | } |
871 | 0 | } |
872 | | |
873 | | void knn_inner_product( |
874 | | const float* x, |
875 | | const float* y, |
876 | | size_t d, |
877 | | size_t nx, |
878 | | size_t ny, |
879 | | float_minheap_array_t* res, |
880 | 0 | const IDSelector* sel) { |
881 | 0 | FAISS_THROW_IF_NOT(nx == res->nh); |
882 | 0 | knn_inner_product(x, y, d, nx, ny, res->k, res->val, res->ids, sel); |
883 | 0 | } |
884 | | |
885 | | void knn_L2sqr( |
886 | | const float* x, |
887 | | const float* y, |
888 | | size_t d, |
889 | | size_t nx, |
890 | | size_t ny, |
891 | | size_t k, |
892 | | float* vals, |
893 | | int64_t* ids, |
894 | | const float* y_norm2, |
895 | 0 | const IDSelector* sel) { |
896 | 0 | int64_t imin = 0; |
897 | 0 | if (auto selr = dynamic_cast<const IDSelectorRange*>(sel)) { |
898 | 0 | imin = std::max(selr->imin, int64_t(0)); |
899 | 0 | int64_t imax = std::min(selr->imax, int64_t(ny)); |
900 | 0 | ny = imax - imin; |
901 | 0 | y += d * imin; |
902 | 0 | sel = nullptr; |
903 | 0 | } |
904 | 0 | if (auto sela = dynamic_cast<const IDSelectorArray*>(sel)) { |
905 | 0 | knn_L2sqr_by_idx(x, y, sela->ids, d, nx, ny, sela->n, k, vals, ids, 0); |
906 | 0 | return; |
907 | 0 | } |
908 | | |
909 | 0 | Run_search_L2sqr r; |
910 | 0 | dispatch_knn_ResultHandler( |
911 | 0 | nx, vals, ids, k, METRIC_L2, sel, r, x, y, d, nx, ny, y_norm2); |
912 | |
|
913 | 0 | if (imin != 0) { |
914 | 0 | for (size_t i = 0; i < nx * k; i++) { |
915 | 0 | if (ids[i] >= 0) { |
916 | 0 | ids[i] += imin; |
917 | 0 | } |
918 | 0 | } |
919 | 0 | } |
920 | 0 | } |
921 | | |
922 | | void knn_L2sqr( |
923 | | const float* x, |
924 | | const float* y, |
925 | | size_t d, |
926 | | size_t nx, |
927 | | size_t ny, |
928 | | float_maxheap_array_t* res, |
929 | | const float* y_norm2, |
930 | 0 | const IDSelector* sel) { |
931 | 0 | FAISS_THROW_IF_NOT(res->nh == nx); |
932 | 0 | knn_L2sqr(x, y, d, nx, ny, res->k, res->val, res->ids, y_norm2, sel); |
933 | 0 | } |
934 | | |
935 | | /*************************************************************************** |
936 | | * Range search |
937 | | ***************************************************************************/ |
938 | | |
939 | | // TODO accept a y_norm2 as well |
940 | | void range_search_L2sqr( |
941 | | const float* x, |
942 | | const float* y, |
943 | | size_t d, |
944 | | size_t nx, |
945 | | size_t ny, |
946 | | float radius, |
947 | | RangeSearchResult* res, |
948 | 0 | const IDSelector* sel) { |
949 | 0 | Run_search_L2sqr r; |
950 | 0 | dispatch_range_ResultHandler( |
951 | 0 | res, radius, METRIC_L2, sel, r, x, y, d, nx, ny, nullptr); |
952 | 0 | } |
953 | | |
954 | | void range_search_inner_product( |
955 | | const float* x, |
956 | | const float* y, |
957 | | size_t d, |
958 | | size_t nx, |
959 | | size_t ny, |
960 | | float radius, |
961 | | RangeSearchResult* res, |
962 | 3 | const IDSelector* sel) { |
963 | 3 | Run_search_inner_product r; |
964 | 3 | dispatch_range_ResultHandler( |
965 | 3 | res, radius, METRIC_INNER_PRODUCT, sel, r, x, y, d, nx, ny); |
966 | 3 | } |
967 | | |
968 | | /*************************************************************************** |
969 | | * compute a subset of distances |
970 | | ***************************************************************************/ |
971 | | |
972 | | /* compute the inner product between x and a subset y of ny vectors, |
973 | | whose indices are given by idy. */ |
974 | | void fvec_inner_products_by_idx( |
975 | | float* __restrict ip, |
976 | | const float* x, |
977 | | const float* y, |
978 | | const int64_t* __restrict ids, /* for y vecs */ |
979 | | size_t d, |
980 | | size_t nx, |
981 | 0 | size_t ny) { |
982 | 0 | #pragma omp parallel for |
983 | 0 | for (int64_t j = 0; j < nx; j++) { |
984 | 0 | const int64_t* __restrict idsj = ids + j * ny; |
985 | 0 | const float* xj = x + j * d; |
986 | 0 | float* __restrict ipj = ip + j * ny; |
987 | 0 | for (size_t i = 0; i < ny; i++) { |
988 | 0 | if (idsj[i] < 0) { |
989 | 0 | ipj[i] = -INFINITY; |
990 | 0 | } else { |
991 | 0 | ipj[i] = fvec_inner_product(xj, y + d * idsj[i], d); |
992 | 0 | } |
993 | 0 | } |
994 | 0 | } |
995 | 0 | } |
996 | | |
997 | | /* compute the inner product between x and a subset y of ny vectors, |
998 | | whose indices are given by idy. */ |
999 | | void fvec_L2sqr_by_idx( |
1000 | | float* __restrict dis, |
1001 | | const float* x, |
1002 | | const float* y, |
1003 | | const int64_t* __restrict ids, /* ids of y vecs */ |
1004 | | size_t d, |
1005 | | size_t nx, |
1006 | 0 | size_t ny) { |
1007 | 0 | #pragma omp parallel for |
1008 | 0 | for (int64_t j = 0; j < nx; j++) { |
1009 | 0 | const int64_t* __restrict idsj = ids + j * ny; |
1010 | 0 | const float* xj = x + j * d; |
1011 | 0 | float* __restrict disj = dis + j * ny; |
1012 | 0 | for (size_t i = 0; i < ny; i++) { |
1013 | 0 | if (idsj[i] < 0) { |
1014 | 0 | disj[i] = INFINITY; |
1015 | 0 | } else { |
1016 | 0 | disj[i] = fvec_L2sqr(xj, y + d * idsj[i], d); |
1017 | 0 | } |
1018 | 0 | } |
1019 | 0 | } |
1020 | 0 | } |
1021 | | |
1022 | | void pairwise_indexed_L2sqr( |
1023 | | size_t d, |
1024 | | size_t n, |
1025 | | const float* x, |
1026 | | const int64_t* ix, |
1027 | | const float* y, |
1028 | | const int64_t* iy, |
1029 | 0 | float* dis) { |
1030 | 0 | #pragma omp parallel for if (n > 1) |
1031 | 0 | for (int64_t j = 0; j < n; j++) { |
1032 | 0 | if (ix[j] >= 0 && iy[j] >= 0) { |
1033 | 0 | dis[j] = fvec_L2sqr(x + d * ix[j], y + d * iy[j], d); |
1034 | 0 | } else { |
1035 | 0 | dis[j] = INFINITY; |
1036 | 0 | } |
1037 | 0 | } |
1038 | 0 | } |
1039 | | |
1040 | | void pairwise_indexed_inner_product( |
1041 | | size_t d, |
1042 | | size_t n, |
1043 | | const float* x, |
1044 | | const int64_t* ix, |
1045 | | const float* y, |
1046 | | const int64_t* iy, |
1047 | 0 | float* dis) { |
1048 | 0 | #pragma omp parallel for if (n > 1) |
1049 | 0 | for (int64_t j = 0; j < n; j++) { |
1050 | 0 | if (ix[j] >= 0 && iy[j] >= 0) { |
1051 | 0 | dis[j] = fvec_inner_product(x + d * ix[j], y + d * iy[j], d); |
1052 | 0 | } else { |
1053 | 0 | dis[j] = -INFINITY; |
1054 | 0 | } |
1055 | 0 | } |
1056 | 0 | } |
1057 | | |
1058 | | /* Find the nearest neighbors for nx queries in a set of ny vectors |
1059 | | indexed by ids. May be useful for re-ranking a pre-selected vector list */ |
1060 | | void knn_inner_products_by_idx( |
1061 | | const float* x, |
1062 | | const float* y, |
1063 | | const int64_t* ids, |
1064 | | size_t d, |
1065 | | size_t nx, |
1066 | | size_t ny, |
1067 | | size_t nsubset, |
1068 | | size_t k, |
1069 | | float* res_vals, |
1070 | | int64_t* res_ids, |
1071 | 0 | int64_t ld_ids) { |
1072 | 0 | if (ld_ids < 0) { |
1073 | 0 | ld_ids = ny; |
1074 | 0 | } |
1075 | |
|
1076 | 0 | #pragma omp parallel for if (nx > 100) |
1077 | 0 | for (int64_t i = 0; i < nx; i++) { |
1078 | 0 | const float* x_ = x + i * d; |
1079 | 0 | const int64_t* idsi = ids + i * ld_ids; |
1080 | 0 | size_t j; |
1081 | 0 | float* __restrict simi = res_vals + i * k; |
1082 | 0 | int64_t* __restrict idxi = res_ids + i * k; |
1083 | 0 | minheap_heapify(k, simi, idxi); |
1084 | |
|
1085 | 0 | for (j = 0; j < nsubset; j++) { |
1086 | 0 | if (idsi[j] < 0 || idsi[j] >= ny) { |
1087 | 0 | break; |
1088 | 0 | } |
1089 | 0 | float ip = fvec_inner_product(x_, y + d * idsi[j], d); |
1090 | |
|
1091 | 0 | if (ip > simi[0]) { |
1092 | 0 | minheap_replace_top(k, simi, idxi, ip, idsi[j]); |
1093 | 0 | } |
1094 | 0 | } |
1095 | 0 | minheap_reorder(k, simi, idxi); |
1096 | 0 | } |
1097 | 0 | } |
1098 | | |
1099 | | void knn_L2sqr_by_idx( |
1100 | | const float* x, |
1101 | | const float* y, |
1102 | | const int64_t* __restrict ids, |
1103 | | size_t d, |
1104 | | size_t nx, |
1105 | | size_t ny, |
1106 | | size_t nsubset, |
1107 | | size_t k, |
1108 | | float* res_vals, |
1109 | | int64_t* res_ids, |
1110 | 0 | int64_t ld_ids) { |
1111 | 0 | if (ld_ids < 0) { |
1112 | 0 | ld_ids = ny; |
1113 | 0 | } |
1114 | 0 | #pragma omp parallel for if (nx > 100) |
1115 | 0 | for (int64_t i = 0; i < nx; i++) { |
1116 | 0 | const float* x_ = x + i * d; |
1117 | 0 | const int64_t* __restrict idsi = ids + i * ld_ids; |
1118 | 0 | float* __restrict simi = res_vals + i * k; |
1119 | 0 | int64_t* __restrict idxi = res_ids + i * k; |
1120 | 0 | maxheap_heapify(k, simi, idxi); |
1121 | 0 | for (size_t j = 0; j < nsubset; j++) { |
1122 | 0 | if (idsi[j] < 0 || idsi[j] >= ny) { |
1123 | 0 | break; |
1124 | 0 | } |
1125 | 0 | float disij = fvec_L2sqr(x_, y + d * idsi[j], d); |
1126 | |
|
1127 | 0 | if (disij < simi[0]) { |
1128 | 0 | maxheap_replace_top(k, simi, idxi, disij, idsi[j]); |
1129 | 0 | } |
1130 | 0 | } |
1131 | 0 | maxheap_reorder(k, simi, idxi); |
1132 | 0 | } |
1133 | 0 | } |
1134 | | |
1135 | | void pairwise_L2sqr( |
1136 | | int64_t d, |
1137 | | int64_t nq, |
1138 | | const float* xq, |
1139 | | int64_t nb, |
1140 | | const float* xb, |
1141 | | float* dis, |
1142 | | int64_t ldq, |
1143 | | int64_t ldb, |
1144 | 0 | int64_t ldd) { |
1145 | 0 | if (nq == 0 || nb == 0) |
1146 | 0 | return; |
1147 | 0 | if (ldq == -1) |
1148 | 0 | ldq = d; |
1149 | 0 | if (ldb == -1) |
1150 | 0 | ldb = d; |
1151 | 0 | if (ldd == -1) |
1152 | 0 | ldd = nb; |
1153 | | |
1154 | | // store in beginning of distance matrix to avoid malloc |
1155 | 0 | float* b_norms = dis; |
1156 | |
|
1157 | 0 | #pragma omp parallel for if (nb > 1) |
1158 | 0 | for (int64_t i = 0; i < nb; i++) |
1159 | 0 | b_norms[i] = fvec_norm_L2sqr(xb + i * ldb, d); |
1160 | |
|
1161 | 0 | #pragma omp parallel for |
1162 | 0 | for (int64_t i = 1; i < nq; i++) { |
1163 | 0 | float q_norm = fvec_norm_L2sqr(xq + i * ldq, d); |
1164 | 0 | for (int64_t j = 0; j < nb; j++) |
1165 | 0 | dis[i * ldd + j] = q_norm + b_norms[j]; |
1166 | 0 | } |
1167 | |
|
1168 | 0 | { |
1169 | 0 | float q_norm = fvec_norm_L2sqr(xq, d); |
1170 | 0 | for (int64_t j = 0; j < nb; j++) |
1171 | 0 | dis[j] += q_norm; |
1172 | 0 | } |
1173 | |
|
1174 | 0 | { |
1175 | 0 | FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd; |
1176 | 0 | float one = 1.0, minus_2 = -2.0; |
1177 | |
|
1178 | 0 | sgemm_("Transposed", |
1179 | 0 | "Not transposed", |
1180 | 0 | &nbi, |
1181 | 0 | &nqi, |
1182 | 0 | &di, |
1183 | 0 | &minus_2, |
1184 | 0 | xb, |
1185 | 0 | &ldbi, |
1186 | 0 | xq, |
1187 | 0 | &ldqi, |
1188 | 0 | &one, |
1189 | 0 | dis, |
1190 | 0 | &lddi); |
1191 | 0 | } |
1192 | 0 | } |
1193 | | |
1194 | | void inner_product_to_L2sqr( |
1195 | | float* __restrict dis, |
1196 | | const float* nr1, |
1197 | | const float* nr2, |
1198 | | size_t n1, |
1199 | 0 | size_t n2) { |
1200 | 0 | #pragma omp parallel for |
1201 | 0 | for (int64_t j = 0; j < n1; j++) { |
1202 | 0 | float* disj = dis + j * n2; |
1203 | 0 | for (size_t i = 0; i < n2; i++) |
1204 | 0 | disj[i] = nr1[j] + nr2[i] - 2 * disj[i]; |
1205 | 0 | } |
1206 | 0 | } |
1207 | | |
1208 | | } // namespace faiss |