Coverage Report

Created: 2025-10-30 14:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/contrib/faiss/faiss/utils/distances.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) Meta Platforms, Inc. and affiliates.
3
 *
4
 * This source code is licensed under the MIT license found in the
5
 * LICENSE file in the root directory of this source tree.
6
 */
7
8
#include <faiss/utils/distances.h>
9
10
#include <algorithm>
11
#include <cassert>
12
#include <cmath>
13
#include <cstddef>
14
#include <cstdio>
15
#include <cstring>
16
17
#include <omp.h>
18
19
#ifdef __AVX2__
20
#include <immintrin.h>
21
#elif defined(__ARM_FEATURE_SVE)
22
#include <arm_sve.h>
23
#endif
24
25
#include <faiss/impl/AuxIndexStructures.h>
26
#include <faiss/impl/FaissAssert.h>
27
#include <faiss/impl/IDSelector.h>
28
#include <faiss/impl/ResultHandler.h>
29
30
#include <faiss/utils/distances_fused/distances_fused.h>
31
32
#ifndef FINTEGER
33
#define FINTEGER long
34
#endif
35
36
extern "C" {
37
38
/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
39
40
int sgemm_(
41
        const char* transa,
42
        const char* transb,
43
        FINTEGER* m,
44
        FINTEGER* n,
45
        FINTEGER* k,
46
        const float* alpha,
47
        const float* a,
48
        FINTEGER* lda,
49
        const float* b,
50
        FINTEGER* ldb,
51
        float* beta,
52
        float* c,
53
        FINTEGER* ldc);
54
}
55
56
namespace faiss {
57
58
/***************************************************************************
59
 * Matrix/vector ops
60
 ***************************************************************************/
61
62
/* Compute the L2 norm of a set of nx vectors */
63
void fvec_norms_L2(
64
        float* __restrict nr,
65
        const float* __restrict x,
66
        size_t d,
67
0
        size_t nx) {
68
0
#pragma omp parallel for if (nx > 10000)
69
0
    for (int64_t i = 0; i < nx; i++) {
70
0
        nr[i] = sqrtf(fvec_norm_L2sqr(x + i * d, d));
71
0
    }
72
0
}
73
74
void fvec_norms_L2sqr(
75
        float* __restrict nr,
76
        const float* __restrict x,
77
        size_t d,
78
0
        size_t nx) {
79
0
#pragma omp parallel for if (nx > 10000)
80
0
    for (int64_t i = 0; i < nx; i++)
81
0
        nr[i] = fvec_norm_L2sqr(x + i * d, d);
82
0
}
83
84
// The following is a workaround to a problem
85
// in OpenMP in fbcode. The crash occurs
86
// inside OMP when IndexIVFSpectralHash::set_query()
87
// calls fvec_renorm_L2. set_query() is always
88
// calling this function with nx == 1, so even
89
// the omp version should run single threaded,
90
// as per the if condition of the omp pragma.
91
// Instead, the omp version crashes inside OMP.
92
// The workaround below is explicitly branching
93
// off to a codepath without omp.
94
95
#define FVEC_RENORM_L2_IMPL                   \
96
0
    float* __restrict xi = x + i * d;         \
97
0
                                              \
98
0
    float nr = fvec_norm_L2sqr(xi, d);        \
99
0
                                              \
100
0
    if (nr > 0) {                             \
101
0
        size_t j;                             \
102
0
        const float inv_nr = 1.0 / sqrtf(nr); \
103
0
        for (j = 0; j < d; j++)               \
104
0
            xi[j] *= inv_nr;                  \
105
0
    }
106
107
0
void fvec_renorm_L2_noomp(size_t d, size_t nx, float* __restrict x) {
108
0
    for (int64_t i = 0; i < nx; i++) {
109
0
        FVEC_RENORM_L2_IMPL
110
0
    }
111
0
}
112
113
0
void fvec_renorm_L2_omp(size_t d, size_t nx, float* __restrict x) {
114
0
#pragma omp parallel for if (nx > 10000)
115
0
    for (int64_t i = 0; i < nx; i++) {
116
0
        FVEC_RENORM_L2_IMPL
117
0
    }
118
0
}
119
120
0
void fvec_renorm_L2(size_t d, size_t nx, float* __restrict x) {
121
0
    if (nx <= 10000) {
122
0
        fvec_renorm_L2_noomp(d, nx, x);
123
0
    } else {
124
0
        fvec_renorm_L2_omp(d, nx, x);
125
0
    }
126
0
}
127
128
/***************************************************************************
129
 * KNN functions
130
 ***************************************************************************/
131
132
namespace {
133
134
/* Find the nearest neighbors for nx queries in a set of ny vectors */
135
template <class BlockResultHandler>
136
void exhaustive_inner_product_seq(
137
        const float* x,
138
        const float* y,
139
        size_t d,
140
        size_t nx,
141
        size_t ny,
142
3
        BlockResultHandler& res) {
143
3
    using SingleResultHandler =
144
3
            typename BlockResultHandler::SingleResultHandler;
145
3
    [[maybe_unused]] int nt = std::min(int(nx), omp_get_max_threads());
146
147
3
#pragma omp parallel num_threads(nt)
148
1.50k
    {
149
1.50k
        SingleResultHandler resi(res);
150
1.50k
#pragma omp for
151
1.50k
        for (int64_t i = 0; i < nx; i++) {
152
1.50k
            const float* x_i = x + i * d;
153
1.50k
            const float* y_j = y;
154
155
1.50k
            resi.begin(i);
156
157
1.50k
            for (size_t j = 0; j < ny; j++, y_j += d) {
158
1.50k
                if (!res.is_in_selection(j)) {
159
1.50k
                    continue;
160
1.50k
                }
161
1.50k
                float ip = fvec_inner_product(x_i, y_j, d);
162
1.50k
                resi.add_result(ip, j);
163
1.50k
            }
164
1.50k
            resi.end();
165
1.50k
        }
166
1.50k
    }
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Line
Count
Source
148
1.50k
    {
149
1.50k
        SingleResultHandler resi(res);
150
1.50k
#pragma omp for
151
1.50k
        for (int64_t i = 0; i < nx; i++) {
152
1.50k
            const float* x_i = x + i * d;
153
1.50k
            const float* y_j = y;
154
155
1.50k
            resi.begin(i);
156
157
1.50k
            for (size_t j = 0; j < ny; j++, y_j += d) {
158
1.50k
                if (!res.is_in_selection(j)) {
159
1.50k
                    continue;
160
1.50k
                }
161
1.50k
                float ip = fvec_inner_product(x_i, y_j, d);
162
1.50k
                resi.add_result(ip, j);
163
1.50k
            }
164
1.50k
            resi.end();
165
1.50k
        }
166
1.50k
    }
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
167
3
}
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_
distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_
Line
Count
Source
142
3
        BlockResultHandler& res) {
143
3
    using SingleResultHandler =
144
3
            typename BlockResultHandler::SingleResultHandler;
145
3
    [[maybe_unused]] int nt = std::min(int(nx), omp_get_max_threads());
146
147
3
#pragma omp parallel num_threads(nt)
148
3
    {
149
3
        SingleResultHandler resi(res);
150
3
#pragma omp for
151
3
        for (int64_t i = 0; i < nx; i++) {
152
3
            const float* x_i = x + i * d;
153
3
            const float* y_j = y;
154
155
3
            resi.begin(i);
156
157
3
            for (size_t j = 0; j < ny; j++, y_j += d) {
158
3
                if (!res.is_in_selection(j)) {
159
3
                    continue;
160
3
                }
161
3
                float ip = fvec_inner_product(x_i, y_j, d);
162
3
                resi.add_result(ip, j);
163
3
            }
164
3
            resi.end();
165
3
        }
166
3
    }
167
3
}
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_128exhaustive_inner_product_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_
168
169
template <class BlockResultHandler>
170
void exhaustive_L2sqr_seq(
171
        const float* x,
172
        const float* y,
173
        size_t d,
174
        size_t nx,
175
        size_t ny,
176
0
        BlockResultHandler& res) {
177
0
    using SingleResultHandler =
178
0
            typename BlockResultHandler::SingleResultHandler;
179
0
    [[maybe_unused]] int nt = std::min(int(nx), omp_get_max_threads());
180
181
0
#pragma omp parallel num_threads(nt)
182
0
    {
183
0
        SingleResultHandler resi(res);
184
0
#pragma omp for
185
0
        for (int64_t i = 0; i < nx; i++) {
186
0
            const float* x_i = x + i * d;
187
0
            const float* y_j = y;
188
0
            resi.begin(i);
189
0
            for (size_t j = 0; j < ny; j++, y_j += d) {
190
0
                if (!res.is_in_selection(j)) {
191
0
                    continue;
192
0
                }
193
0
                float disij = fvec_L2sqr(x_i, y_j, d);
194
0
                resi.add_result(disij, j);
195
0
            }
196
0
            resi.end();
197
0
        }
198
0
    }
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_.omp_outlined_debug__
199
0
}
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_120exhaustive_L2sqr_seqINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_
200
201
/** Find the nearest neighbors for nx queries in a set of ny vectors */
202
template <class BlockResultHandler>
203
void exhaustive_inner_product_blas(
204
        const float* x,
205
        const float* y,
206
        size_t d,
207
        size_t nx,
208
        size_t ny,
209
0
        BlockResultHandler& res) {
210
    // BLAS does not like empty matrices
211
0
    if (nx == 0 || ny == 0)
212
0
        return;
213
214
    /* block sizes */
215
0
    const size_t bs_x = distance_compute_blas_query_bs;
216
0
    const size_t bs_y = distance_compute_blas_database_bs;
217
0
    std::unique_ptr<float[]> ip_block(new float[bs_x * bs_y]);
218
219
0
    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
220
0
        size_t i1 = i0 + bs_x;
221
0
        if (i1 > nx)
222
0
            i1 = nx;
223
224
0
        res.begin_multiple(i0, i1);
225
226
0
        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
227
0
            size_t j1 = j0 + bs_y;
228
0
            if (j1 > ny)
229
0
                j1 = ny;
230
            /* compute the actual dot products */
231
0
            {
232
0
                float one = 1, zero = 0;
233
0
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
234
0
                sgemm_("Transpose",
235
0
                       "Not transpose",
236
0
                       &nyi,
237
0
                       &nxi,
238
0
                       &di,
239
0
                       &one,
240
0
                       y + j0 * d,
241
0
                       &di,
242
0
                       x + i0 * d,
243
0
                       &di,
244
0
                       &zero,
245
0
                       ip_block.get(),
246
0
                       &nyi);
247
0
            }
248
249
0
            res.add_results(j0, j1, ip_block.get());
250
0
        }
251
0
        res.end_multiple();
252
0
        InterruptCallback::check();
253
0
    }
254
0
}
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_129exhaustive_inner_product_blasINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_
255
256
// distance correction is an operator that can be applied to transform
257
// the distances
258
template <class BlockResultHandler>
259
void exhaustive_L2sqr_blas_default_impl(
260
        const float* x,
261
        const float* y,
262
        size_t d,
263
        size_t nx,
264
        size_t ny,
265
        BlockResultHandler& res,
266
0
        const float* y_norms = nullptr) {
267
    // BLAS does not like empty matrices
268
0
    if (nx == 0 || ny == 0)
269
0
        return;
270
271
    /* block sizes */
272
0
    const size_t bs_x = distance_compute_blas_query_bs;
273
0
    const size_t bs_y = distance_compute_blas_database_bs;
274
    // const size_t bs_x = 16, bs_y = 16;
275
0
    std::unique_ptr<float[]> ip_block(new float[bs_x * bs_y]);
276
0
    std::unique_ptr<float[]> x_norms(new float[nx]);
277
0
    std::unique_ptr<float[]> del2;
278
279
0
    fvec_norms_L2sqr(x_norms.get(), x, d, nx);
280
281
0
    if (!y_norms) {
282
0
        float* y_norms2 = new float[ny];
283
0
        del2.reset(y_norms2);
284
0
        fvec_norms_L2sqr(y_norms2, y, d, ny);
285
0
        y_norms = y_norms2;
286
0
    }
287
288
0
    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
289
0
        size_t i1 = i0 + bs_x;
290
0
        if (i1 > nx)
291
0
            i1 = nx;
292
293
0
        res.begin_multiple(i0, i1);
294
295
0
        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
296
0
            size_t j1 = j0 + bs_y;
297
0
            if (j1 > ny)
298
0
                j1 = ny;
299
            /* compute the actual dot products */
300
0
            {
301
0
                float one = 1, zero = 0;
302
0
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
303
0
                sgemm_("Transpose",
304
0
                       "Not transpose",
305
0
                       &nyi,
306
0
                       &nxi,
307
0
                       &di,
308
0
                       &one,
309
0
                       y + j0 * d,
310
0
                       &di,
311
0
                       x + i0 * d,
312
0
                       &di,
313
0
                       &zero,
314
0
                       ip_block.get(),
315
0
                       &nyi);
316
0
            }
317
0
#pragma omp parallel for
318
0
            for (int64_t i = i0; i < i1; i++) {
319
0
                float* ip_line = ip_block.get() + (i - i0) * (j1 - j0);
320
321
0
                for (size_t j = j0; j < j1; j++) {
322
0
                    float ip = *ip_line;
323
0
                    float dis = x_norms[i] + y_norms[j] - 2 * ip;
324
325
0
                    if (!res.is_in_selection(j)) {
326
0
                        dis = HUGE_VALF;
327
0
                    }
328
                    // negative values can occur for identical vectors
329
                    // due to roundoff errors
330
0
                    if (dis < 0)
331
0
                        dis = 0;
332
333
0
                    *ip_line = dis;
334
0
                    ip_line++;
335
0
                }
336
0
            }
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_.omp_outlined_debug__
337
0
            res.add_results(j0, j1, ip_block.get());
338
0
        }
339
0
        res.end_multiple();
340
0
        InterruptCallback::check();
341
0
    }
342
0
}
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_134exhaustive_L2sqr_blas_default_implINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_
343
344
template <class BlockResultHandler>
345
void exhaustive_L2sqr_blas(
346
        const float* x,
347
        const float* y,
348
        size_t d,
349
        size_t nx,
350
        size_t ny,
351
        BlockResultHandler& res,
352
0
        const float* y_norms = nullptr) {
353
0
    exhaustive_L2sqr_blas_default_impl(x, y, d, nx, ny, res);
354
0
}
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvPKfS7_mmmRT_S7_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_121exhaustive_L2sqr_blasINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvPKfS7_mmmRT_S7_
355
356
#ifdef __AVX2__
357
void exhaustive_L2sqr_blas_cmax_avx2(
358
        const float* x,
359
        const float* y,
360
        size_t d,
361
        size_t nx,
362
        size_t ny,
363
        Top1BlockResultHandler<CMax<float, int64_t>>& res,
364
0
        const float* y_norms) {
365
    // BLAS does not like empty matrices
366
0
    if (nx == 0 || ny == 0)
367
0
        return;
368
369
    /* block sizes */
370
0
    const size_t bs_x = distance_compute_blas_query_bs;
371
0
    const size_t bs_y = distance_compute_blas_database_bs;
372
    // const size_t bs_x = 16, bs_y = 16;
373
0
    std::unique_ptr<float[]> ip_block(new float[bs_x * bs_y]);
374
0
    std::unique_ptr<float[]> x_norms(new float[nx]);
375
0
    std::unique_ptr<float[]> del2;
376
377
0
    fvec_norms_L2sqr(x_norms.get(), x, d, nx);
378
379
0
    if (!y_norms) {
380
0
        float* y_norms2 = new float[ny];
381
0
        del2.reset(y_norms2);
382
0
        fvec_norms_L2sqr(y_norms2, y, d, ny);
383
0
        y_norms = y_norms2;
384
0
    }
385
386
0
    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
387
0
        size_t i1 = i0 + bs_x;
388
0
        if (i1 > nx)
389
0
            i1 = nx;
390
391
0
        res.begin_multiple(i0, i1);
392
393
0
        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
394
0
            size_t j1 = j0 + bs_y;
395
0
            if (j1 > ny)
396
0
                j1 = ny;
397
            /* compute the actual dot products */
398
0
            {
399
0
                float one = 1, zero = 0;
400
0
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
401
0
                sgemm_("Transpose",
402
0
                       "Not transpose",
403
0
                       &nyi,
404
0
                       &nxi,
405
0
                       &di,
406
0
                       &one,
407
0
                       y + j0 * d,
408
0
                       &di,
409
0
                       x + i0 * d,
410
0
                       &di,
411
0
                       &zero,
412
0
                       ip_block.get(),
413
0
                       &nyi);
414
0
            }
415
0
#pragma omp parallel for
416
0
            for (int64_t i = i0; i < i1; i++) {
417
0
                float* ip_line = ip_block.get() + (i - i0) * (j1 - j0);
418
419
0
                _mm_prefetch((const char*)ip_line, _MM_HINT_NTA);
420
0
                _mm_prefetch((const char*)(ip_line + 16), _MM_HINT_NTA);
421
422
                // constant
423
0
                const __m256 mul_minus2 = _mm256_set1_ps(-2);
424
425
                // Track 8 min distances + 8 min indices.
426
                // All the distances tracked do not take x_norms[i]
427
                //   into account in order to get rid of extra
428
                //   _mm256_add_ps(x_norms[i], ...) instructions
429
                //   is distance computations.
430
0
                __m256 min_distances =
431
0
                        _mm256_set1_ps(res.dis_tab[i] - x_norms[i]);
432
433
                // these indices are local and are relative to j0.
434
                // so, value 0 means j0.
435
0
                __m256i min_indices = _mm256_set1_epi32(0);
436
437
0
                __m256i current_indices =
438
0
                        _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
439
0
                const __m256i indices_delta = _mm256_set1_epi32(8);
440
441
                // current j index
442
0
                size_t idx_j = 0;
443
0
                size_t count = j1 - j0;
444
445
                // process 16 elements per loop
446
0
                for (; idx_j < (count / 16) * 16; idx_j += 16, ip_line += 16) {
447
0
                    _mm_prefetch((const char*)(ip_line + 32), _MM_HINT_NTA);
448
0
                    _mm_prefetch((const char*)(ip_line + 48), _MM_HINT_NTA);
449
450
                    // load values for norms
451
0
                    const __m256 y_norm_0 =
452
0
                            _mm256_loadu_ps(y_norms + idx_j + j0 + 0);
453
0
                    const __m256 y_norm_1 =
454
0
                            _mm256_loadu_ps(y_norms + idx_j + j0 + 8);
455
456
                    // load values for dot products
457
0
                    const __m256 ip_0 = _mm256_loadu_ps(ip_line + 0);
458
0
                    const __m256 ip_1 = _mm256_loadu_ps(ip_line + 8);
459
460
                    // compute dis = y_norm[j] - 2 * dot(x_norm[i], y_norm[j]).
461
                    // x_norm[i] was dropped off because it is a constant for a
462
                    // given i. We'll deal with it later.
463
0
                    __m256 distances_0 =
464
0
                            _mm256_fmadd_ps(ip_0, mul_minus2, y_norm_0);
465
0
                    __m256 distances_1 =
466
0
                            _mm256_fmadd_ps(ip_1, mul_minus2, y_norm_1);
467
468
                    // compare the new distances to the min distances
469
                    // for each of the first group of 8 AVX2 components.
470
0
                    const __m256 comparison_0 = _mm256_cmp_ps(
471
0
                            min_distances, distances_0, _CMP_LE_OS);
472
473
                    // update min distances and indices with closest vectors if
474
                    // needed.
475
0
                    min_distances = _mm256_blendv_ps(
476
0
                            distances_0, min_distances, comparison_0);
477
0
                    min_indices = _mm256_castps_si256(_mm256_blendv_ps(
478
0
                            _mm256_castsi256_ps(current_indices),
479
0
                            _mm256_castsi256_ps(min_indices),
480
0
                            comparison_0));
481
0
                    current_indices =
482
0
                            _mm256_add_epi32(current_indices, indices_delta);
483
484
                    // compare the new distances to the min distances
485
                    // for each of the second group of 8 AVX2 components.
486
0
                    const __m256 comparison_1 = _mm256_cmp_ps(
487
0
                            min_distances, distances_1, _CMP_LE_OS);
488
489
                    // update min distances and indices with closest vectors if
490
                    // needed.
491
0
                    min_distances = _mm256_blendv_ps(
492
0
                            distances_1, min_distances, comparison_1);
493
0
                    min_indices = _mm256_castps_si256(_mm256_blendv_ps(
494
0
                            _mm256_castsi256_ps(current_indices),
495
0
                            _mm256_castsi256_ps(min_indices),
496
0
                            comparison_1));
497
0
                    current_indices =
498
0
                            _mm256_add_epi32(current_indices, indices_delta);
499
0
                }
500
501
                // dump values and find the minimum distance / minimum index
502
0
                float min_distances_scalar[8];
503
0
                uint32_t min_indices_scalar[8];
504
0
                _mm256_storeu_ps(min_distances_scalar, min_distances);
505
0
                _mm256_storeu_si256(
506
0
                        (__m256i*)(min_indices_scalar), min_indices);
507
508
0
                float current_min_distance = res.dis_tab[i];
509
0
                uint32_t current_min_index = res.ids_tab[i];
510
511
                // This unusual comparison is needed to maintain the behavior
512
                // of the original implementation: if two indices are
513
                // represented with equal distance values, then
514
                // the index with the min value is returned.
515
0
                for (size_t jv = 0; jv < 8; jv++) {
516
                    // add missing x_norms[i]
517
0
                    float distance_candidate =
518
0
                            min_distances_scalar[jv] + x_norms[i];
519
520
                    // negative values can occur for identical vectors
521
                    //    due to roundoff errors.
522
0
                    if (distance_candidate < 0)
523
0
                        distance_candidate = 0;
524
525
0
                    int64_t index_candidate = min_indices_scalar[jv] + j0;
526
527
0
                    if (current_min_distance > distance_candidate) {
528
0
                        current_min_distance = distance_candidate;
529
0
                        current_min_index = index_candidate;
530
0
                    } else if (
531
0
                            current_min_distance == distance_candidate &&
532
0
                            current_min_index > index_candidate) {
533
0
                        current_min_index = index_candidate;
534
0
                    }
535
0
                }
536
537
                // process leftovers
538
0
                for (; idx_j < count; idx_j++, ip_line++) {
539
0
                    float ip = *ip_line;
540
0
                    float dis = x_norms[i] + y_norms[idx_j + j0] - 2 * ip;
541
                    // negative values can occur for identical vectors
542
                    //    due to roundoff errors.
543
0
                    if (dis < 0)
544
0
                        dis = 0;
545
546
0
                    if (current_min_distance > dis) {
547
0
                        current_min_distance = dis;
548
0
                        current_min_index = idx_j + j0;
549
0
                    }
550
0
                }
551
552
                //
553
0
                res.add_result(i, current_min_distance, current_min_index);
554
0
            }
555
0
        }
556
        // Does nothing for SingleBestResultHandler, but
557
        // keeping the call for the consistency.
558
0
        res.end_multiple();
559
0
        InterruptCallback::check();
560
0
    }
561
0
}
562
#elif defined(__ARM_FEATURE_SVE)
563
void exhaustive_L2sqr_blas_cmax_sve(
564
        const float* x,
565
        const float* y,
566
        size_t d,
567
        size_t nx,
568
        size_t ny,
569
        Top1BlockResultHandler<CMax<float, int64_t>>& res,
570
        const float* y_norms) {
571
    // BLAS does not like empty matrices
572
    if (nx == 0 || ny == 0)
573
        return;
574
575
    /* block sizes */
576
    const size_t bs_x = distance_compute_blas_query_bs;
577
    const size_t bs_y = distance_compute_blas_database_bs;
578
    // const size_t bs_x = 16, bs_y = 16;
579
    std::unique_ptr<float[]> ip_block(new float[bs_x * bs_y]);
580
    std::unique_ptr<float[]> x_norms(new float[nx]);
581
    std::unique_ptr<float[]> del2;
582
583
    fvec_norms_L2sqr(x_norms.get(), x, d, nx);
584
585
    const size_t lanes = svcntw();
586
587
    if (!y_norms) {
588
        float* y_norms2 = new float[ny];
589
        del2.reset(y_norms2);
590
        fvec_norms_L2sqr(y_norms2, y, d, ny);
591
        y_norms = y_norms2;
592
    }
593
594
    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
595
        size_t i1 = i0 + bs_x;
596
        if (i1 > nx)
597
            i1 = nx;
598
599
        res.begin_multiple(i0, i1);
600
601
        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
602
            size_t j1 = j0 + bs_y;
603
            if (j1 > ny)
604
                j1 = ny;
605
            /* compute the actual dot products */
606
            {
607
                float one = 1, zero = 0;
608
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
609
                sgemm_("Transpose",
610
                       "Not transpose",
611
                       &nyi,
612
                       &nxi,
613
                       &di,
614
                       &one,
615
                       y + j0 * d,
616
                       &di,
617
                       x + i0 * d,
618
                       &di,
619
                       &zero,
620
                       ip_block.get(),
621
                       &nyi);
622
            }
623
#pragma omp parallel for
624
            for (int64_t i = i0; i < i1; i++) {
625
                const size_t count = j1 - j0;
626
                float* ip_line = ip_block.get() + (i - i0) * count;
627
628
                svprfw(svwhilelt_b32_u64(0, count), ip_line, SV_PLDL1KEEP);
629
                svprfw(svwhilelt_b32_u64(lanes, count),
630
                       ip_line + lanes,
631
                       SV_PLDL1KEEP);
632
633
                // Track lanes min distances + lanes min indices.
634
                // All the distances tracked do not take x_norms[i]
635
                //   into account in order to get rid of extra
636
                //   vaddq_f32(x_norms[i], ...) instructions
637
                //   is distance computations.
638
                auto min_distances = svdup_n_f32(res.dis_tab[i] - x_norms[i]);
639
640
                // these indices are local and are relative to j0.
641
                // so, value 0 means j0.
642
                auto min_indices = svdup_n_u32(0u);
643
644
                auto current_indices = svindex_u32(0u, 1u);
645
646
                // process lanes * 2 elements per loop
647
                for (size_t idx_j = 0; idx_j < count;
648
                     idx_j += lanes * 2, ip_line += lanes * 2) {
649
                    svprfw(svwhilelt_b32_u64(idx_j + lanes * 2, count),
650
                           ip_line + lanes * 2,
651
                           SV_PLDL1KEEP);
652
                    svprfw(svwhilelt_b32_u64(idx_j + lanes * 3, count),
653
                           ip_line + lanes * 3,
654
                           SV_PLDL1KEEP);
655
656
                    // mask
657
                    const auto mask_0 = svwhilelt_b32_u64(idx_j, count);
658
                    const auto mask_1 = svwhilelt_b32_u64(idx_j + lanes, count);
659
660
                    // load values for norms
661
                    const auto y_norm_0 =
662
                            svld1_f32(mask_0, y_norms + idx_j + j0 + 0);
663
                    const auto y_norm_1 =
664
                            svld1_f32(mask_1, y_norms + idx_j + j0 + lanes);
665
666
                    // load values for dot products
667
                    const auto ip_0 = svld1_f32(mask_0, ip_line + 0);
668
                    const auto ip_1 = svld1_f32(mask_1, ip_line + lanes);
669
670
                    // compute dis = y_norm[j] - 2 * dot(x_norm[i], y_norm[j]).
671
                    // x_norm[i] was dropped off because it is a constant for a
672
                    // given i. We'll deal with it later.
673
                    const auto distances_0 =
674
                            svmla_n_f32_z(mask_0, y_norm_0, ip_0, -2.f);
675
                    const auto distances_1 =
676
                            svmla_n_f32_z(mask_1, y_norm_1, ip_1, -2.f);
677
678
                    // compare the new distances to the min distances
679
                    // for each of the first group of 4 ARM SIMD components.
680
                    auto comparison =
681
                            svcmpgt_f32(mask_0, min_distances, distances_0);
682
683
                    // update min distances and indices with closest vectors if
684
                    // needed.
685
                    min_distances =
686
                            svsel_f32(comparison, distances_0, min_distances);
687
                    min_indices =
688
                            svsel_u32(comparison, current_indices, min_indices);
689
                    current_indices = svadd_n_u32_x(
690
                            mask_0,
691
                            current_indices,
692
                            static_cast<uint32_t>(lanes));
693
694
                    // compare the new distances to the min distances
695
                    // for each of the second group of 4 ARM SIMD components.
696
                    comparison =
697
                            svcmpgt_f32(mask_1, min_distances, distances_1);
698
699
                    // update min distances and indices with closest vectors if
700
                    // needed.
701
                    min_distances =
702
                            svsel_f32(comparison, distances_1, min_distances);
703
                    min_indices =
704
                            svsel_u32(comparison, current_indices, min_indices);
705
                    current_indices = svadd_n_u32_x(
706
                            mask_1,
707
                            current_indices,
708
                            static_cast<uint32_t>(lanes));
709
                }
710
711
                // add missing x_norms[i]
712
                // negative values can occur for identical vectors
713
                //    due to roundoff errors.
714
                auto mask = svwhilelt_b32_u64(0, count);
715
                min_distances = svadd_n_f32_z(
716
                        svcmpge_n_f32(mask, min_distances, -x_norms[i]),
717
                        min_distances,
718
                        x_norms[i]);
719
                min_indices = svadd_n_u32_x(
720
                        mask, min_indices, static_cast<uint32_t>(j0));
721
                mask = svcmple_n_f32(mask, min_distances, res.dis_tab[i]);
722
                if (svcntp_b32(svptrue_b32(), mask) == 0)
723
                    res.add_result(i, res.dis_tab[i], res.ids_tab[i]);
724
                else {
725
                    const auto min_distance = svminv_f32(mask, min_distances);
726
                    const auto min_index = svminv_u32(
727
                            svcmpeq_n_f32(mask, min_distances, min_distance),
728
                            min_indices);
729
                    res.add_result(i, min_distance, min_index);
730
                }
731
            }
732
        }
733
        // Does nothing for SingleBestResultHandler, but
734
        // keeping the call for the consistency.
735
        res.end_multiple();
736
        InterruptCallback::check();
737
    }
738
}
739
#endif
740
741
// an override if only a single closest point is needed
742
template <>
743
void exhaustive_L2sqr_blas<Top1BlockResultHandler<CMax<float, int64_t>>>(
744
        const float* x,
745
        const float* y,
746
        size_t d,
747
        size_t nx,
748
        size_t ny,
749
        Top1BlockResultHandler<CMax<float, int64_t>>& res,
750
0
        const float* y_norms) {
751
0
#if defined(__AVX2__)
752
    // use a faster fused kernel if available
753
0
    if (exhaustive_L2sqr_fused_cmax(x, y, d, nx, ny, res, y_norms)) {
754
        // the kernel is available and it is complete, we're done.
755
0
        return;
756
0
    }
757
758
    // run the specialized AVX2 implementation
759
0
    exhaustive_L2sqr_blas_cmax_avx2(x, y, d, nx, ny, res, y_norms);
760
761
#elif defined(__ARM_FEATURE_SVE)
762
    // use a faster fused kernel if available
763
    if (exhaustive_L2sqr_fused_cmax(x, y, d, nx, ny, res, y_norms)) {
764
        // the kernel is available and it is complete, we're done.
765
        return;
766
    }
767
768
    // run the specialized SVE implementation
769
    exhaustive_L2sqr_blas_cmax_sve(x, y, d, nx, ny, res, y_norms);
770
771
#elif defined(__aarch64__)
772
    // use a faster fused kernel if available
773
    if (exhaustive_L2sqr_fused_cmax(x, y, d, nx, ny, res, y_norms)) {
774
        // the kernel is available and it is complete, we're done.
775
        return;
776
    }
777
778
    // run the default implementation
779
    exhaustive_L2sqr_blas_default_impl<
780
            Top1BlockResultHandler<CMax<float, int64_t>>>(
781
            x, y, d, nx, ny, res, y_norms);
782
#else
783
    // run the default implementation
784
    exhaustive_L2sqr_blas_default_impl<
785
            Top1BlockResultHandler<CMax<float, int64_t>>>(
786
            x, y, d, nx, ny, res, y_norms);
787
#endif
788
0
}
789
790
struct Run_search_inner_product {
791
    using T = void;
792
    template <class BlockResultHandler>
793
    void f(BlockResultHandler& res,
794
           const float* x,
795
           const float* y,
796
           size_t d,
797
           size_t nx,
798
3
           size_t ny) {
799
3
        if (res.sel || nx < distance_compute_blas_threshold) {
800
3
            exhaustive_inner_product_seq(x, y, d, nx, ny, res);
801
3
        } else {
802
0
            exhaustive_inner_product_blas(x, y, d, nx, ny, res);
803
0
        }
804
3
    }
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmm
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmm
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmm
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmm
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmm
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmm
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmm
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmm
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmm
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmm
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmm
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmm
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmm
distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmm
Line
Count
Source
798
3
           size_t ny) {
799
3
        if (res.sel || nx < distance_compute_blas_threshold) {
800
3
            exhaustive_inner_product_seq(x, y, d, nx, ny, res);
801
3
        } else {
802
0
            exhaustive_inner_product_blas(x, y, d, nx, ny, res);
803
0
        }
804
3
    }
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmm
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_124Run_search_inner_product1fINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmm
805
};
806
807
struct Run_search_L2sqr {
808
    using T = void;
809
    template <class BlockResultHandler>
810
    void f(BlockResultHandler& res,
811
           const float* x,
812
           const float* y,
813
           size_t d,
814
           size_t nx,
815
           size_t ny,
816
0
           const float* y_norm2) {
817
0
        if (res.sel || nx < distance_compute_blas_threshold) {
818
0
            exhaustive_L2sqr_seq(x, y, d, nx, ny, res);
819
0
        } else {
820
0
            exhaustive_L2sqr_blas(x, y, d, nx, ny, res, y_norm2);
821
0
        }
822
0
    }
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22Top1BlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22HeapBlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22Top1BlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22HeapBlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_27ReservoirBlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22Top1BlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22HeapBlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22Top1BlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_22HeapBlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_27ReservoirBlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb1EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_29RangeSearchBlockResultHandlerINS_4CMinIflEELb0EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb1EEEEEvRT_PKfSA_mmmSA_
Unexecuted instantiation: distances.cpp:_ZN5faiss12_GLOBAL__N_116Run_search_L2sqr1fINS_29RangeSearchBlockResultHandlerINS_4CMaxIflEELb0EEEEEvRT_PKfSA_mmmSA_
823
};
824
825
} // anonymous namespace
826
827
/*******************************************************
828
 * KNN driver functions
829
 *******************************************************/
830
831
int distance_compute_blas_threshold = 20;
832
int distance_compute_blas_query_bs = 4096;
833
int distance_compute_blas_database_bs = 1024;
834
int distance_compute_min_k_reservoir = 100;
835
836
void knn_inner_product(
837
        const float* x,
838
        const float* y,
839
        size_t d,
840
        size_t nx,
841
        size_t ny,
842
        size_t k,
843
        float* vals,
844
        int64_t* ids,
845
0
        const IDSelector* sel) {
846
0
    int64_t imin = 0;
847
0
    if (auto selr = dynamic_cast<const IDSelectorRange*>(sel)) {
848
0
        imin = std::max(selr->imin, int64_t(0));
849
0
        int64_t imax = std::min(selr->imax, int64_t(ny));
850
0
        ny = imax - imin;
851
0
        y += d * imin;
852
0
        sel = nullptr;
853
0
    }
854
0
    if (auto sela = dynamic_cast<const IDSelectorArray*>(sel)) {
855
0
        knn_inner_products_by_idx(
856
0
                x, y, sela->ids, d, nx, ny, sela->n, k, vals, ids, 0);
857
0
        return;
858
0
    }
859
860
0
    Run_search_inner_product r;
861
0
    dispatch_knn_ResultHandler(
862
0
            nx, vals, ids, k, METRIC_INNER_PRODUCT, sel, r, x, y, d, nx, ny);
863
864
0
    if (imin != 0) {
865
0
        for (size_t i = 0; i < nx * k; i++) {
866
0
            if (ids[i] >= 0) {
867
0
                ids[i] += imin;
868
0
            }
869
0
        }
870
0
    }
871
0
}
872
873
void knn_inner_product(
874
        const float* x,
875
        const float* y,
876
        size_t d,
877
        size_t nx,
878
        size_t ny,
879
        float_minheap_array_t* res,
880
0
        const IDSelector* sel) {
881
0
    FAISS_THROW_IF_NOT(nx == res->nh);
882
0
    knn_inner_product(x, y, d, nx, ny, res->k, res->val, res->ids, sel);
883
0
}
884
885
void knn_L2sqr(
886
        const float* x,
887
        const float* y,
888
        size_t d,
889
        size_t nx,
890
        size_t ny,
891
        size_t k,
892
        float* vals,
893
        int64_t* ids,
894
        const float* y_norm2,
895
0
        const IDSelector* sel) {
896
0
    int64_t imin = 0;
897
0
    if (auto selr = dynamic_cast<const IDSelectorRange*>(sel)) {
898
0
        imin = std::max(selr->imin, int64_t(0));
899
0
        int64_t imax = std::min(selr->imax, int64_t(ny));
900
0
        ny = imax - imin;
901
0
        y += d * imin;
902
0
        sel = nullptr;
903
0
    }
904
0
    if (auto sela = dynamic_cast<const IDSelectorArray*>(sel)) {
905
0
        knn_L2sqr_by_idx(x, y, sela->ids, d, nx, ny, sela->n, k, vals, ids, 0);
906
0
        return;
907
0
    }
908
909
0
    Run_search_L2sqr r;
910
0
    dispatch_knn_ResultHandler(
911
0
            nx, vals, ids, k, METRIC_L2, sel, r, x, y, d, nx, ny, y_norm2);
912
913
0
    if (imin != 0) {
914
0
        for (size_t i = 0; i < nx * k; i++) {
915
0
            if (ids[i] >= 0) {
916
0
                ids[i] += imin;
917
0
            }
918
0
        }
919
0
    }
920
0
}
921
922
void knn_L2sqr(
923
        const float* x,
924
        const float* y,
925
        size_t d,
926
        size_t nx,
927
        size_t ny,
928
        float_maxheap_array_t* res,
929
        const float* y_norm2,
930
0
        const IDSelector* sel) {
931
0
    FAISS_THROW_IF_NOT(res->nh == nx);
932
0
    knn_L2sqr(x, y, d, nx, ny, res->k, res->val, res->ids, y_norm2, sel);
933
0
}
934
935
/***************************************************************************
936
 * Range search
937
 ***************************************************************************/
938
939
// TODO accept a y_norm2 as well
940
void range_search_L2sqr(
941
        const float* x,
942
        const float* y,
943
        size_t d,
944
        size_t nx,
945
        size_t ny,
946
        float radius,
947
        RangeSearchResult* res,
948
0
        const IDSelector* sel) {
949
0
    Run_search_L2sqr r;
950
0
    dispatch_range_ResultHandler(
951
0
            res, radius, METRIC_L2, sel, r, x, y, d, nx, ny, nullptr);
952
0
}
953
954
void range_search_inner_product(
955
        const float* x,
956
        const float* y,
957
        size_t d,
958
        size_t nx,
959
        size_t ny,
960
        float radius,
961
        RangeSearchResult* res,
962
3
        const IDSelector* sel) {
963
3
    Run_search_inner_product r;
964
3
    dispatch_range_ResultHandler(
965
3
            res, radius, METRIC_INNER_PRODUCT, sel, r, x, y, d, nx, ny);
966
3
}
967
968
/***************************************************************************
969
 * compute a subset of  distances
970
 ***************************************************************************/
971
972
/* compute the inner product between x and a subset y of ny vectors,
973
   whose indices are given by idy.  */
974
void fvec_inner_products_by_idx(
975
        float* __restrict ip,
976
        const float* x,
977
        const float* y,
978
        const int64_t* __restrict ids, /* for y vecs */
979
        size_t d,
980
        size_t nx,
981
0
        size_t ny) {
982
0
#pragma omp parallel for
983
0
    for (int64_t j = 0; j < nx; j++) {
984
0
        const int64_t* __restrict idsj = ids + j * ny;
985
0
        const float* xj = x + j * d;
986
0
        float* __restrict ipj = ip + j * ny;
987
0
        for (size_t i = 0; i < ny; i++) {
988
0
            if (idsj[i] < 0) {
989
0
                ipj[i] = -INFINITY;
990
0
            } else {
991
0
                ipj[i] = fvec_inner_product(xj, y + d * idsj[i], d);
992
0
            }
993
0
        }
994
0
    }
995
0
}
996
997
/* compute the inner product between x and a subset y of ny vectors,
998
   whose indices are given by idy.  */
999
void fvec_L2sqr_by_idx(
1000
        float* __restrict dis,
1001
        const float* x,
1002
        const float* y,
1003
        const int64_t* __restrict ids, /* ids of y vecs */
1004
        size_t d,
1005
        size_t nx,
1006
0
        size_t ny) {
1007
0
#pragma omp parallel for
1008
0
    for (int64_t j = 0; j < nx; j++) {
1009
0
        const int64_t* __restrict idsj = ids + j * ny;
1010
0
        const float* xj = x + j * d;
1011
0
        float* __restrict disj = dis + j * ny;
1012
0
        for (size_t i = 0; i < ny; i++) {
1013
0
            if (idsj[i] < 0) {
1014
0
                disj[i] = INFINITY;
1015
0
            } else {
1016
0
                disj[i] = fvec_L2sqr(xj, y + d * idsj[i], d);
1017
0
            }
1018
0
        }
1019
0
    }
1020
0
}
1021
1022
void pairwise_indexed_L2sqr(
1023
        size_t d,
1024
        size_t n,
1025
        const float* x,
1026
        const int64_t* ix,
1027
        const float* y,
1028
        const int64_t* iy,
1029
0
        float* dis) {
1030
0
#pragma omp parallel for if (n > 1)
1031
0
    for (int64_t j = 0; j < n; j++) {
1032
0
        if (ix[j] >= 0 && iy[j] >= 0) {
1033
0
            dis[j] = fvec_L2sqr(x + d * ix[j], y + d * iy[j], d);
1034
0
        } else {
1035
0
            dis[j] = INFINITY;
1036
0
        }
1037
0
    }
1038
0
}
1039
1040
void pairwise_indexed_inner_product(
1041
        size_t d,
1042
        size_t n,
1043
        const float* x,
1044
        const int64_t* ix,
1045
        const float* y,
1046
        const int64_t* iy,
1047
0
        float* dis) {
1048
0
#pragma omp parallel for if (n > 1)
1049
0
    for (int64_t j = 0; j < n; j++) {
1050
0
        if (ix[j] >= 0 && iy[j] >= 0) {
1051
0
            dis[j] = fvec_inner_product(x + d * ix[j], y + d * iy[j], d);
1052
0
        } else {
1053
0
            dis[j] = -INFINITY;
1054
0
        }
1055
0
    }
1056
0
}
1057
1058
/* Find the nearest neighbors for nx queries in a set of ny vectors
1059
   indexed by ids. May be useful for re-ranking a pre-selected vector list */
1060
void knn_inner_products_by_idx(
1061
        const float* x,
1062
        const float* y,
1063
        const int64_t* ids,
1064
        size_t d,
1065
        size_t nx,
1066
        size_t ny,
1067
        size_t nsubset,
1068
        size_t k,
1069
        float* res_vals,
1070
        int64_t* res_ids,
1071
0
        int64_t ld_ids) {
1072
0
    if (ld_ids < 0) {
1073
0
        ld_ids = ny;
1074
0
    }
1075
1076
0
#pragma omp parallel for if (nx > 100)
1077
0
    for (int64_t i = 0; i < nx; i++) {
1078
0
        const float* x_ = x + i * d;
1079
0
        const int64_t* idsi = ids + i * ld_ids;
1080
0
        size_t j;
1081
0
        float* __restrict simi = res_vals + i * k;
1082
0
        int64_t* __restrict idxi = res_ids + i * k;
1083
0
        minheap_heapify(k, simi, idxi);
1084
1085
0
        for (j = 0; j < nsubset; j++) {
1086
0
            if (idsi[j] < 0 || idsi[j] >= ny) {
1087
0
                break;
1088
0
            }
1089
0
            float ip = fvec_inner_product(x_, y + d * idsi[j], d);
1090
1091
0
            if (ip > simi[0]) {
1092
0
                minheap_replace_top(k, simi, idxi, ip, idsi[j]);
1093
0
            }
1094
0
        }
1095
0
        minheap_reorder(k, simi, idxi);
1096
0
    }
1097
0
}
1098
1099
void knn_L2sqr_by_idx(
1100
        const float* x,
1101
        const float* y,
1102
        const int64_t* __restrict ids,
1103
        size_t d,
1104
        size_t nx,
1105
        size_t ny,
1106
        size_t nsubset,
1107
        size_t k,
1108
        float* res_vals,
1109
        int64_t* res_ids,
1110
0
        int64_t ld_ids) {
1111
0
    if (ld_ids < 0) {
1112
0
        ld_ids = ny;
1113
0
    }
1114
0
#pragma omp parallel for if (nx > 100)
1115
0
    for (int64_t i = 0; i < nx; i++) {
1116
0
        const float* x_ = x + i * d;
1117
0
        const int64_t* __restrict idsi = ids + i * ld_ids;
1118
0
        float* __restrict simi = res_vals + i * k;
1119
0
        int64_t* __restrict idxi = res_ids + i * k;
1120
0
        maxheap_heapify(k, simi, idxi);
1121
0
        for (size_t j = 0; j < nsubset; j++) {
1122
0
            if (idsi[j] < 0 || idsi[j] >= ny) {
1123
0
                break;
1124
0
            }
1125
0
            float disij = fvec_L2sqr(x_, y + d * idsi[j], d);
1126
1127
0
            if (disij < simi[0]) {
1128
0
                maxheap_replace_top(k, simi, idxi, disij, idsi[j]);
1129
0
            }
1130
0
        }
1131
0
        maxheap_reorder(k, simi, idxi);
1132
0
    }
1133
0
}
1134
1135
void pairwise_L2sqr(
1136
        int64_t d,
1137
        int64_t nq,
1138
        const float* xq,
1139
        int64_t nb,
1140
        const float* xb,
1141
        float* dis,
1142
        int64_t ldq,
1143
        int64_t ldb,
1144
0
        int64_t ldd) {
1145
0
    if (nq == 0 || nb == 0)
1146
0
        return;
1147
0
    if (ldq == -1)
1148
0
        ldq = d;
1149
0
    if (ldb == -1)
1150
0
        ldb = d;
1151
0
    if (ldd == -1)
1152
0
        ldd = nb;
1153
1154
    // store in beginning of distance matrix to avoid malloc
1155
0
    float* b_norms = dis;
1156
1157
0
#pragma omp parallel for if (nb > 1)
1158
0
    for (int64_t i = 0; i < nb; i++)
1159
0
        b_norms[i] = fvec_norm_L2sqr(xb + i * ldb, d);
1160
1161
0
#pragma omp parallel for
1162
0
    for (int64_t i = 1; i < nq; i++) {
1163
0
        float q_norm = fvec_norm_L2sqr(xq + i * ldq, d);
1164
0
        for (int64_t j = 0; j < nb; j++)
1165
0
            dis[i * ldd + j] = q_norm + b_norms[j];
1166
0
    }
1167
1168
0
    {
1169
0
        float q_norm = fvec_norm_L2sqr(xq, d);
1170
0
        for (int64_t j = 0; j < nb; j++)
1171
0
            dis[j] += q_norm;
1172
0
    }
1173
1174
0
    {
1175
0
        FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd;
1176
0
        float one = 1.0, minus_2 = -2.0;
1177
1178
0
        sgemm_("Transposed",
1179
0
               "Not transposed",
1180
0
               &nbi,
1181
0
               &nqi,
1182
0
               &di,
1183
0
               &minus_2,
1184
0
               xb,
1185
0
               &ldbi,
1186
0
               xq,
1187
0
               &ldqi,
1188
0
               &one,
1189
0
               dis,
1190
0
               &lddi);
1191
0
    }
1192
0
}
1193
1194
void inner_product_to_L2sqr(
1195
        float* __restrict dis,
1196
        const float* nr1,
1197
        const float* nr2,
1198
        size_t n1,
1199
0
        size_t n2) {
1200
0
#pragma omp parallel for
1201
0
    for (int64_t j = 0; j < n1; j++) {
1202
0
        float* disj = dis + j * n2;
1203
0
        for (size_t i = 0; i < n2; i++)
1204
0
            disj[i] = nr1[j] + nr2[i] - 2 * disj[i];
1205
0
    }
1206
0
}
1207
1208
} // namespace faiss