Coverage Report

Created: 2025-11-26 16:02

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/contrib/faiss/faiss/impl/HNSW.h
Line
Count
Source
1
/*
2
 * Copyright (c) Meta Platforms, Inc. and affiliates.
3
 *
4
 * This source code is licensed under the MIT license found in the
5
 * LICENSE file in the root directory of this source tree.
6
 */
7
8
#pragma once
9
10
#include <queue>
11
#include <unordered_set>
12
#include <vector>
13
14
#include <omp.h>
15
16
#include <faiss/Index.h>
17
#include <faiss/impl/FaissAssert.h>
18
#include <faiss/impl/maybe_owned_vector.h>
19
#include <faiss/impl/platform_macros.h>
20
#include <faiss/utils/Heap.h>
21
#include <faiss/utils/random.h>
22
23
namespace faiss {
24
25
/** Implementation of the Hierarchical Navigable Small World
26
 * datastructure.
27
 *
28
 * Efficient and robust approximate nearest neighbor search using
29
 * Hierarchical Navigable Small World graphs
30
 *
31
 *  Yu. A. Malkov, D. A. Yashunin, arXiv 2017
32
 *
33
 * This implementation is heavily influenced by the NMSlib
34
 * implementation by Yury Malkov and Leonid Boystov
35
 * (https://github.com/searchivarius/nmslib)
36
 *
37
 * The HNSW object stores only the neighbor link structure, see
38
 * IndexHNSW.h for the full index object.
39
 */
40
41
struct VisitedTable;
42
struct DistanceComputer; // from AuxIndexStructures
43
struct HNSWStats;
44
template <class C>
45
struct ResultHandler;
46
47
struct SearchParametersHNSW : SearchParameters {
48
    int efSearch = 16;
49
    bool check_relative_distance = true;
50
    bool bounded_queue = true;
51
52
100
    ~SearchParametersHNSW() {}
53
};
54
55
struct HNSW {
56
    /// internal storage of vectors (32 bits: this is expensive)
57
    using storage_idx_t = int32_t;
58
59
    // for now we do only these distances
60
    using C = CMax<float, int64_t>;
61
62
    typedef std::pair<float, storage_idx_t> Node;
63
64
    /** Heap structure that allows fast
65
     */
66
    struct MinimaxHeap {
67
        int n;
68
        int k;
69
        int nvalid;
70
71
        std::vector<storage_idx_t> ids;
72
        std::vector<float> dis;
73
        typedef faiss::CMax<float, storage_idx_t> HC;
74
75
97
        explicit MinimaxHeap(int n) : n(n), k(0), nvalid(0), ids(n), dis(n) {}
76
77
        void push(storage_idx_t i, float v);
78
79
        float max() const;
80
81
        int size() const;
82
83
        void clear();
84
85
        int pop_min(float* vmin_out = nullptr);
86
87
        int count_below(float thresh);
88
    };
89
90
    /// to sort pairs of (id, distance) from nearest to fathest or the reverse
91
    struct NodeDistCloser {
92
        float d;
93
        int id;
94
3.42M
        NodeDistCloser(float d, int id) : d(d), id(id) {}
95
22.0M
        bool operator<(const NodeDistCloser& obj1) const {
96
22.0M
            return d < obj1.d;
97
22.0M
        }
98
    };
99
100
    struct NodeDistFarther {
101
        float d;
102
        int id;
103
3.52M
        NodeDistFarther(float d, int id) : d(d), id(id) {}
104
20.8M
        bool operator<(const NodeDistFarther& obj1) const {
105
20.8M
            return d > obj1.d;
106
20.8M
        }
107
    };
108
109
    /// assignment probability to each layer (sum=1)
110
    std::vector<double> assign_probas;
111
112
    /// number of neighbors stored per layer (cumulative), should not
113
    /// be changed after first add
114
    std::vector<int> cum_nneighbor_per_level;
115
116
    /// level of each vector (base level = 1), size = ntotal
117
    std::vector<int> levels;
118
119
    /// offsets[i] is the offset in the neighbors array where vector i is stored
120
    /// size ntotal + 1
121
    std::vector<size_t> offsets;
122
123
    /// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i
124
    /// for all levels. this is where all storage goes.
125
    MaybeOwnedVector<storage_idx_t> neighbors;
126
127
    /// entry point in the search structure (one of the points with maximum
128
    /// level
129
    storage_idx_t entry_point = -1;
130
131
    faiss::RandomGenerator rng;
132
133
    /// maximum level
134
    int max_level = -1;
135
136
    /// expansion factor at construction time
137
    int efConstruction = 40;
138
139
    /// expansion factor at search time
140
    int efSearch = 16;
141
142
    /// during search: do we check whether the next best distance is good
143
    /// enough?
144
    bool check_relative_distance = true;
145
146
    /// use bounded queue during exploration
147
    bool search_bounded_queue = true;
148
149
    // methods that initialize the tree sizes
150
151
    /// initialize the assign_probas and cum_nneighbor_per_level to
152
    /// have 2*M links on level 0 and M links on levels > 0
153
    void set_default_probas(int M, float levelMult);
154
155
    /// set nb of neighbors for this level (before adding anything)
156
    void set_nb_neighbors(int level_no, int n);
157
158
    // methods that access the tree sizes
159
160
    /// nb of neighbors for this level
161
    int nb_neighbors(int layer_no) const;
162
163
    /// cumumlative nb up to (and excluding) this level
164
    int cum_nb_neighbors(int layer_no) const;
165
166
    /// range of entries in the neighbors table of vertex no at layer_no
167
    void neighbor_range(idx_t no, int layer_no, size_t* begin, size_t* end)
168
            const;
169
170
    /// only mandatory parameter: nb of neighbors
171
    explicit HNSW(int M = 32);
172
173
    /// pick a random level for a new point
174
    int random_level();
175
176
    /// add n random levels to table (for debugging...)
177
    void fill_with_random_links(size_t n);
178
179
    void add_links_starting_from(
180
            DistanceComputer& ptdis,
181
            storage_idx_t pt_id,
182
            storage_idx_t nearest,
183
            float d_nearest,
184
            int level,
185
            omp_lock_t* locks,
186
            VisitedTable& vt,
187
            bool keep_max_size_level0 = false);
188
189
    /** add point pt_id on all levels <= pt_level and build the link
190
     * structure for them. */
191
    void add_with_locks(
192
            DistanceComputer& ptdis,
193
            int pt_level,
194
            int pt_id,
195
            std::vector<omp_lock_t>& locks,
196
            VisitedTable& vt,
197
            bool keep_max_size_level0 = false);
198
199
    /// search interface for 1 point, single thread
200
    HNSWStats search(
201
            DistanceComputer& qdis,
202
            ResultHandler<C>& res,
203
            VisitedTable& vt,
204
            const SearchParameters* params = nullptr) const;
205
206
    /// search only in level 0 from a given vertex
207
    void search_level_0(
208
            DistanceComputer& qdis,
209
            ResultHandler<C>& res,
210
            idx_t nprobe,
211
            const storage_idx_t* nearest_i,
212
            const float* nearest_d,
213
            int search_type,
214
            HNSWStats& search_stats,
215
            VisitedTable& vt,
216
            const SearchParameters* params = nullptr) const;
217
218
    void reset();
219
220
    void clear_neighbor_tables(int level);
221
    void print_neighbor_stats(int level) const;
222
223
    int prepare_level_tab(size_t n, bool preset_levels = false);
224
225
    static void shrink_neighbor_list(
226
            DistanceComputer& qdis,
227
            std::priority_queue<NodeDistFarther>& input,
228
            std::vector<NodeDistFarther>& output,
229
            int max_size,
230
            bool keep_max_size_level0 = false);
231
232
    void permute_entries(const idx_t* map);
233
};
234
235
struct HNSWStats {
236
    size_t n1 = 0; /// number of vectors searched
237
    size_t n2 =
238
            0; /// number of queries for which the candidate list is exhausted
239
    size_t ndis = 0;  /// number of distances computed
240
    size_t nhops = 0; /// number of hops aka number of edges traversed
241
242
0
    void reset() {
243
0
        n1 = n2 = 0;
244
0
        ndis = 0;
245
0
        nhops = 0;
246
0
    }
247
248
220
    void combine(const HNSWStats& other) {
249
220
        n1 += other.n1;
250
220
        n2 += other.n2;
251
220
        ndis += other.ndis;
252
220
        nhops += other.nhops;
253
220
    }
254
};
255
256
// global var that collects them all
257
FAISS_API extern HNSWStats hnsw_stats;
258
259
int search_from_candidates(
260
        const HNSW& hnsw,
261
        DistanceComputer& qdis,
262
        ResultHandler<HNSW::C>& res,
263
        HNSW::MinimaxHeap& candidates,
264
        VisitedTable& vt,
265
        HNSWStats& stats,
266
        int level,
267
        int nres_in = 0,
268
        const SearchParameters* params = nullptr);
269
270
HNSWStats greedy_update_nearest(
271
        const HNSW& hnsw,
272
        DistanceComputer& qdis,
273
        int level,
274
        HNSW::storage_idx_t& nearest,
275
        float& d_nearest);
276
277
std::priority_queue<HNSW::Node> search_from_candidate_unbounded(
278
        const HNSW& hnsw,
279
        const HNSW::Node& node,
280
        DistanceComputer& qdis,
281
        int ef,
282
        VisitedTable* vt,
283
        HNSWStats& stats);
284
285
void search_neighbors_to_add(
286
        HNSW& hnsw,
287
        DistanceComputer& qdis,
288
        std::priority_queue<HNSW::NodeDistCloser>& results,
289
        int entry_point,
290
        float d_entry_point,
291
        int level,
292
        VisitedTable& vt,
293
        bool reference_version = false);
294
295
} // namespace faiss