/root/doris/contrib/faiss/faiss/impl/HNSW.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | #pragma once |
9 | | |
10 | | #include <queue> |
11 | | #include <unordered_set> |
12 | | #include <vector> |
13 | | |
14 | | #include <omp.h> |
15 | | |
16 | | #include <faiss/Index.h> |
17 | | #include <faiss/impl/FaissAssert.h> |
18 | | #include <faiss/impl/maybe_owned_vector.h> |
19 | | #include <faiss/impl/platform_macros.h> |
20 | | #include <faiss/utils/Heap.h> |
21 | | #include <faiss/utils/random.h> |
22 | | |
23 | | namespace faiss { |
24 | | |
25 | | /** Implementation of the Hierarchical Navigable Small World |
26 | | * datastructure. |
27 | | * |
28 | | * Efficient and robust approximate nearest neighbor search using |
29 | | * Hierarchical Navigable Small World graphs |
30 | | * |
31 | | * Yu. A. Malkov, D. A. Yashunin, arXiv 2017 |
32 | | * |
33 | | * This implementation is heavily influenced by the NMSlib |
34 | | * implementation by Yury Malkov and Leonid Boystov |
35 | | * (https://github.com/searchivarius/nmslib) |
36 | | * |
37 | | * The HNSW object stores only the neighbor link structure, see |
38 | | * IndexHNSW.h for the full index object. |
39 | | */ |
40 | | |
41 | | struct VisitedTable; |
42 | | struct DistanceComputer; // from AuxIndexStructures |
43 | | struct HNSWStats; |
44 | | template <class C> |
45 | | struct ResultHandler; |
46 | | |
47 | | struct SearchParametersHNSW : SearchParameters { |
48 | | int efSearch = 16; |
49 | | bool check_relative_distance = true; |
50 | | bool bounded_queue = true; |
51 | | |
52 | 100 | ~SearchParametersHNSW() {} |
53 | | }; |
54 | | |
55 | | struct HNSW { |
56 | | /// internal storage of vectors (32 bits: this is expensive) |
57 | | using storage_idx_t = int32_t; |
58 | | |
59 | | // for now we do only these distances |
60 | | using C = CMax<float, int64_t>; |
61 | | |
62 | | typedef std::pair<float, storage_idx_t> Node; |
63 | | |
64 | | /** Heap structure that allows fast |
65 | | */ |
66 | | struct MinimaxHeap { |
67 | | int n; |
68 | | int k; |
69 | | int nvalid; |
70 | | |
71 | | std::vector<storage_idx_t> ids; |
72 | | std::vector<float> dis; |
73 | | typedef faiss::CMax<float, storage_idx_t> HC; |
74 | | |
75 | 97 | explicit MinimaxHeap(int n) : n(n), k(0), nvalid(0), ids(n), dis(n) {} |
76 | | |
77 | | void push(storage_idx_t i, float v); |
78 | | |
79 | | float max() const; |
80 | | |
81 | | int size() const; |
82 | | |
83 | | void clear(); |
84 | | |
85 | | int pop_min(float* vmin_out = nullptr); |
86 | | |
87 | | int count_below(float thresh); |
88 | | }; |
89 | | |
90 | | /// to sort pairs of (id, distance) from nearest to fathest or the reverse |
91 | | struct NodeDistCloser { |
92 | | float d; |
93 | | int id; |
94 | 3.42M | NodeDistCloser(float d, int id) : d(d), id(id) {} |
95 | 22.0M | bool operator<(const NodeDistCloser& obj1) const { |
96 | 22.0M | return d < obj1.d; |
97 | 22.0M | } |
98 | | }; |
99 | | |
100 | | struct NodeDistFarther { |
101 | | float d; |
102 | | int id; |
103 | 3.52M | NodeDistFarther(float d, int id) : d(d), id(id) {} |
104 | 20.8M | bool operator<(const NodeDistFarther& obj1) const { |
105 | 20.8M | return d > obj1.d; |
106 | 20.8M | } |
107 | | }; |
108 | | |
109 | | /// assignment probability to each layer (sum=1) |
110 | | std::vector<double> assign_probas; |
111 | | |
112 | | /// number of neighbors stored per layer (cumulative), should not |
113 | | /// be changed after first add |
114 | | std::vector<int> cum_nneighbor_per_level; |
115 | | |
116 | | /// level of each vector (base level = 1), size = ntotal |
117 | | std::vector<int> levels; |
118 | | |
119 | | /// offsets[i] is the offset in the neighbors array where vector i is stored |
120 | | /// size ntotal + 1 |
121 | | std::vector<size_t> offsets; |
122 | | |
123 | | /// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i |
124 | | /// for all levels. this is where all storage goes. |
125 | | MaybeOwnedVector<storage_idx_t> neighbors; |
126 | | |
127 | | /// entry point in the search structure (one of the points with maximum |
128 | | /// level |
129 | | storage_idx_t entry_point = -1; |
130 | | |
131 | | faiss::RandomGenerator rng; |
132 | | |
133 | | /// maximum level |
134 | | int max_level = -1; |
135 | | |
136 | | /// expansion factor at construction time |
137 | | int efConstruction = 40; |
138 | | |
139 | | /// expansion factor at search time |
140 | | int efSearch = 16; |
141 | | |
142 | | /// during search: do we check whether the next best distance is good |
143 | | /// enough? |
144 | | bool check_relative_distance = true; |
145 | | |
146 | | /// use bounded queue during exploration |
147 | | bool search_bounded_queue = true; |
148 | | |
149 | | // methods that initialize the tree sizes |
150 | | |
151 | | /// initialize the assign_probas and cum_nneighbor_per_level to |
152 | | /// have 2*M links on level 0 and M links on levels > 0 |
153 | | void set_default_probas(int M, float levelMult); |
154 | | |
155 | | /// set nb of neighbors for this level (before adding anything) |
156 | | void set_nb_neighbors(int level_no, int n); |
157 | | |
158 | | // methods that access the tree sizes |
159 | | |
160 | | /// nb of neighbors for this level |
161 | | int nb_neighbors(int layer_no) const; |
162 | | |
163 | | /// cumumlative nb up to (and excluding) this level |
164 | | int cum_nb_neighbors(int layer_no) const; |
165 | | |
166 | | /// range of entries in the neighbors table of vertex no at layer_no |
167 | | void neighbor_range(idx_t no, int layer_no, size_t* begin, size_t* end) |
168 | | const; |
169 | | |
170 | | /// only mandatory parameter: nb of neighbors |
171 | | explicit HNSW(int M = 32); |
172 | | |
173 | | /// pick a random level for a new point |
174 | | int random_level(); |
175 | | |
176 | | /// add n random levels to table (for debugging...) |
177 | | void fill_with_random_links(size_t n); |
178 | | |
179 | | void add_links_starting_from( |
180 | | DistanceComputer& ptdis, |
181 | | storage_idx_t pt_id, |
182 | | storage_idx_t nearest, |
183 | | float d_nearest, |
184 | | int level, |
185 | | omp_lock_t* locks, |
186 | | VisitedTable& vt, |
187 | | bool keep_max_size_level0 = false); |
188 | | |
189 | | /** add point pt_id on all levels <= pt_level and build the link |
190 | | * structure for them. */ |
191 | | void add_with_locks( |
192 | | DistanceComputer& ptdis, |
193 | | int pt_level, |
194 | | int pt_id, |
195 | | std::vector<omp_lock_t>& locks, |
196 | | VisitedTable& vt, |
197 | | bool keep_max_size_level0 = false); |
198 | | |
199 | | /// search interface for 1 point, single thread |
200 | | HNSWStats search( |
201 | | DistanceComputer& qdis, |
202 | | ResultHandler<C>& res, |
203 | | VisitedTable& vt, |
204 | | const SearchParameters* params = nullptr) const; |
205 | | |
206 | | /// search only in level 0 from a given vertex |
207 | | void search_level_0( |
208 | | DistanceComputer& qdis, |
209 | | ResultHandler<C>& res, |
210 | | idx_t nprobe, |
211 | | const storage_idx_t* nearest_i, |
212 | | const float* nearest_d, |
213 | | int search_type, |
214 | | HNSWStats& search_stats, |
215 | | VisitedTable& vt, |
216 | | const SearchParameters* params = nullptr) const; |
217 | | |
218 | | void reset(); |
219 | | |
220 | | void clear_neighbor_tables(int level); |
221 | | void print_neighbor_stats(int level) const; |
222 | | |
223 | | int prepare_level_tab(size_t n, bool preset_levels = false); |
224 | | |
225 | | static void shrink_neighbor_list( |
226 | | DistanceComputer& qdis, |
227 | | std::priority_queue<NodeDistFarther>& input, |
228 | | std::vector<NodeDistFarther>& output, |
229 | | int max_size, |
230 | | bool keep_max_size_level0 = false); |
231 | | |
232 | | void permute_entries(const idx_t* map); |
233 | | }; |
234 | | |
235 | | struct HNSWStats { |
236 | | size_t n1 = 0; /// number of vectors searched |
237 | | size_t n2 = |
238 | | 0; /// number of queries for which the candidate list is exhausted |
239 | | size_t ndis = 0; /// number of distances computed |
240 | | size_t nhops = 0; /// number of hops aka number of edges traversed |
241 | | |
242 | 0 | void reset() { |
243 | 0 | n1 = n2 = 0; |
244 | 0 | ndis = 0; |
245 | 0 | nhops = 0; |
246 | 0 | } |
247 | | |
248 | 220 | void combine(const HNSWStats& other) { |
249 | 220 | n1 += other.n1; |
250 | 220 | n2 += other.n2; |
251 | 220 | ndis += other.ndis; |
252 | 220 | nhops += other.nhops; |
253 | 220 | } |
254 | | }; |
255 | | |
256 | | // global var that collects them all |
257 | | FAISS_API extern HNSWStats hnsw_stats; |
258 | | |
259 | | int search_from_candidates( |
260 | | const HNSW& hnsw, |
261 | | DistanceComputer& qdis, |
262 | | ResultHandler<HNSW::C>& res, |
263 | | HNSW::MinimaxHeap& candidates, |
264 | | VisitedTable& vt, |
265 | | HNSWStats& stats, |
266 | | int level, |
267 | | int nres_in = 0, |
268 | | const SearchParameters* params = nullptr); |
269 | | |
270 | | HNSWStats greedy_update_nearest( |
271 | | const HNSW& hnsw, |
272 | | DistanceComputer& qdis, |
273 | | int level, |
274 | | HNSW::storage_idx_t& nearest, |
275 | | float& d_nearest); |
276 | | |
277 | | std::priority_queue<HNSW::Node> search_from_candidate_unbounded( |
278 | | const HNSW& hnsw, |
279 | | const HNSW::Node& node, |
280 | | DistanceComputer& qdis, |
281 | | int ef, |
282 | | VisitedTable* vt, |
283 | | HNSWStats& stats); |
284 | | |
285 | | void search_neighbors_to_add( |
286 | | HNSW& hnsw, |
287 | | DistanceComputer& qdis, |
288 | | std::priority_queue<HNSW::NodeDistCloser>& results, |
289 | | int entry_point, |
290 | | float d_entry_point, |
291 | | int level, |
292 | | VisitedTable& vt, |
293 | | bool reference_version = false); |
294 | | |
295 | | } // namespace faiss |