/root/doris/contrib/faiss/faiss/IndexIVF.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | // -*- c++ -*- |
9 | | |
10 | | #ifndef FAISS_INDEX_IVF_H |
11 | | #define FAISS_INDEX_IVF_H |
12 | | |
13 | | #include <stdint.h> |
14 | | #include <memory> |
15 | | #include <unordered_map> |
16 | | #include <vector> |
17 | | |
18 | | #include <faiss/Clustering.h> |
19 | | #include <faiss/Index.h> |
20 | | #include <faiss/impl/IDSelector.h> |
21 | | #include <faiss/impl/platform_macros.h> |
22 | | #include <faiss/invlists/DirectMap.h> |
23 | | #include <faiss/invlists/InvertedLists.h> |
24 | | #include <faiss/utils/Heap.h> |
25 | | |
26 | | namespace faiss { |
27 | | |
28 | | /** Encapsulates a quantizer object for the IndexIVF |
29 | | * |
30 | | * The class isolates the fields that are independent of the storage |
31 | | * of the lists (especially training) |
32 | | */ |
33 | | struct Level1Quantizer { |
34 | | /// quantizer that maps vectors to inverted lists |
35 | | Index* quantizer = nullptr; |
36 | | |
37 | | /// number of inverted lists |
38 | | size_t nlist = 0; |
39 | | |
40 | | /** |
41 | | * = 0: use the quantizer as index in a kmeans training |
42 | | * = 1: just pass on the training set to the train() of the quantizer |
43 | | * = 2: kmeans training on a flat index + add the centroids to the quantizer |
44 | | */ |
45 | | char quantizer_trains_alone = 0; |
46 | | bool own_fields = false; ///< whether object owns the quantizer |
47 | | |
48 | | ClusteringParameters cp; ///< to override default clustering params |
49 | | /// to override index used during clustering |
50 | | Index* clustering_index = nullptr; |
51 | | |
52 | | /// Trains the quantizer and calls train_residual to train sub-quantizers |
53 | | void train_q1( |
54 | | size_t n, |
55 | | const float* x, |
56 | | bool verbose, |
57 | | MetricType metric_type); |
58 | | |
59 | | /// compute the number of bytes required to store list ids |
60 | | size_t coarse_code_size() const; |
61 | | void encode_listno(idx_t list_no, uint8_t* code) const; |
62 | | idx_t decode_listno(const uint8_t* code) const; |
63 | | |
64 | | Level1Quantizer(Index* quantizer, size_t nlist); |
65 | | |
66 | | Level1Quantizer(); |
67 | | |
68 | | ~Level1Quantizer(); |
69 | | }; |
70 | | |
71 | | struct SearchParametersIVF : SearchParameters { |
72 | | size_t nprobe = 1; ///< number of probes at query time |
73 | | size_t max_codes = 0; ///< max nb of codes to visit to do a query |
74 | | SearchParameters* quantizer_params = nullptr; |
75 | | /// context object to pass to InvertedLists |
76 | | void* inverted_list_context = nullptr; |
77 | | |
78 | 0 | virtual ~SearchParametersIVF() {} |
79 | | }; |
80 | | |
81 | | // the new convention puts the index type after SearchParameters |
82 | | using IVFSearchParameters = SearchParametersIVF; |
83 | | |
84 | | struct InvertedListScanner; |
85 | | struct IndexIVFStats; |
86 | | struct CodePacker; |
87 | | |
88 | | struct IndexIVFInterface : Level1Quantizer { |
89 | | size_t nprobe = 1; ///< number of probes at query time |
90 | | size_t max_codes = 0; ///< max nb of codes to visit to do a query |
91 | | |
92 | | explicit IndexIVFInterface(Index* quantizer = nullptr, size_t nlist = 0) |
93 | 0 | : Level1Quantizer(quantizer, nlist) {} |
94 | | |
95 | | /** search a set of vectors, that are pre-quantized by the IVF |
96 | | * quantizer. Fill in the corresponding heaps with the query |
97 | | * results. The default implementation uses InvertedListScanners |
98 | | * to do the search. |
99 | | * |
100 | | * @param n nb of vectors to query |
101 | | * @param x query vectors, size nx * d |
102 | | * @param assign coarse quantization indices, size nx * nprobe |
103 | | * @param centroid_dis |
104 | | * distances to coarse centroids, size nx * nprobe |
105 | | * @param distance |
106 | | * output distances, size n * k |
107 | | * @param labels output labels, size n * k |
108 | | * @param store_pairs store inv list index + inv list offset |
109 | | * instead in upper/lower 32 bit of result, |
110 | | * instead of ids (used for reranking). |
111 | | * @param params used to override the object's search parameters |
112 | | * @param stats search stats to be updated (can be null) |
113 | | */ |
114 | | virtual void search_preassigned( |
115 | | idx_t n, |
116 | | const float* x, |
117 | | idx_t k, |
118 | | const idx_t* assign, |
119 | | const float* centroid_dis, |
120 | | float* distances, |
121 | | idx_t* labels, |
122 | | bool store_pairs, |
123 | | const IVFSearchParameters* params = nullptr, |
124 | | IndexIVFStats* stats = nullptr) const = 0; |
125 | | |
126 | | /** Range search a set of vectors, that are pre-quantized by the IVF |
127 | | * quantizer. Fill in the RangeSearchResults results. The default |
128 | | * implementation uses InvertedListScanners to do the search. |
129 | | * |
130 | | * @param n nb of vectors to query |
131 | | * @param x query vectors, size nx * d |
132 | | * @param assign coarse quantization indices, size nx * nprobe |
133 | | * @param centroid_dis |
134 | | * distances to coarse centroids, size nx * nprobe |
135 | | * @param result Output results |
136 | | * @param store_pairs store inv list index + inv list offset |
137 | | * instead in upper/lower 32 bit of result, |
138 | | * instead of ids (used for reranking). |
139 | | * @param params used to override the object's search parameters |
140 | | * @param stats search stats to be updated (can be null) |
141 | | */ |
142 | | virtual void range_search_preassigned( |
143 | | idx_t nx, |
144 | | const float* x, |
145 | | float radius, |
146 | | const idx_t* keys, |
147 | | const float* coarse_dis, |
148 | | RangeSearchResult* result, |
149 | | bool store_pairs = false, |
150 | | const IVFSearchParameters* params = nullptr, |
151 | | IndexIVFStats* stats = nullptr) const = 0; |
152 | | |
153 | 0 | virtual ~IndexIVFInterface() {} |
154 | | }; |
155 | | |
156 | | /** Index based on a inverted file (IVF) |
157 | | * |
158 | | * In the inverted file, the quantizer (an Index instance) provides a |
159 | | * quantization index for each vector to be added. The quantization |
160 | | * index maps to a list (aka inverted list or posting list), where the |
161 | | * id of the vector is stored. |
162 | | * |
163 | | * The inverted list object is required only after trainng. If none is |
164 | | * set externally, an ArrayInvertedLists is used automatically. |
165 | | * |
166 | | * At search time, the vector to be searched is also quantized, and |
167 | | * only the list corresponding to the quantization index is |
168 | | * searched. This speeds up the search by making it |
169 | | * non-exhaustive. This can be relaxed using multi-probe search: a few |
170 | | * (nprobe) quantization indices are selected and several inverted |
171 | | * lists are visited. |
172 | | * |
173 | | * Sub-classes implement a post-filtering of the index that refines |
174 | | * the distance estimation from the query to databse vectors. |
175 | | */ |
176 | | struct IndexIVF : Index, IndexIVFInterface { |
177 | | /// Access to the actual data |
178 | | InvertedLists* invlists = nullptr; |
179 | | bool own_invlists = false; |
180 | | |
181 | | size_t code_size = 0; ///< code size per vector in bytes |
182 | | |
183 | | /** Parallel mode determines how queries are parallelized with OpenMP |
184 | | * |
185 | | * 0 (default): split over queries |
186 | | * 1: parallelize over inverted lists |
187 | | * 2: parallelize over both |
188 | | * 3: split over queries with a finer granularity |
189 | | * |
190 | | * PARALLEL_MODE_NO_HEAP_INIT: binary or with the previous to |
191 | | * prevent the heap to be initialized and finalized |
192 | | */ |
193 | | int parallel_mode = 0; |
194 | | const int PARALLEL_MODE_NO_HEAP_INIT = 1024; |
195 | | |
196 | | /** optional map that maps back ids to invlist entries. This |
197 | | * enables reconstruct() */ |
198 | | DirectMap direct_map; |
199 | | |
200 | | /// do the codes in the invlists encode the vectors relative to the |
201 | | /// centroids? |
202 | | bool by_residual = true; |
203 | | |
204 | | /** The Inverted file takes a quantizer (an Index) on input, |
205 | | * which implements the function mapping a vector to a list |
206 | | * identifier. |
207 | | */ |
208 | | IndexIVF( |
209 | | Index* quantizer, |
210 | | size_t d, |
211 | | size_t nlist, |
212 | | size_t code_size, |
213 | | MetricType metric = METRIC_L2); |
214 | | |
215 | | void reset() override; |
216 | | |
217 | | /// Trains the quantizer and calls train_encoder to train sub-quantizers |
218 | | void train(idx_t n, const float* x) override; |
219 | | |
220 | | /// Calls add_with_ids with NULL ids |
221 | | void add(idx_t n, const float* x) override; |
222 | | |
223 | | /// default implementation that calls encode_vectors |
224 | | void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; |
225 | | |
226 | | /** Implementation of vector addition where the vector assignments are |
227 | | * predefined. The default implementation hands over the code extraction to |
228 | | * encode_vectors. |
229 | | * |
230 | | * @param precomputed_idx quantization indices for the input vectors |
231 | | * (size n) |
232 | | */ |
233 | | virtual void add_core( |
234 | | idx_t n, |
235 | | const float* x, |
236 | | const idx_t* xids, |
237 | | const idx_t* precomputed_idx, |
238 | | void* inverted_list_context = nullptr); |
239 | | |
240 | | /** Encodes a set of vectors as they would appear in the inverted lists |
241 | | * |
242 | | * @param list_nos inverted list ids as returned by the |
243 | | * quantizer (size n). -1s are ignored. |
244 | | * @param codes output codes, size n * code_size |
245 | | * @param include_listno |
246 | | * include the list ids in the code (in this case add |
247 | | * ceil(log8(nlist)) to the code size) |
248 | | */ |
249 | | virtual void encode_vectors( |
250 | | idx_t n, |
251 | | const float* x, |
252 | | const idx_t* list_nos, |
253 | | uint8_t* codes, |
254 | | bool include_listno = false) const = 0; |
255 | | |
256 | | /** Add vectors that are computed with the standalone codec |
257 | | * |
258 | | * @param codes codes to add size n * sa_code_size() |
259 | | * @param xids corresponding ids, size n |
260 | | */ |
261 | | void add_sa_codes(idx_t n, const uint8_t* codes, const idx_t* xids) |
262 | | override; |
263 | | |
264 | | /** Train the encoder for the vectors. |
265 | | * |
266 | | * If by_residual then it is called with residuals and corresponding assign |
267 | | * array, otherwise x is the raw training vectors and assign=nullptr */ |
268 | | virtual void train_encoder(idx_t n, const float* x, const idx_t* assign); |
269 | | |
270 | | /// can be redefined by subclasses to indicate how many training vectors |
271 | | /// they need |
272 | | virtual idx_t train_encoder_num_vectors() const; |
273 | | |
274 | | void search_preassigned( |
275 | | idx_t n, |
276 | | const float* x, |
277 | | idx_t k, |
278 | | const idx_t* assign, |
279 | | const float* centroid_dis, |
280 | | float* distances, |
281 | | idx_t* labels, |
282 | | bool store_pairs, |
283 | | const IVFSearchParameters* params = nullptr, |
284 | | IndexIVFStats* stats = nullptr) const override; |
285 | | |
286 | | void range_search_preassigned( |
287 | | idx_t nx, |
288 | | const float* x, |
289 | | float radius, |
290 | | const idx_t* keys, |
291 | | const float* coarse_dis, |
292 | | RangeSearchResult* result, |
293 | | bool store_pairs = false, |
294 | | const IVFSearchParameters* params = nullptr, |
295 | | IndexIVFStats* stats = nullptr) const override; |
296 | | |
297 | | /** assign the vectors, then call search_preassign */ |
298 | | void search( |
299 | | idx_t n, |
300 | | const float* x, |
301 | | idx_t k, |
302 | | float* distances, |
303 | | idx_t* labels, |
304 | | const SearchParameters* params = nullptr) const override; |
305 | | |
306 | | void range_search( |
307 | | idx_t n, |
308 | | const float* x, |
309 | | float radius, |
310 | | RangeSearchResult* result, |
311 | | const SearchParameters* params = nullptr) const override; |
312 | | |
313 | | /** Get a scanner for this index (store_pairs means ignore labels) |
314 | | * |
315 | | * The default search implementation uses this to compute the distances. |
316 | | * Use sel instead of params->sel, because sel is initialized with |
317 | | * params->sel, but may get overridden by IndexIVF's internal logic. |
318 | | */ |
319 | | virtual InvertedListScanner* get_InvertedListScanner( |
320 | | bool store_pairs = false, |
321 | | const IDSelector* sel = nullptr, |
322 | | const IVFSearchParameters* params = nullptr) const; |
323 | | |
324 | | /** reconstruct a vector. Works only if maintain_direct_map is set to 1 or 2 |
325 | | */ |
326 | | void reconstruct(idx_t key, float* recons) const override; |
327 | | |
328 | | /** Update a subset of vectors. |
329 | | * |
330 | | * The index must have a direct_map |
331 | | * |
332 | | * @param nv nb of vectors to update |
333 | | * @param idx vector indices to update, size nv |
334 | | * @param v vectors of new values, size nv*d |
335 | | */ |
336 | | virtual void update_vectors(int nv, const idx_t* idx, const float* v); |
337 | | |
338 | | /** Reconstruct a subset of the indexed vectors. |
339 | | * |
340 | | * Overrides default implementation to bypass reconstruct() which requires |
341 | | * direct_map to be maintained. |
342 | | * |
343 | | * @param i0 first vector to reconstruct |
344 | | * @param ni nb of vectors to reconstruct |
345 | | * @param recons output array of reconstructed vectors, size ni * d |
346 | | */ |
347 | | void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override; |
348 | | |
349 | | /** Similar to search, but also reconstructs the stored vectors (or an |
350 | | * approximation in the case of lossy coding) for the search results. |
351 | | * |
352 | | * Overrides default implementation to avoid having to maintain direct_map |
353 | | * and instead fetch the code offsets through the `store_pairs` flag in |
354 | | * search_preassigned(). |
355 | | * |
356 | | * @param recons reconstructed vectors size (n, k, d) |
357 | | */ |
358 | | void search_and_reconstruct( |
359 | | idx_t n, |
360 | | const float* x, |
361 | | idx_t k, |
362 | | float* distances, |
363 | | idx_t* labels, |
364 | | float* recons, |
365 | | const SearchParameters* params = nullptr) const override; |
366 | | |
367 | | /** Similar to search, but also returns the codes corresponding to the |
368 | | * stored vectors for the search results. |
369 | | * |
370 | | * @param codes codes (n, k, code_size) |
371 | | * @param include_listno |
372 | | * include the list ids in the code (in this case add |
373 | | * ceil(log8(nlist)) to the code size) |
374 | | */ |
375 | | void search_and_return_codes( |
376 | | idx_t n, |
377 | | const float* x, |
378 | | idx_t k, |
379 | | float* distances, |
380 | | idx_t* labels, |
381 | | uint8_t* recons, |
382 | | bool include_listno = false, |
383 | | const SearchParameters* params = nullptr) const; |
384 | | |
385 | | /** Reconstruct a vector given the location in terms of (inv list index + |
386 | | * inv list offset) instead of the id. |
387 | | * |
388 | | * Useful for reconstructing when the direct_map is not maintained and |
389 | | * the inv list offset is computed by search_preassigned() with |
390 | | * `store_pairs` set. |
391 | | */ |
392 | | virtual void reconstruct_from_offset( |
393 | | int64_t list_no, |
394 | | int64_t offset, |
395 | | float* recons) const; |
396 | | |
397 | | /// Dataset manipulation functions |
398 | | |
399 | | size_t remove_ids(const IDSelector& sel) override; |
400 | | |
401 | | void check_compatible_for_merge(const Index& otherIndex) const override; |
402 | | |
403 | | virtual void merge_from(Index& otherIndex, idx_t add_id) override; |
404 | | |
405 | | // returns a new instance of a CodePacker |
406 | | virtual CodePacker* get_CodePacker() const; |
407 | | |
408 | | /** copy a subset of the entries index to the other index |
409 | | * see Invlists::copy_subset_to for the meaning of subset_type |
410 | | */ |
411 | | virtual void copy_subset_to( |
412 | | IndexIVF& other, |
413 | | InvertedLists::subset_type_t subset_type, |
414 | | idx_t a1, |
415 | | idx_t a2) const; |
416 | | |
417 | | ~IndexIVF() override; |
418 | | |
419 | 0 | size_t get_list_size(size_t list_no) const { |
420 | 0 | return invlists->list_size(list_no); |
421 | 0 | } |
422 | | |
423 | | /// are the ids sorted? |
424 | | bool check_ids_sorted() const; |
425 | | |
426 | | /** initialize a direct map |
427 | | * |
428 | | * @param new_maintain_direct_map if true, create a direct map, |
429 | | * else clear it |
430 | | */ |
431 | | void make_direct_map(bool new_maintain_direct_map = true); |
432 | | |
433 | | void set_direct_map_type(DirectMap::Type type); |
434 | | |
435 | | /// replace the inverted lists, old one is deallocated if own_invlists |
436 | | void replace_invlists(InvertedLists* il, bool own = false); |
437 | | |
438 | | /* The standalone codec interface (except sa_decode that is specific) */ |
439 | | size_t sa_code_size() const override; |
440 | | |
441 | | /** encode a set of vectors |
442 | | * sa_encode will call encode_vectors with include_listno=true |
443 | | * @param n nb of vectors to encode |
444 | | * @param x the vectors to encode |
445 | | * @param bytes output array for the codes |
446 | | * @return nb of bytes written to codes |
447 | | */ |
448 | | void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; |
449 | | |
450 | | IndexIVF(); |
451 | | }; |
452 | | |
453 | | struct RangeQueryResult; |
454 | | |
455 | | /** Object that handles a query. The inverted lists to scan are |
456 | | * provided externally. The object has a lot of state, but |
457 | | * distance_to_code and scan_codes can be called in multiple |
458 | | * threads */ |
459 | | struct InvertedListScanner { |
460 | | idx_t list_no = -1; ///< remember current list |
461 | | bool keep_max = false; ///< keep maximum instead of minimum |
462 | | /// store positions in invlists rather than labels |
463 | | bool store_pairs; |
464 | | |
465 | | /// search in this subset of ids |
466 | | const IDSelector* sel; |
467 | | |
468 | | InvertedListScanner( |
469 | | bool store_pairs = false, |
470 | | const IDSelector* sel = nullptr) |
471 | 0 | : store_pairs(store_pairs), sel(sel) {} |
472 | | |
473 | | /// used in default implementation of scan_codes |
474 | | size_t code_size = 0; |
475 | | |
476 | | /// from now on we handle this query. |
477 | | virtual void set_query(const float* query_vector) = 0; |
478 | | |
479 | | /// following codes come from this inverted list |
480 | | virtual void set_list(idx_t list_no, float coarse_dis) = 0; |
481 | | |
482 | | /// compute a single query-to-code distance |
483 | | virtual float distance_to_code(const uint8_t* code) const = 0; |
484 | | |
485 | | /** scan a set of codes, compute distances to current query and |
486 | | * update heap of results if necessary. Default implementation |
487 | | * calls distance_to_code. |
488 | | * |
489 | | * @param n number of codes to scan |
490 | | * @param codes codes to scan (n * code_size) |
491 | | * @param ids corresponding ids (ignored if store_pairs) |
492 | | * @param distances heap distances (size k) |
493 | | * @param labels heap labels (size k) |
494 | | * @param k heap size |
495 | | * @return number of heap updates performed |
496 | | */ |
497 | | virtual size_t scan_codes( |
498 | | size_t n, |
499 | | const uint8_t* codes, |
500 | | const idx_t* ids, |
501 | | float* distances, |
502 | | idx_t* labels, |
503 | | size_t k) const; |
504 | | |
505 | | // same as scan_codes, using an iterator |
506 | | virtual size_t iterate_codes( |
507 | | InvertedListsIterator* iterator, |
508 | | float* distances, |
509 | | idx_t* labels, |
510 | | size_t k, |
511 | | size_t& list_size) const; |
512 | | |
513 | | /** scan a set of codes, compute distances to current query and |
514 | | * update results if distances are below radius |
515 | | * |
516 | | * (default implementation fails) */ |
517 | | virtual void scan_codes_range( |
518 | | size_t n, |
519 | | const uint8_t* codes, |
520 | | const idx_t* ids, |
521 | | float radius, |
522 | | RangeQueryResult& result) const; |
523 | | |
524 | | // same as scan_codes_range, using an iterator |
525 | | virtual void iterate_codes_range( |
526 | | InvertedListsIterator* iterator, |
527 | | float radius, |
528 | | RangeQueryResult& result, |
529 | | size_t& list_size) const; |
530 | | |
531 | 0 | virtual ~InvertedListScanner() {} |
532 | | }; |
533 | | |
534 | | // whether to check that coarse quantizers are the same |
535 | | FAISS_API extern bool check_compatible_for_merge_expensive_check; |
536 | | |
537 | | struct IndexIVFStats { |
538 | | size_t nq; // nb of queries run |
539 | | size_t nlist; // nb of inverted lists scanned |
540 | | size_t ndis; // nb of distances computed |
541 | | size_t nheap_updates; // nb of times the heap was updated |
542 | | double quantization_time; // time spent quantizing vectors (in ms) |
543 | | double search_time; // time spent searching lists (in ms) |
544 | | |
545 | 9 | IndexIVFStats() { |
546 | 9 | reset(); |
547 | 9 | } |
548 | | void reset(); |
549 | | void add(const IndexIVFStats& other); |
550 | | }; |
551 | | |
552 | | // global var that collects them all |
553 | | FAISS_API extern IndexIVFStats indexIVF_stats; |
554 | | |
555 | | } // namespace faiss |
556 | | |
557 | | #endif |