/root/doris/contrib/faiss/faiss/IndexBinaryIVF.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | // -*- c++ -*- |
9 | | |
10 | | #ifndef FAISS_INDEX_BINARY_IVF_H |
11 | | #define FAISS_INDEX_BINARY_IVF_H |
12 | | |
13 | | #include <vector> |
14 | | |
15 | | #include <faiss/Clustering.h> |
16 | | #include <faiss/IndexBinary.h> |
17 | | #include <faiss/IndexIVF.h> |
18 | | #include <faiss/utils/Heap.h> |
19 | | |
20 | | namespace faiss { |
21 | | |
22 | | struct BinaryInvertedListScanner; |
23 | | |
24 | | /** Index based on a inverted file (IVF) |
25 | | * |
26 | | * In the inverted file, the quantizer (an IndexBinary instance) provides a |
27 | | * quantization index for each vector to be added. The quantization |
28 | | * index maps to a list (aka inverted list or posting list), where the |
29 | | * id of the vector is stored. |
30 | | * |
31 | | * Otherwise the object is similar to the IndexIVF |
32 | | */ |
33 | | struct IndexBinaryIVF : IndexBinary { |
34 | | /// Access to the actual data |
35 | | InvertedLists* invlists = nullptr; |
36 | | bool own_invlists = true; |
37 | | |
38 | | size_t nprobe = 1; ///< number of probes at query time |
39 | | size_t max_codes = 0; ///< max nb of codes to visit to do a query |
40 | | |
41 | | /** Select between using a heap or counting to select the k smallest values |
42 | | * when scanning inverted lists. |
43 | | */ |
44 | | bool use_heap = true; |
45 | | |
46 | | /** collect computations per batch */ |
47 | | bool per_invlist_search = false; |
48 | | |
49 | | /// map for direct access to the elements. Enables reconstruct(). |
50 | | DirectMap direct_map; |
51 | | |
52 | | /// quantizer that maps vectors to inverted lists |
53 | | IndexBinary* quantizer = nullptr; |
54 | | |
55 | | /// number of possible key values |
56 | | size_t nlist = 0; |
57 | | |
58 | | /// whether object owns the quantizer |
59 | | bool own_fields = false; |
60 | | |
61 | | ClusteringParameters cp; ///< to override default clustering params |
62 | | |
63 | | /// to override index used during clustering |
64 | | Index* clustering_index = nullptr; |
65 | | |
66 | | /** The Inverted file takes a quantizer (an IndexBinary) on input, |
67 | | * which implements the function mapping a vector to a list |
68 | | * identifier. The pointer is borrowed: the quantizer should not |
69 | | * be deleted while the IndexBinaryIVF is in use. |
70 | | */ |
71 | | IndexBinaryIVF(IndexBinary* quantizer, size_t d, size_t nlist); |
72 | | |
73 | | IndexBinaryIVF(); |
74 | | |
75 | | ~IndexBinaryIVF() override; |
76 | | |
77 | | void reset() override; |
78 | | |
79 | | /// Trains the quantizer |
80 | | void train(idx_t n, const uint8_t* x) override; |
81 | | |
82 | | void add(idx_t n, const uint8_t* x) override; |
83 | | |
84 | | void add_with_ids(idx_t n, const uint8_t* x, const idx_t* xids) override; |
85 | | |
86 | | /** Implementation of vector addition where the vector assignments are |
87 | | * predefined. |
88 | | * |
89 | | * @param precomputed_idx quantization indices for the input vectors |
90 | | * (size n) |
91 | | */ |
92 | | void add_core( |
93 | | idx_t n, |
94 | | const uint8_t* x, |
95 | | const idx_t* xids, |
96 | | const idx_t* precomputed_idx); |
97 | | |
98 | | /** Search a set of vectors, that are pre-quantized by the IVF |
99 | | * quantizer. Fill in the corresponding heaps with the query |
100 | | * results. search() calls this. |
101 | | * |
102 | | * @param n nb of vectors to query |
103 | | * @param x query vectors, size nx * d |
104 | | * @param assign coarse quantization indices, size nx * nprobe |
105 | | * @param centroid_dis |
106 | | * distances to coarse centroids, size nx * nprobe |
107 | | * @param distance |
108 | | * output distances, size n * k |
109 | | * @param labels output labels, size n * k |
110 | | * @param store_pairs store inv list index + inv list offset |
111 | | * instead in upper/lower 32 bit of result, |
112 | | * instead of ids (used for reranking). |
113 | | * @param params used to override the object's search parameters |
114 | | */ |
115 | | void search_preassigned( |
116 | | idx_t n, |
117 | | const uint8_t* x, |
118 | | idx_t k, |
119 | | const idx_t* assign, |
120 | | const int32_t* centroid_dis, |
121 | | int32_t* distances, |
122 | | idx_t* labels, |
123 | | bool store_pairs, |
124 | | const IVFSearchParameters* params = nullptr) const; |
125 | | |
126 | | virtual BinaryInvertedListScanner* get_InvertedListScanner( |
127 | | bool store_pairs = false) const; |
128 | | |
129 | | /** assign the vectors, then call search_preassign */ |
130 | | void search( |
131 | | idx_t n, |
132 | | const uint8_t* x, |
133 | | idx_t k, |
134 | | int32_t* distances, |
135 | | idx_t* labels, |
136 | | const SearchParameters* params = nullptr) const override; |
137 | | |
138 | | void range_search( |
139 | | idx_t n, |
140 | | const uint8_t* x, |
141 | | int radius, |
142 | | RangeSearchResult* result, |
143 | | const SearchParameters* params = nullptr) const override; |
144 | | |
145 | | void range_search_preassigned( |
146 | | idx_t n, |
147 | | const uint8_t* x, |
148 | | int radius, |
149 | | const idx_t* assign, |
150 | | const int32_t* centroid_dis, |
151 | | RangeSearchResult* result) const; |
152 | | |
153 | | void reconstruct(idx_t key, uint8_t* recons) const override; |
154 | | |
155 | | /** Reconstruct a subset of the indexed vectors. |
156 | | * |
157 | | * Overrides default implementation to bypass reconstruct() which requires |
158 | | * direct_map to be maintained. |
159 | | * |
160 | | * @param i0 first vector to reconstruct |
161 | | * @param ni nb of vectors to reconstruct |
162 | | * @param recons output array of reconstructed vectors, size ni * d / 8 |
163 | | */ |
164 | | void reconstruct_n(idx_t i0, idx_t ni, uint8_t* recons) const override; |
165 | | |
166 | | /** Similar to search, but also reconstructs the stored vectors (or an |
167 | | * approximation in the case of lossy coding) for the search results. |
168 | | * |
169 | | * Overrides default implementation to avoid having to maintain direct_map |
170 | | * and instead fetch the code offsets through the `store_pairs` flag in |
171 | | * search_preassigned(). |
172 | | * |
173 | | * @param recons reconstructed vectors size (n, k, d / 8) |
174 | | */ |
175 | | void search_and_reconstruct( |
176 | | idx_t n, |
177 | | const uint8_t* x, |
178 | | idx_t k, |
179 | | int32_t* distances, |
180 | | idx_t* labels, |
181 | | uint8_t* recons, |
182 | | const SearchParameters* params = nullptr) const override; |
183 | | |
184 | | /** Reconstruct a vector given the location in terms of (inv list index + |
185 | | * inv list offset) instead of the id. |
186 | | * |
187 | | * Useful for reconstructing when the direct_map is not maintained and |
188 | | * the inv list offset is computed by search_preassigned() with |
189 | | * `store_pairs` set. |
190 | | */ |
191 | | virtual void reconstruct_from_offset( |
192 | | idx_t list_no, |
193 | | idx_t offset, |
194 | | uint8_t* recons) const; |
195 | | |
196 | | /// Dataset manipulation functions |
197 | | size_t remove_ids(const IDSelector& sel) override; |
198 | | |
199 | | void merge_from(IndexBinary& other, idx_t add_id) override; |
200 | | |
201 | | void check_compatible_for_merge( |
202 | | const IndexBinary& otherIndex) const override; |
203 | | |
204 | 0 | size_t get_list_size(size_t list_no) const { |
205 | 0 | return invlists->list_size(list_no); |
206 | 0 | } |
207 | | |
208 | | /** initialize a direct map |
209 | | * |
210 | | * @param new_maintain_direct_map if true, create a direct map, |
211 | | * else clear it |
212 | | */ |
213 | | void make_direct_map(bool new_maintain_direct_map = true); |
214 | | |
215 | | void set_direct_map_type(DirectMap::Type type); |
216 | | |
217 | | void replace_invlists(InvertedLists* il, bool own = false); |
218 | | }; |
219 | | |
220 | | struct BinaryInvertedListScanner { |
221 | | /// from now on we handle this query. |
222 | | virtual void set_query(const uint8_t* query_vector) = 0; |
223 | | |
224 | | /// following codes come from this inverted list |
225 | | virtual void set_list(idx_t list_no, uint8_t coarse_dis) = 0; |
226 | | |
227 | | /// compute a single query-to-code distance |
228 | | virtual uint32_t distance_to_code(const uint8_t* code) const = 0; |
229 | | |
230 | | /** compute the distances to codes. (distances, labels) should be |
231 | | * organized as a min- or max-heap |
232 | | * |
233 | | * @param n number of codes to scan |
234 | | * @param codes codes to scan (n * code_size) |
235 | | * @param ids corresponding ids (ignored if store_pairs) |
236 | | * @param distances heap distances (size k) |
237 | | * @param labels heap labels (size k) |
238 | | * @param k heap size |
239 | | */ |
240 | | virtual size_t scan_codes( |
241 | | size_t n, |
242 | | const uint8_t* codes, |
243 | | const idx_t* ids, |
244 | | int32_t* distances, |
245 | | idx_t* labels, |
246 | | size_t k) const = 0; |
247 | | |
248 | | virtual void scan_codes_range( |
249 | | size_t n, |
250 | | const uint8_t* codes, |
251 | | const idx_t* ids, |
252 | | int radius, |
253 | | RangeQueryResult& result) const = 0; |
254 | | |
255 | 0 | virtual ~BinaryInvertedListScanner() {} |
256 | | }; |
257 | | |
258 | | } // namespace faiss |
259 | | |
260 | | #endif // FAISS_INDEX_BINARY_IVF_H |