contrib/faiss/faiss/invlists/InvertedLists.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | // -*- c++ -*- |
9 | | |
10 | | #ifndef FAISS_INVERTEDLISTS_IVF_H |
11 | | #define FAISS_INVERTEDLISTS_IVF_H |
12 | | |
13 | | /** |
14 | | * Definition of inverted lists + a few common classes that implement |
15 | | * the interface. |
16 | | */ |
17 | | |
18 | | #include <vector> |
19 | | |
20 | | #include <faiss/MetricType.h> |
21 | | #include <faiss/impl/maybe_owned_vector.h> |
22 | | |
23 | | namespace faiss { |
24 | | |
/** Abstract iterator over the entries of one inverted list.
 *
 * Returned by InvertedLists::get_iterator; the caller owns the iterator
 * and must delete it. Usage pattern: while (it->is_available()) {
 * use it->get_id_and_codes(); it->next(); } */
struct InvertedListsIterator {
    virtual ~InvertedListsIterator();
    /// @return true if the iterator points to a valid (id, codes) entry
    virtual bool is_available() const = 0;
    /// advance to the next entry
    virtual void next() = 0;
    /// @return id and pointer to the codes of the current entry
    virtual std::pair<idx_t, const uint8_t*> get_id_and_codes() = 0;
};
31 | | |
/** Table of inverted lists.
 *
 * This is the abstract interface implemented by all inverted-list storage
 * backends (in-memory arrays, on-disk lists, stacked/sliced views, ...).
 *
 * Multithreading rules:
 * - concurrent read accesses are allowed
 * - concurrent update accesses are allowed
 * - for resize and add_entries, only concurrent accesses to *different*
 *   lists are allowed
 */
struct InvertedLists {
    size_t nlist;     ///< number of possible key values
    size_t code_size; ///< code size per vector in bytes

    /// request to use iterator rather than get_codes / get_ids
    bool use_iterator = false;

    InvertedLists(size_t nlist, size_t code_size);

    virtual ~InvertedLists();

    /// used for BlockInvertedLists, where the codes are packed into groups
    /// and the individual code size is meaningless
    static const size_t INVALID_CODE_SIZE = static_cast<size_t>(-1);

    /*************************
     * Read only functions */

    /// get the size (number of vectors) of a list
    virtual size_t list_size(size_t list_no) const = 0;

    /** get the codes for an inverted list
     * must be released by release_codes
     *
     * @return codes    size list_size * code_size
     */
    virtual const uint8_t* get_codes(size_t list_no) const = 0;

    /** get the ids for an inverted list
     * must be released by release_ids
     *
     * @return ids      size list_size
     */
    virtual const idx_t* get_ids(size_t list_no) const = 0;

    /// release codes returned by get_codes (default implementation is a nop)
    virtual void release_codes(size_t list_no, const uint8_t* codes) const;

    /// release ids returned by get_ids (default implementation is a nop)
    virtual void release_ids(size_t list_no, const idx_t* ids) const;

    /// @return a single id in an inverted list
    virtual idx_t get_single_id(size_t list_no, size_t offset) const;

    /// @return a single code in an inverted list
    /// (should be deallocated with release_codes)
    virtual const uint8_t* get_single_code(size_t list_no, size_t offset) const;

    /// prepare the following lists (default does nothing)
    /// a list can be -1 hence the signed idx_t
    virtual void prefetch_lists(const idx_t* list_nos, int nlist) const;

    /*****************************************
     * Iterator interface (with context) */

    /// check if the list is empty
    virtual bool is_empty(size_t list_no, void* inverted_list_context = nullptr)
            const;

    /// get iterable for lists that use_iterator
    /// (caller owns and must delete the returned iterator)
    virtual InvertedListsIterator* get_iterator(
            size_t list_no,
            void* inverted_list_context = nullptr) const;

    /*************************
     * writing functions */

    /// add one entry to an inverted list
    /// @return offset at which the entry was stored
    virtual size_t add_entry(
            size_t list_no,
            idx_t theid,
            const uint8_t* code,
            void* inverted_list_context = nullptr);

    /// add n_entry (id, code) pairs to list list_no
    /// @return offset of the first added entry
    virtual size_t add_entries(
            size_t list_no,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) = 0;

    /// overwrite the entry at (list_no, offset)
    virtual void update_entry(
            size_t list_no,
            size_t offset,
            idx_t id,
            const uint8_t* code);

    /// overwrite n_entry entries starting at (list_no, offset)
    virtual void update_entries(
            size_t list_no,
            size_t offset,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) = 0;

    /// resize list list_no to new_size entries
    virtual void resize(size_t list_no, size_t new_size) = 0;

    /// empty all the lists
    virtual void reset();

    /*************************
     * high level functions */

    /// move all entries from oivf (empty on output),
    /// adding add_id to the stored ids
    void merge_from(InvertedLists* oivf, size_t add_id);

    // how to copy a subset of elements from the inverted lists
    // This depends on two integers, a1 and a2.
    enum subset_type_t : int {
        // depends on IDs
        SUBSET_TYPE_ID_RANGE = 0, // copies ids in [a1, a2)
        SUBSET_TYPE_ID_MOD = 1,   // copies ids if id % a1 == a2
        // depends on order within invlists
        SUBSET_TYPE_ELEMENT_RANGE =
                2, // copies fractions of invlists so that a1 elements are left
                   // before and a2 after
        SUBSET_TYPE_INVLIST_FRACTION =
                3, // take fraction a2 out of a1 from each invlist, 0 <= a2 < a1
        // copy only inverted lists a1:a2
        SUBSET_TYPE_INVLIST = 4
    };

    /** copy a subset of the entries index to the other index
     * @return number of entries copied
     */
    size_t copy_subset_to(
            InvertedLists& other,
            subset_type_t subset_type,
            idx_t a1,
            idx_t a2) const;

    /*************************
     * statistics */

    /// 1= perfectly balanced, >1: imbalanced
    double imbalance_factor() const;

    /// display some stats about the inverted lists
    void print_stats() const;

    /// sum up list sizes
    size_t compute_ntotal() const;

    /**************************************
     * Scoped inverted lists (for automatic deallocation)
     *
     * instead of writing:
     *
     *     uint8_t * codes = invlists->get_codes (10);
     *     ... use codes
     *     invlists->release_codes(10, codes)
     *
     * write:
     *
     *     ScopedCodes codes (invlists, 10);
     *     ... use codes.get()
     *     // release called automatically when codes goes out of scope
     *
     * the following function call also works:
     *
     *     foo (123, ScopedCodes (invlists, 10).get(), 456);
     *
     */

    /// RAII holder for the ids pointer of one list: calls release_ids
    /// in its destructor.
    /// NOTE(review): no deleted copy ctor/assignment — copying one of
    /// these would call release_ids twice on the same pointer; avoid
    /// copies (Rule of Five).
    struct ScopedIds {
        const InvertedLists* il; ///< non-owning back-pointer
        const idx_t* ids;        ///< the borrowed ids array
        size_t list_no;          ///< list the ids were taken from

        ScopedIds(const InvertedLists* il, size_t list_no)
                : il(il), ids(il->get_ids(list_no)), list_no(list_no) {}

        /// @return the borrowed ids pointer (valid while *this lives)
        const idx_t* get() {
            return ids;
        }

        /// convenience access to the i-th id of the list
        idx_t operator[](size_t i) const {
            return ids[i];
        }

        ~ScopedIds() {
            il->release_ids(list_no, ids);
        }
    };

    /// RAII holder for the codes pointer of one list (or of a single
    /// code at a given offset): calls release_codes in its destructor.
    /// NOTE(review): same copyability caveat as ScopedIds.
    struct ScopedCodes {
        const InvertedLists* il; ///< non-owning back-pointer
        const uint8_t* codes;    ///< the borrowed codes array
        size_t list_no;          ///< list the codes were taken from

        ScopedCodes(const InvertedLists* il, size_t list_no)
                : il(il), codes(il->get_codes(list_no)), list_no(list_no) {}

        /// borrow only the single code at offset within list_no
        ScopedCodes(const InvertedLists* il, size_t list_no, size_t offset)
                : il(il),
                  codes(il->get_single_code(list_no, offset)),
                  list_no(list_no) {}

        /// @return the borrowed codes pointer (valid while *this lives)
        const uint8_t* get() {
            return codes;
        }

        ~ScopedCodes() {
            il->release_codes(list_no, codes);
        }
    };
};
243 | | |
/// simple (default) implementation as an array of inverted lists,
/// fully resident in RAM
struct ArrayInvertedLists : InvertedLists {
    std::vector<MaybeOwnedVector<uint8_t>> codes; ///< binary codes, size nlist
    std::vector<MaybeOwnedVector<idx_t>> ids;     ///< Inverted lists for indexes

    ArrayInvertedLists(size_t nlist, size_t code_size);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    size_t add_entries(
            size_t list_no,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) override;

    void update_entries(
            size_t list_no,
            size_t offset,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) override;

    void resize(size_t list_no, size_t new_size) override;

    /// permute the inverted lists, map maps new_id to old_id
    void permute_invlists(const idx_t* map);

    /// the inverted_list_context is ignored by this implementation
    bool is_empty(size_t list_no, void* inverted_list_context = nullptr)
            const override;

    ~ArrayInvertedLists() override;
};
278 | | |
279 | | /***************************************************************** |
280 | | * Meta-inverted lists |
281 | | * |
282 | | * About terminology: the inverted lists are seen as a sparse matrix, |
283 | | * that can be stacked horizontally, vertically and sliced. |
284 | | *****************************************************************/ |
285 | | |
/// invlists that fail (throw) for all write functions; base class for
/// the read-only meta-invlists below
struct ReadOnlyInvertedLists : InvertedLists {
    ReadOnlyInvertedLists(size_t nlist, size_t code_size)
            : InvertedLists(nlist, code_size) {}

    /// always fails: the lists are read-only
    size_t add_entries(
            size_t list_no,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) override;

    /// always fails: the lists are read-only
    void update_entries(
            size_t list_no,
            size_t offset,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) override;

    /// always fails: the lists are read-only
    void resize(size_t list_no, size_t new_size) override;
};
306 | | |
/// Horizontal stack of inverted lists: list #i is the concatenation of
/// list #i of every stacked invlists (non-owning view)
struct HStackInvertedLists : ReadOnlyInvertedLists {
    std::vector<const InvertedLists*> ils; ///< borrowed, not owned

    /// build InvertedLists by concatenating nil of them
    HStackInvertedLists(int nil, const InvertedLists** ils);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    void prefetch_lists(const idx_t* list_nos, int nlist) const override;

    void release_codes(size_t list_no, const uint8_t* codes) const override;
    void release_ids(size_t list_no, const idx_t* ids) const override;

    idx_t get_single_id(size_t list_no, size_t offset) const override;

    const uint8_t* get_single_code(size_t list_no, size_t offset)
            const override;
};

/// legacy name kept for backward compatibility
using ConcatenatedInvertedLists = HStackInvertedLists;
330 | | |
/// vertical slice of indexes in another InvertedLists: exposes lists
/// [i0, i1) of il as lists [0, i1 - i0) (non-owning view)
struct SliceInvertedLists : ReadOnlyInvertedLists {
    const InvertedLists* il; ///< borrowed, not owned
    idx_t i0, i1;            ///< half-open slice [i0, i1) of il's lists

    SliceInvertedLists(const InvertedLists* il, idx_t i0, idx_t i1);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    void release_codes(size_t list_no, const uint8_t* codes) const override;
    void release_ids(size_t list_no, const idx_t* ids) const override;

    idx_t get_single_id(size_t list_no, size_t offset) const override;

    const uint8_t* get_single_code(size_t list_no, size_t offset)
            const override;

    void prefetch_lists(const idx_t* list_nos, int nlist) const override;
};
352 | | |
/// Vertical stack of inverted lists: the lists of the stacked invlists
/// are laid out one after the other (non-owning view)
struct VStackInvertedLists : ReadOnlyInvertedLists {
    std::vector<const InvertedLists*> ils; ///< borrowed, not owned
    std::vector<idx_t> cumsz;              ///< cumulative list counts, to map
                                           ///< a global list_no to (ils, list)

    /// build InvertedLists by concatenating nil of them
    VStackInvertedLists(int nil, const InvertedLists** ils);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    void release_codes(size_t list_no, const uint8_t* codes) const override;
    void release_ids(size_t list_no, const idx_t* ids) const override;

    idx_t get_single_id(size_t list_no, size_t offset) const override;

    const uint8_t* get_single_code(size_t list_no, size_t offset)
            const override;

    void prefetch_lists(const idx_t* list_nos, int nlist) const override;
};
374 | | |
/** use the first inverted lists if they are non-empty otherwise use the second
 *
 * This is useful if il1 has a few inverted lists that are too long,
 * and that il0 has replacement lists for those, with empty lists for
 * the others. */
struct MaskedInvertedLists : ReadOnlyInvertedLists {
    const InvertedLists* il0; ///< takes priority when its list is non-empty
    const InvertedLists* il1; ///< fallback; both borrowed, not owned

    MaskedInvertedLists(const InvertedLists* il0, const InvertedLists* il1);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    void release_codes(size_t list_no, const uint8_t* codes) const override;
    void release_ids(size_t list_no, const idx_t* ids) const override;

    idx_t get_single_id(size_t list_no, size_t offset) const override;

    const uint8_t* get_single_code(size_t list_no, size_t offset)
            const override;

    void prefetch_lists(const idx_t* list_nos, int nlist) const override;
};
400 | | |
/** if the inverted list in il is smaller than maxsize then return it,
 * otherwise return an empty invlist (filters out "stop word" lists) */
struct StopWordsInvertedLists : ReadOnlyInvertedLists {
    const InvertedLists* il0; ///< borrowed, not owned
    size_t maxsize;           ///< lists of this size or larger appear empty

    StopWordsInvertedLists(const InvertedLists* il, size_t maxsize);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    void release_codes(size_t list_no, const uint8_t* codes) const override;
    void release_ids(size_t list_no, const idx_t* ids) const override;

    idx_t get_single_id(size_t list_no, size_t offset) const override;

    const uint8_t* get_single_code(size_t list_no, size_t offset)
            const override;

    void prefetch_lists(const idx_t* list_nos, int nlist) const override;
};
423 | | |
424 | | } // namespace faiss |
425 | | |
426 | | #endif |