/root/doris/contrib/faiss/faiss/Clustering.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | /** Implementation of k-means clustering with many variants. */ |
9 | | |
10 | | #ifndef FAISS_CLUSTERING_H |
11 | | #define FAISS_CLUSTERING_H |
12 | | #include <faiss/Index.h> |
13 | | |
14 | | #include <vector> |
15 | | |
16 | | namespace faiss { |
17 | | |
18 | | /** Class for the clustering parameters. Can be passed to the |
19 | | * constructor of the Clustering object. |
20 | | */ |
21 | | struct ClusteringParameters { |
22 | | /// number of clustering iterations |
23 | | int niter = 25; |
24 | | /// redo clustering this many times and keep the clusters with the best |
25 | | /// objective |
26 | | int nredo = 1; |
27 | | |
28 | | bool verbose = false; |
29 | | /// whether to normalize centroids after each iteration (useful for inner |
30 | | /// product clustering) |
31 | | bool spherical = false; |
32 | | /// round centroids coordinates to integer after each iteration? |
33 | | bool int_centroids = false; |
34 | | /// re-train index after each iteration? |
35 | | bool update_index = false; |
36 | | |
37 | | /// Use the subset of centroids provided as input and do not change them |
38 | | /// during iterations |
39 | | bool frozen_centroids = false; |
40 | | /// If fewer than this number of training vectors per centroid are provided, |
41 | | /// writes a warning. Note that fewer than 1 point per centroid raises an |
42 | | /// exception. |
43 | | int min_points_per_centroid = 39; |
44 |       |     /// above this number of training vectors per centroid, the training set is subsampled |
45 | | int max_points_per_centroid = 256; |
46 | | /// seed for the random number generator. |
47 | | /// negative values lead to seeding an internal rng with |
48 |       |     /// std::chrono::high_resolution_clock. |
49 | | int seed = 1234; |
50 | | |
51 | | /// when the training set is encoded, batch size of the codec decoder |
52 | | size_t decode_block_size = 32768; |
53 | | |
54 | | /// whether to check for NaNs in an input data |
55 | | bool check_input_data_for_NaNs = true; |
56 | | |
57 |       |     /// Whether to use a splitmix64-based random number generator for subsampling, |
58 | | /// which is faster, but may pick duplicate points. |
59 | | bool use_faster_subsampling = false; |
60 | | }; |
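
// Usage sketch (added illustration, not part of the upstream header): the
// parameter struct is plain data, so fields are simply assigned before being
// passed to a Clustering constructor; the values below are arbitrary examples.
//
//   faiss::ClusteringParameters cp;
//   cp.niter = 40;          // more k-means iterations than the default 25
//   cp.nredo = 3;           // keep the best of 3 clustering runs
//   cp.spherical = true;    // L2-normalize centroids (inner-product setups)
//   cp.seed = 42;           // fixed seed for reproducible subsampling
//   faiss::Clustering clus(d, k, cp);  // d = vector dim, k = nb of centroids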
61 | | |
62 | | struct ClusteringIterationStats { |
63 |       |     float obj; ///< objective value (sum of distances reported by the index) |
64 | | double time; ///< seconds for iteration |
65 | | double time_search; ///< seconds for just search |
66 | | double imbalance_factor; ///< imbalance factor of iteration |
67 | | int nsplit; ///< number of cluster splits |
68 | | }; |
69 | | |
70 | | /** K-means clustering based on assignment - centroid update iterations |
71 | | * |
72 | | * The clustering is based on an Index object that assigns training |
73 | | * points to the centroids. Therefore, at each iteration the centroids |
74 | | * are added to the index. |
75 | | * |
76 |       |  * On output, the centroids table is set to the latest version |
77 |       |  * of the centroids and they are also added to the index. If the |
78 |       |  * centroids table is not empty on input, it is also used for |
79 | | * initialization. |
80 | | * |
81 | | */ |
82 | | struct Clustering : ClusteringParameters { |
83 | | size_t d; ///< dimension of the vectors |
84 | | size_t k; ///< nb of centroids |
85 | | |
86 | | /** centroids (k * d) |
87 | | * if centroids are set on input to train, they will be used as |
88 | | * initialization |
89 | | */ |
90 | | std::vector<float> centroids; |
91 | | |
92 | | /// stats at every iteration of clustering |
93 | | std::vector<ClusteringIterationStats> iteration_stats; |
94 | | |
95 | | Clustering(int d, int k); |
96 | | Clustering(int d, int k, const ClusteringParameters& cp); |
97 | | |
98 | | /** run k-means training |
99 | | * |
100 | | * @param x training vectors, size n * d |
101 | | * @param index index used for assignment |
102 |       |      * @param x_weights  weight associated with each vector: NULL or size n |
103 | | */ |
104 | | virtual void train( |
105 | | idx_t n, |
106 | | const float* x, |
107 | | faiss::Index& index, |
108 | | const float* x_weights = nullptr); |
109 | | |
110 | | /** run with encoded vectors |
111 | | * |
112 |       |      * in addition to train()'s parameters, takes a codec as parameter |
113 | | * to decode the input vectors. |
114 | | * |
115 | | * @param codec codec used to decode the vectors (nullptr = |
116 | | * vectors are in fact floats) |
117 | | */ |
118 | | void train_encoded( |
119 | | idx_t nx, |
120 | | const uint8_t* x_in, |
121 | | const Index* codec, |
122 | | Index& index, |
123 | | const float* weights = nullptr); |
124 | | |
125 | | /// Post-process the centroids after each centroid update. |
126 |       |     /// Includes optional L2 normalization and rounding to the nearest integer. |
127 | | void post_process_centroids(); |
128 | | |
129 | 0 | virtual ~Clustering() {} |
130 | | }; |
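
// Training sketch (added illustration, not part of the upstream header):
// k-means driven with an IndexFlatL2 as the assignment index. IndexFlatL2
// comes from <faiss/IndexFlat.h>; `xb` is assumed to point to n * d floats.
//
//   faiss::Clustering clus(d, k);
//   clus.verbose = true;
//   faiss::IndexFlatL2 assign_index(d);
//   clus.train(n, xb, assign_index);
//   // clus.centroids now holds k * d floats; the final objective is
//   // clus.iteration_stats.back().obj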
131 | | |
132 | | /** Exact 1D clustering algorithm |
133 | | * |
134 | | * Since it does not use an index, it does not overload the train() function |
135 | | */ |
136 | | struct Clustering1D : Clustering { |
137 | | explicit Clustering1D(int k); |
138 | | |
139 | | Clustering1D(int k, const ClusteringParameters& cp); |
140 | | |
141 | | void train_exact(idx_t n, const float* x); |
142 | | |
143 | 0 | virtual ~Clustering1D() {} |
144 | | }; |
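
// Sketch (added illustration): exact 1D clustering goes through train_exact()
// rather than train(); `vals` is assumed to hold n floats.
//
//   faiss::Clustering1D clus1d(k);
//   clus1d.train_exact(n, vals);
//   // clus1d.centroids holds the k centroids (d = 1)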
145 | | |
146 | | struct ProgressiveDimClusteringParameters : ClusteringParameters { |
147 | | int progressive_dim_steps; ///< number of incremental steps |
148 | | bool apply_pca; ///< apply PCA on input |
149 | | |
150 | | ProgressiveDimClusteringParameters(); |
151 | | }; |
152 | | |
153 | | /** generates an index suitable for clustering when called */ |
154 | | struct ProgressiveDimIndexFactory { |
155 | | /// ownership transferred to caller |
156 | | virtual Index* operator()(int dim); |
157 | | |
158 | 0 | virtual ~ProgressiveDimIndexFactory() {} |
159 | | }; |
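
// Sketch (added illustration): a custom factory derives from
// ProgressiveDimIndexFactory and overrides operator() to choose the assignment
// index; IndexHNSWFlat from <faiss/IndexHNSW.h> is only an example choice.
//
//   struct HNSWAssignFactory : faiss::ProgressiveDimIndexFactory {
//       faiss::Index* operator()(int dim) override {
//           return new faiss::IndexHNSWFlat(dim, 32);  // caller takes ownership
//       }
//   };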
160 | | |
161 | | /** K-means clustering with progressive dimensions used |
162 | | * |
163 | | * The clustering first happens in dim 1, then with exponentially increasing |
164 |       |  * dimension until d (progressive_dim_steps steps). This is typically applied after a PCA |
165 | | * transformation (optional). Reference: |
166 | | * |
167 | | * "Improved Residual Vector Quantization for High-dimensional Approximate |
168 | | * Nearest Neighbor Search" |
169 | | * |
170 | | * Shicong Liu, Hongtao Lu, Junru Shao, AAAI'15 |
171 | | * |
172 | | * https://arxiv.org/abs/1509.05195 |
173 | | */ |
174 | | struct ProgressiveDimClustering : ProgressiveDimClusteringParameters { |
175 | | size_t d; ///< dimension of the vectors |
176 | | size_t k; ///< nb of centroids |
177 | | |
178 | | /** centroids (k * d) */ |
179 | | std::vector<float> centroids; |
180 | | |
181 | | /// stats at every iteration of clustering |
182 | | std::vector<ClusteringIterationStats> iteration_stats; |
183 | | |
184 | | ProgressiveDimClustering(int d, int k); |
185 | | ProgressiveDimClustering( |
186 | | int d, |
187 | | int k, |
188 | | const ProgressiveDimClusteringParameters& cp); |
189 | | |
190 | | void train(idx_t n, const float* x, ProgressiveDimIndexFactory& factory); |
191 | | |
192 | 0 | virtual ~ProgressiveDimClustering() {} |
193 | | }; |
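
// Sketch (added illustration): progressive-dimension k-means builds its
// assignment indexes through a factory; the stock ProgressiveDimIndexFactory
// can be passed directly. `xb` is assumed to point to n * d floats.
//
//   faiss::ProgressiveDimClustering clus(d, k);
//   clus.progressive_dim_steps = 8;   // nb of incremental dimension steps
//   faiss::ProgressiveDimIndexFactory fac;
//   clus.train(n, xb, fac);
//   // clus.centroids holds k * d floats, as for plain Clustering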
194 | | |
195 | | /** simplified interface |
196 | | * |
197 | | * @param d dimension of the data |
198 | | * @param n nb of training vectors |
199 | | * @param k nb of output centroids |
200 | | * @param x training set (size n * d) |
201 | | * @param centroids output centroids (size k * d) |
202 | | * @return final quantization error |
203 | | */ |
204 | | float kmeans_clustering( |
205 | | size_t d, |
206 | | size_t n, |
207 | | size_t k, |
208 | | const float* x, |
209 | | float* centroids); |
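
// Sketch (added illustration): the simplified interface works on raw buffers
// and the caller owns the centroid storage; `xb` is assumed to hold n * d
// training floats.
//
//   std::vector<float> centroids(k * d);
//   float err = faiss::kmeans_clustering(d, n, k, xb, centroids.data());
//   // err is the final quantization error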
210 | | |
211 | | } // namespace faiss |
212 | | |
213 | | #endif |