/root/doris/contrib/faiss/faiss/impl/PolysemousTraining.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | // -*- c++ -*- |
9 | | |
10 | | #ifndef FAISS_POLYSEMOUS_TRAINING_INCLUDED |
11 | | #define FAISS_POLYSEMOUS_TRAINING_INCLUDED |
12 | | |
13 | | #include <faiss/impl/ProductQuantizer.h> |
14 | | |
15 | | namespace faiss { |
16 | | |
/// parameters used for the simulated annealing method
struct SimulatedAnnealingParameters {
    // optimization parameters
    double init_temperature = 0.7; // init probability of accepting a bad swap
    // at each iteration the temp is multiplied by this
    double temperature_decay = 0.9997893011688015; // = 0.9^(1/500)
    int n_iter = 500000; // nb of iterations
    int n_redo = 2; // nb of runs of the simulation
    int seed = 123; // random seed
    int verbose = 0; // verbosity level (0 = silent)
    bool only_bit_flips = false; // restrict permutation changes to bit flips
    bool init_random =
            false; // initialize with a random permutation (not identity)

    // set reasonable defaults (all values above come from the in-class
    // member initializers; the constructor body is intentionally empty)
    SimulatedAnnealingParameters() {}
};
34 | | |
/// abstract class for the loss function
struct PermutationObjective {
    // size of the permutation
    // NOTE(review): left uninitialized here; presumably set by subclass
    // constructors (e.g. ReproduceDistancesObjective takes n) — confirm
    int n;

    /// cost of a given permutation (array of size n); lower is better
    virtual double compute_cost(const int* perm) const = 0;

    // what would the cost update be if iw and jw were swapped?
    // default implementation just computes both and computes the difference
    virtual double cost_update(const int* perm, int iw, int jw) const;

    virtual ~PermutationObjective() {}
};
47 | | |
/// objective: permute codes so that Hamming distances between permuted
/// indices reproduce a set of target distances
struct ReproduceDistancesObjective : PermutationObjective {
    double dis_weight_factor; ///< decay factor used by dis_weight()

    /// small helper: x squared
    static double sqr(double x) {
        return x * x;
    }

    // weighting of distances: it is more important to reproduce small
    // distances well
    double dis_weight(double x) const;

    std::vector<double> source_dis; ///< "real" corrected distances (size n^2)
    const double* target_dis; ///< wanted distances (size n^2); non-owning —
                              ///< NOTE(review): caller must keep it alive,
                              ///< presumably for the objective's lifetime
    std::vector<double> weights;    ///< weights for each distance (size n^2)

    /// lookup into source_dis, viewed as an n x n matrix
    double get_source_dis(int i, int j) const;

    // cost = quadratic difference between actual distance and Hamming distance
    double compute_cost(const int* perm) const override;

    // what would the cost update be if iw and jw were swapped?
    // computed in O(n) instead of O(n^2) for the full re-computation
    double cost_update(const int* perm, int iw, int jw) const override;

    /// @param n                 size of the permutation
    /// @param source_dis_in     input distances, size n^2
    /// @param target_dis_in     distances to reproduce, size n^2 (borrowed)
    /// @param dis_weight_factor decay of the distance-weighting exponential
    ReproduceDistancesObjective(
            int n,
            const double* source_dis_in,
            const double* target_dis_in,
            double dis_weight_factor);

    /// compute mean and standard deviation of the n2 values in tab
    static void compute_mean_stdev(
            const double* tab,
            size_t n2,
            double* mean_out,
            double* stddev_out);

    /// set source_dis from source_dis_in, affinely remapped to match the
    /// statistics of target_dis
    void set_affine_target_dis(const double* source_dis_in);

    ~ReproduceDistancesObjective() override {}
};
88 | | |
struct RandomGenerator;

/// Simulated annealing optimization algorithm for permutations.
struct SimulatedAnnealingOptimizer : SimulatedAnnealingParameters {
    /// objective to minimize — NOTE(review): raw pointer, ownership not
    /// visible in this header; presumably borrowed from the caller — confirm
    PermutationObjective* obj;
    int n;         ///< size of the permutation
    FILE* logfile; ///< logs values of the cost function (may be null)

    /// @param obj objective to optimize
    /// @param p   annealing parameters, copied into this object
    SimulatedAnnealingOptimizer(
            PermutationObjective* obj,
            const SimulatedAnnealingParameters& p);
    /// random generator used for the annealing moves
    /// NOTE(review): raw pointer; destructor is defined out-of-line, so
    /// ownership cannot be confirmed from this header
    RandomGenerator* rnd;

    /// remember initial cost of optimization
    double init_cost;

    // main entry point. Perform the optimization loop, starting from
    // and modifying permutation in-place
    double optimize(int* perm);

    // run the optimization and return the best result in best_perm
    double run_optimization(int* best_perm);

    virtual ~SimulatedAnnealingOptimizer();
};
114 | | |
/// optimizes the order of indices in a ProductQuantizer
struct PolysemousTraining : SimulatedAnnealingParameters {
    /// which loss function the permutation optimization minimizes
    enum Optimization_type_t {
        OT_None,                      ///< do not optimize
        OT_ReproduceDistances_affine, ///< default
        OT_Ranking_weighted_diff ///< same as _2, but use rank of y+ - rank of
                                 ///< y-
    };
    Optimization_type_t optimization_type;

    /** use 1/4 of the training points for the optimization, with
     * max. ntrain_permutation. If ntrain_permutation == 0: train on
     * centroids */
    int ntrain_permutation;
    double dis_weight_factor; ///< decay of exp that weights distance loss

    /// refuse to train if it would require more than that amount of RAM
    size_t max_memory;

    // filename pattern for the logging of iterations
    std::string log_pattern;

    // sets default values
    PolysemousTraining();

    /// reorder the centroids so that the Hamming distance becomes a
    /// good approximation of the SDC distance (called by train)
    /// @param pq quantizer to optimize (modified in place)
    /// @param n  number of training vectors in x
    /// @param x  training vectors, size n * pq.d
    void optimize_pq_for_hamming(ProductQuantizer& pq, size_t n, const float* x)
            const;

    /// called by optimize_pq_for_hamming
    void optimize_ranking(ProductQuantizer& pq, size_t n, const float* x) const;
    /// called by optimize_pq_for_hamming
    void optimize_reproduce_distances(ProductQuantizer& pq) const;

    /// make sure we don't blow up the memory
    /// @return estimated RAM usage per training thread, in bytes
    size_t memory_usage_per_thread(const ProductQuantizer& pq) const;
};
153 | | |
154 | | } // namespace faiss |
155 | | |
156 | | #endif |