Coverage Report

Created: 2025-11-14 03:48

/root/doris/contrib/faiss/faiss/impl/PolysemousTraining.h
  Line|  Count|Source
     1|       |/*
     2|       | * Copyright (c) Meta Platforms, Inc. and affiliates.
     3|       | *
     4|       | * This source code is licensed under the MIT license found in the
     5|       | * LICENSE file in the root directory of this source tree.
     6|       | */
     7|       |
     8|       |// -*- c++ -*-
     9|       |
    10|       |#ifndef FAISS_POLYSEMOUS_TRAINING_INCLUDED
    11|       |#define FAISS_POLYSEMOUS_TRAINING_INCLUDED
    12|       |
    13|       |#include <faiss/impl/ProductQuantizer.h>
    14|       |
    15|       |namespace faiss {
    16|       |
    17|       |/// parameters used for the simulated annealing method
    18|       |struct SimulatedAnnealingParameters {
    19|       |    // optimization parameters
    20|       |    double init_temperature = 0.7; // init probability of accepting a bad swap
    21|       |    // at each iteration the temp is multiplied by this
    22|       |    double temperature_decay = 0.9997893011688015; // = 0.9^(1/500)
    23|       |    int n_iter = 500000;                           // nb of iterations
    24|       |    int n_redo = 2; // nb of runs of the simulation
    25|       |    int seed = 123; // random seed
    26|       |    int verbose = 0;
    27|       |    bool only_bit_flips = false; // restrict permutation changes to bit flips
    28|       |    bool init_random =
    29|       |            false; // initialize with a random permutation (not identity)
    30|       |
    31|       |    // set reasonable defaults
    32|      0|    SimulatedAnnealingParameters() {}
    33|       |};
    34|       |
    35|       |/// abstract class for the loss function
    36|       |struct PermutationObjective {
    37|       |    int n;
    38|       |
    39|       |    virtual double compute_cost(const int* perm) const = 0;
    40|       |
    41|       |    // what would the cost update be if iw and jw were swapped?
    42|       |    // default implementation just computes both and computes the difference
    43|       |    virtual double cost_update(const int* perm, int iw, int jw) const;
    44|       |
    45|      0|    virtual ~PermutationObjective() {}
    46|       |};
    47|       |
    48|       |struct ReproduceDistancesObjective : PermutationObjective {
    49|       |    double dis_weight_factor;
    50|       |
    51|      0|    static double sqr(double x) {
    52|      0|        return x * x;
    53|      0|    }
    54|       |
    55|       |    // weighting of distances: it is more important to reproduce small
    56|       |    // distances well
    57|       |    double dis_weight(double x) const;
    58|       |
    59|       |    std::vector<double> source_dis; ///< "real" corrected distances (size n^2)
    60|       |    const double* target_dis;       ///< wanted distances (size n^2)
    61|       |    std::vector<double> weights;    ///< weights for each distance (size n^2)
    62|       |
    63|       |    double get_source_dis(int i, int j) const;
    64|       |
    65|       |    // cost = quadratic difference between actual distance and Hamming distance
    66|       |    double compute_cost(const int* perm) const override;
    67|       |
    68|       |    // what would the cost update be if iw and jw were swapped?
    69|       |    // computed in O(n) instead of O(n^2) for the full re-computation
    70|       |    double cost_update(const int* perm, int iw, int jw) const override;
    71|       |
    72|       |    ReproduceDistancesObjective(
    73|       |            int n,
    74|       |            const double* source_dis_in,
    75|       |            const double* target_dis_in,
    76|       |            double dis_weight_factor);
    77|       |
    78|       |    static void compute_mean_stdev(
    79|       |            const double* tab,
    80|       |            size_t n2,
    81|       |            double* mean_out,
    82|       |            double* stddev_out);
    83|       |
    84|       |    void set_affine_target_dis(const double* source_dis_in);
    85|       |
    86|      0|    ~ReproduceDistancesObjective() override {}
    87|       |};
    88|       |
    89|       |struct RandomGenerator;
    90|       |
    91|       |/// Simulated annealing optimization algorithm for permutations.
    92|       |struct SimulatedAnnealingOptimizer : SimulatedAnnealingParameters {
    93|       |    PermutationObjective* obj;
    94|       |    int n;         ///< size of the permutation
    95|       |    FILE* logfile; /// logs values of the cost function
    96|       |
    97|       |    SimulatedAnnealingOptimizer(
    98|       |            PermutationObjective* obj,
    99|       |            const SimulatedAnnealingParameters& p);
   100|       |    RandomGenerator* rnd;
   101|       |
   102|       |    /// remember initial cost of optimization
   103|       |    double init_cost;
   104|       |
   105|       |    // main entry point. Perform the optimization loop, starting from
   106|       |    // and modifying permutation in-place
   107|       |    double optimize(int* perm);
   108|       |
   109|       |    // run the optimization and return the best result in best_perm
   110|       |    double run_optimization(int* best_perm);
   111|       |
   112|       |    virtual ~SimulatedAnnealingOptimizer();
   113|       |};
   114|       |
   115|       |/// optimizes the order of indices in a ProductQuantizer
   116|       |struct PolysemousTraining : SimulatedAnnealingParameters {
   117|       |    enum Optimization_type_t {
   118|       |        OT_None,
   119|       |        OT_ReproduceDistances_affine, ///< default
   120|       |        OT_Ranking_weighted_diff ///< same as _2, but use rank of y+ - rank of
   121|       |                                 ///< y-
   122|       |    };
   123|       |    Optimization_type_t optimization_type;
   124|       |
   125|       |    /** use 1/4 of the training points for the optimization, with
   126|       |     * max. ntrain_permutation. If ntrain_permutation == 0: train on
   127|       |     * centroids */
   128|       |    int ntrain_permutation;
   129|       |    double dis_weight_factor; ///< decay of exp that weights distance loss
   130|       |
   131|       |    /// refuse to train if it would require more than that amount of RAM
   132|       |    size_t max_memory;
   133|       |
   134|       |    // filename pattern for the logging of iterations
   135|       |    std::string log_pattern;
   136|       |
   137|       |    // sets default values
   138|       |    PolysemousTraining();
   139|       |
   140|       |    /// reorder the centroids so that the Hamming distance becomes a
   141|       |    /// good approximation of the SDC distance (called by train)
   142|       |    void optimize_pq_for_hamming(ProductQuantizer& pq, size_t n, const float* x)
   143|       |            const;
   144|       |
   145|       |    /// called by optimize_pq_for_hamming
   146|       |    void optimize_ranking(ProductQuantizer& pq, size_t n, const float* x) const;
   147|       |    /// called by optimize_pq_for_hamming
   148|       |    void optimize_reproduce_distances(ProductQuantizer& pq) const;
   149|       |
   150|       |    /// make sure we don't blow up the memory
   151|       |    size_t memory_usage_per_thread(const ProductQuantizer& pq) const;
   152|       |};
   153|       |
   154|       |} // namespace faiss
   155|       |
   156|       |#endif
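
The only entries flagged with a zero count above are trivial inline members (the default SimulatedAnnealingParameters constructor, the two empty destructors, and the sqr helper); everything else in the header is a declaration whose body lives in the corresponding .cpp. For reference, the usual way this API is exercised is to call PolysemousTraining::optimize_pq_for_hamming on an already trained ProductQuantizer. The sketch below is not part of the report: the random training data, the dimensions and the shortened iteration count are illustrative assumptions.

// Illustrative sketch: reorder the centroids of a trained ProductQuantizer so
// that Hamming distances between codes approximate the SDC distances.
#include <faiss/impl/PolysemousTraining.h>
#include <faiss/impl/ProductQuantizer.h>

#include <random>
#include <vector>

int main() {
    const size_t d = 32;    // vector dimension (illustrative)
    const size_t M = 4;     // number of sub-quantizers
    const size_t nbits = 8; // bits per sub-quantizer code
    const size_t n = 10000; // number of training vectors

    // synthetic training data, purely for illustration
    std::vector<float> x(n * d);
    std::mt19937 rng(42);
    std::uniform_real_distribution<float> u(0.0f, 1.0f);
    for (auto& v : x) {
        v = u(rng);
    }

    faiss::ProductQuantizer pq(d, M, nbits);
    pq.train(n, x.data());

    faiss::PolysemousTraining pt; // inherits the annealing defaults above
    pt.n_iter = 100000;           // shorten the 500000-iteration default
    pt.optimization_type =
            faiss::PolysemousTraining::OT_ReproduceDistances_affine; // default

    // permutes each sub-quantizer's centroid table in place
    pt.optimize_pq_for_hamming(pq, n, x.data());
    return 0;
}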
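
The lower-level classes declared in this header can also be driven directly, e.g. from a unit test, which would at least exercise the inline default constructor and the empty destructors flagged above. The toy sketch below makes several assumptions: the 4x4 distance matrices and the shortened annealing schedule are invented for illustration, not taken from the repository.

// Illustrative sketch: optimize a small permutation so that it reproduces a
// target distance matrix, using ReproduceDistancesObjective and
// SimulatedAnnealingOptimizer directly.
#include <faiss/impl/PolysemousTraining.h>

#include <cstdio>
#include <vector>

int main() {
    const int n = 4; // size of the permutation

    // "real" distances between the n items (size n^2) ...
    std::vector<double> source_dis = {
            0, 1, 2, 3,
            1, 0, 1, 2,
            2, 1, 0, 1,
            3, 2, 1, 0};
    // ... and the distances we would like the permuted indices to reproduce
    std::vector<double> target_dis = {
            0, 2, 1, 3,
            2, 0, 3, 1,
            1, 3, 0, 2,
            3, 1, 2, 0};

    faiss::ReproduceDistancesObjective obj(
            n, source_dis.data(), target_dis.data(), /*dis_weight_factor=*/1.0);

    faiss::SimulatedAnnealingParameters params; // the header defaults
    params.n_iter = 10000;                      // keep the toy run short
    params.n_redo = 1;

    faiss::SimulatedAnnealingOptimizer optim(&obj, params);

    std::vector<int> best_perm(n);
    double best_cost = optim.run_optimization(best_perm.data());
    std::printf("best cost: %g\n", best_cost);
    return 0;
}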