/root/doris/be/src/olap/collection_statistics.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "collection_statistics.h" |
19 | | |
20 | | #include <set> |
21 | | #include <sstream> |
22 | | #include <stack> |
23 | | |
24 | | #include "common/exception.h" |
25 | | #include "olap/rowset/rowset.h" |
26 | | #include "olap/rowset/rowset_reader.h" |
27 | | #include "olap/rowset/segment_v2/index_file_reader.h" |
28 | | #include "olap/rowset/segment_v2/index_reader_helper.h" |
29 | | #include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h" |
30 | | #include "olap/rowset/segment_v2/inverted_index/util/string_helper.h" |
31 | | #include "util/uid_util.h" |
32 | | #include "vec/exprs/vexpr.h" |
33 | | #include "vec/exprs/vexpr_context.h" |
34 | | #include "vec/exprs/vliteral.h" |
35 | | #include "vec/exprs/vslot_ref.h" |
36 | | |
37 | | namespace doris { |
38 | | #include "common/compile_check_begin.h" |
39 | | |
// Walks every segment of every rowset split and accumulates inverted-index
// term statistics (_total_num_docs, _total_num_tokens, _term_doc_freqs) for
// the fields referenced by pushed-down MATCH/SEARCH predicates. These
// aggregates feed the idf/avg_dl computations further down in this file.
//
// Parameters:
//   state                      - runtime state; used here for query_id logging
//                                and passed to the predicate collectors.
//   rs_splits                  - rowset splits whose segments are scanned.
//   tablet_schema              - schema used to resolve index metadata.
//   common_expr_ctxs_push_down - pushed-down exprs mined for collectible predicates.
//   io_ctx                     - IO context forwarded to index readers.
//
// Returns OK when statistics were collected, when there was nothing to
// collect, or when a segment failed with a tolerable error (see below);
// any other segment failure is propagated.
Status CollectionStatistics::collect(
        RuntimeState* state, const std::vector<RowSetSplits>& rs_splits,
        const TabletSchemaSPtr& tablet_schema,
        const vectorized::VExprContextSPtrs& common_expr_ctxs_push_down, io::IOContext* io_ctx) {
    std::unordered_map<std::wstring, CollectInfo> collect_infos;
    RETURN_IF_ERROR(
            extract_collect_info(state, common_expr_ctxs_push_down, tablet_schema, &collect_infos));
    // No MATCH/SEARCH predicates to gather stats for: nothing to do, not an error.
    if (collect_infos.empty()) {
        LOG(WARNING) << "Index statistics collection: no collect info extracted.";
        return Status::OK();
    }

    for (const auto& rs_split : rs_splits) {
        const auto& rs_reader = rs_split.rs_reader;
        auto rowset = rs_reader->rowset();
        auto num_segments = rowset->num_segments();
        for (int32_t seg_id = 0; seg_id < num_segments; ++seg_id) {
            auto status =
                    process_segment(rowset, seg_id, tablet_schema.get(), collect_infos, io_ctx);
            if (!status.ok()) {
                // A missing index file or an index bypass is tolerated: we log
                // and keep scanning the remaining segments. Everything else
                // aborts the whole collection.
                if (status.code() == ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND ||
                    status.code() == ErrorCode::INVERTED_INDEX_BYPASS) {
                    LOG(ERROR) << "Index statistics collection failed: " << status.to_string();
                } else {
                    return status;
                }
            }
        }
    }

    // Build a single-line log with query_id, tablet_ids, and per-field term statistics
    if (VLOG_IS_ON(1)) {
        // std::set both dedupes and orders tablet ids for stable log output.
        std::set<int64_t> tablet_ids;
        for (const auto& rs_split : rs_splits) {
            if (rs_split.rs_reader && rs_split.rs_reader->rowset()) {
                tablet_ids.insert(rs_split.rs_reader->rowset()->rowset_meta()->tablet_id());
            }
        }

        std::ostringstream oss;
        oss << "CollectionStatistics: query_id=" << print_id(state->query_id());

        oss << ", tablet_ids=[";
        bool first_tablet = true;
        for (int64_t tid : tablet_ids) {
            if (!first_tablet) oss << ",";
            oss << tid;
            first_tablet = false;
        }
        oss << "]";

        oss << ", total_num_docs=" << _total_num_docs;

        // One "{field=..., num_tokens=..., terms=[...]}" group per collected field.
        for (const auto& [ws_field_name, num_tokens] : _total_num_tokens) {
            oss << ", {field=" << StringHelper::to_string(ws_field_name)
                << ", num_tokens=" << num_tokens << ", terms=[";

            bool first_term = true;
            for (const auto& [term, doc_freq] : _term_doc_freqs.at(ws_field_name)) {
                if (!first_term) oss << ", ";
                oss << "(" << StringHelper::to_string(term) << ":" << doc_freq << ")";
                first_term = false;
            }
            oss << "]}";
        }

        VLOG(1) << oss.str();
    }

    return Status::OK();
}
111 | | |
112 | | Status CollectionStatistics::extract_collect_info( |
113 | | RuntimeState* state, const vectorized::VExprContextSPtrs& common_expr_ctxs_push_down, |
114 | 10 | const TabletSchemaSPtr& tablet_schema, CollectInfoMap* collect_infos) { |
115 | 10 | DCHECK(collect_infos != nullptr); |
116 | | |
117 | 10 | std::unordered_map<TExprNodeType::type, PredicateCollectorPtr> collectors; |
118 | 10 | collectors[TExprNodeType::MATCH_PRED] = std::make_unique<MatchPredicateCollector>(); |
119 | 10 | collectors[TExprNodeType::SEARCH_EXPR] = std::make_unique<SearchPredicateCollector>(); |
120 | | |
121 | 10 | for (const auto& root_expr_ctx : common_expr_ctxs_push_down) { |
122 | 10 | const auto& root_expr = root_expr_ctx->root(); |
123 | 10 | if (root_expr == nullptr) { |
124 | 0 | continue; |
125 | 0 | } |
126 | | |
127 | 10 | std::stack<vectorized::VExprSPtr> stack; |
128 | 10 | stack.emplace(root_expr); |
129 | | |
130 | 43 | while (!stack.empty()) { |
131 | 33 | auto expr = stack.top(); |
132 | 33 | stack.pop(); |
133 | | |
134 | 33 | if (!expr) { |
135 | 0 | continue; |
136 | 0 | } |
137 | | |
138 | 33 | auto collector_it = collectors.find(expr->node_type()); |
139 | 33 | if (collector_it != collectors.end()) { |
140 | 9 | RETURN_IF_ERROR( |
141 | 9 | collector_it->second->collect(state, tablet_schema, expr, collect_infos)); |
142 | 9 | } |
143 | | |
144 | 33 | const auto& children = expr->children(); |
145 | 33 | for (const auto& child : children) { |
146 | 23 | stack.push(child); |
147 | 23 | } |
148 | 33 | } |
149 | 10 | } |
150 | | |
151 | 10 | LOG(INFO) << "Extracted collect info for " << collect_infos->size() << " fields"; |
152 | | |
153 | 10 | return Status::OK(); |
154 | 10 | } |
155 | | |
// Opens the inverted-index file of one segment and, for every field in
// collect_infos, accumulates that segment's contribution to the global
// statistics: max doc count per segment, per-field total token count, and
// per-term document frequency.
//
// In BE_TEST builds the index is opened directly; in production builds the
// searcher is fetched from (or inserted into) InvertedIndexSearcherCache so
// repeated queries share one open reader.
Status CollectionStatistics::process_segment(const RowsetSharedPtr& rowset, int32_t seg_id,
                                             const TabletSchema* tablet_schema,
                                             const CollectInfoMap& collect_infos,
                                             io::IOContext* io_ctx) {
    auto seg_path = DORIS_TRY(rowset->segment_path(seg_id));
    auto rowset_meta = rowset->rowset_meta();

    auto idx_file_reader = std::make_unique<IndexFileReader>(
            rowset_meta->fs(),
            std::string {InvertedIndexDescriptor::get_index_file_path_prefix(seg_path)},
            tablet_schema->get_inverted_index_storage_format(),
            rowset_meta->inverted_index_file_info(seg_id));
    RETURN_IF_ERROR(idx_file_reader->init(config::inverted_index_read_buffer_size, io_ctx));

    // Every field's index within the same segment reports the same segment
    // doc count, so we take the max across fields rather than summing.
    int32_t total_seg_num_docs = 0;

    for (const auto& [ws_field_name, collect_info] : collect_infos) {
        lucene::search::IndexSearcher* index_searcher = nullptr;
        lucene::index::IndexReader* index_reader = nullptr;

#ifdef BE_TEST
        // Test build: open the index directly, bypassing the searcher cache.
        // NOTE(review): searcher_ptr keeps the searcher (and, presumably, the
        // reader it was constructed over) alive for the rest of this loop
        // iteration only — confirm the CLucene ownership contract.
        auto compound_reader = DORIS_TRY(idx_file_reader->open(collect_info.index_meta, io_ctx));
        auto* reader = lucene::index::IndexReader::open(compound_reader.get());
        auto searcher_ptr = std::make_shared<lucene::search::IndexSearcher>(reader, true);
        index_searcher = searcher_ptr.get();
        index_reader = index_searcher->getReader();
#else
        InvertedIndexCacheHandle inverted_index_cache_handle;
        auto index_file_key = idx_file_reader->get_index_file_cache_key(collect_info.index_meta);
        InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key);

        // Cache miss: open the index, build a searcher, and insert it so later
        // lookups (including this one's handle) share the open reader.
        if (!InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key,
                                                            &inverted_index_cache_handle)) {
            auto compound_reader =
                    DORIS_TRY(idx_file_reader->open(collect_info.index_meta, io_ctx));
            auto* reader = lucene::index::IndexReader::open(compound_reader.get());
            // Charge the cache for the term-dictionary RAM held by the reader.
            size_t reader_size = reader->getTermInfosRAMUsed();
            auto searcher_ptr = std::make_shared<lucene::search::IndexSearcher>(reader, true);
            auto* cache_value = new InvertedIndexSearcherCache::CacheValue(
                    std::move(searcher_ptr), reader_size, UnixMillis());
            InvertedIndexSearcherCache::instance()->insert(searcher_cache_key, cache_value,
                                                           &inverted_index_cache_handle);
        }

        // The cache handle pins the searcher for the rest of this iteration.
        auto searcher_variant = inverted_index_cache_handle.get_index_searcher();
        auto index_searcher_ptr = std::get<FulltextIndexSearcherPtr>(searcher_variant);
        index_searcher = index_searcher_ptr.get();
        index_reader = index_searcher->getReader();
#endif
        total_seg_num_docs = std::max(total_seg_num_docs, index_reader->maxDoc());

        // Sum of term frequencies over the whole field; 0 when the index
        // does not expose the statistic.
        _total_num_tokens[ws_field_name] +=
                index_reader->sumTotalTermFreq(ws_field_name.c_str()).value_or(0);

        // Accumulate per-term document frequency for every predicate term.
        for (const auto& term_info : collect_info.term_infos) {
            auto iter = TermIterator::create(io_ctx, false, index_reader, ws_field_name,
                                             term_info.get_single_term());
            _term_doc_freqs[ws_field_name][iter->term()] += iter->doc_freq();
        }
    }

    _total_num_docs += total_seg_num_docs;

    return Status::OK();
}
221 | | |
222 | | uint64_t CollectionStatistics::get_term_doc_freq_by_col(const std::wstring& lucene_col_name, |
223 | 23 | const std::wstring& term) { |
224 | 23 | if (!_term_doc_freqs.contains(lucene_col_name)) { |
225 | 1 | throw Exception(ErrorCode::INVERTED_INDEX_CLUCENE_ERROR, |
226 | 1 | "Index statistics collection failed: Not such column {}", |
227 | 1 | StringHelper::to_string(lucene_col_name)); |
228 | 1 | } |
229 | | |
230 | 22 | if (!_term_doc_freqs[lucene_col_name].contains(term)) { |
231 | 0 | throw Exception(ErrorCode::INVERTED_INDEX_CLUCENE_ERROR, |
232 | 0 | "Index statistics collection failed: Not such term {}", |
233 | 0 | StringHelper::to_string(term)); |
234 | 0 | } |
235 | | |
236 | 22 | return _term_doc_freqs[lucene_col_name][term]; |
237 | 22 | } |
238 | | |
239 | 13 | uint64_t CollectionStatistics::get_total_term_cnt_by_col(const std::wstring& lucene_col_name) { |
240 | 13 | if (!_total_num_tokens.contains(lucene_col_name)) { |
241 | 2 | throw Exception(ErrorCode::INVERTED_INDEX_CLUCENE_ERROR, |
242 | 2 | "Index statistics collection failed: Not such column {}", |
243 | 2 | StringHelper::to_string(lucene_col_name)); |
244 | 2 | } |
245 | | |
246 | 11 | return _total_num_tokens[lucene_col_name]; |
247 | 13 | } |
248 | | |
249 | 33 | uint64_t CollectionStatistics::get_doc_num() const { |
250 | 33 | if (_total_num_docs == 0) { |
251 | 3 | throw Exception( |
252 | 3 | ErrorCode::INVERTED_INDEX_CLUCENE_ERROR, |
253 | 3 | "Index statistics collection failed: No data available for SimilarityCollector"); |
254 | 3 | } |
255 | | |
256 | 30 | return _total_num_docs; |
257 | 33 | } |
258 | | |
259 | 12 | float CollectionStatistics::get_or_calculate_avg_dl(const std::wstring& lucene_col_name) { |
260 | 12 | auto iter = _avg_dl_by_col.find(lucene_col_name); |
261 | 12 | if (iter != _avg_dl_by_col.end()) { |
262 | 2 | return iter->second; |
263 | 2 | } |
264 | | |
265 | 10 | const uint64_t total_term_cnt = get_total_term_cnt_by_col(lucene_col_name); |
266 | 10 | const uint64_t total_doc_cnt = get_doc_num(); |
267 | 10 | float avg_dl = total_doc_cnt > 0 ? float((double)total_term_cnt / (double)total_doc_cnt) : 0.0F; |
268 | 10 | _avg_dl_by_col[lucene_col_name] = avg_dl; |
269 | 10 | return avg_dl; |
270 | 12 | } |
271 | | |
272 | | float CollectionStatistics::get_or_calculate_idf(const std::wstring& lucene_col_name, |
273 | 22 | const std::wstring& term) { |
274 | 22 | auto iter = _idf_by_col_term.find(lucene_col_name); |
275 | 22 | if (iter != _idf_by_col_term.end()) { |
276 | 12 | auto term_iter = iter->second.find(term); |
277 | 12 | if (term_iter != iter->second.end()) { |
278 | 1 | return term_iter->second; |
279 | 1 | } |
280 | 12 | } |
281 | | |
282 | 21 | const uint64_t doc_num = get_doc_num(); |
283 | 21 | const uint64_t doc_freq = get_term_doc_freq_by_col(lucene_col_name, term); |
284 | 21 | auto idf = (float)std::log(1 + ((double)doc_num - (double)doc_freq + (double)0.5) / |
285 | 21 | ((double)doc_freq + (double)0.5)); |
286 | 21 | _idf_by_col_term[lucene_col_name][term] = idf; |
287 | 21 | return idf; |
288 | 22 | } |
289 | | |
290 | | #include "common/compile_check_end.h" |
291 | | } // namespace doris |