Coverage Report

Created: 2026-06-03 09:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/merger.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "storage/merger.h"
19
20
#include <gen_cpp/olap_file.pb.h>
21
#include <gen_cpp/types.pb.h>
22
#include <stddef.h>
23
#include <unistd.h>
24
25
#include <algorithm>
26
#include <iterator>
27
#include <memory>
28
#include <mutex>
29
#include <numeric>
30
#include <ostream>
31
#include <shared_mutex>
32
#include <string>
33
#include <unordered_map>
34
#include <utility>
35
#include <vector>
36
37
#include "cloud/config.h"
38
#include "common/config.h"
39
#include "common/logging.h"
40
#include "common/status.h"
41
#include "core/block/block.h"
42
#include "storage/iterator/block_reader.h"
43
#include "storage/iterator/vertical_block_reader.h"
44
#include "storage/iterator/vertical_merge_iterator.h"
45
#include "storage/iterators.h"
46
#include "storage/olap_common.h"
47
#include "storage/olap_define.h"
48
#include "storage/rowid_conversion.h"
49
#include "storage/rowset/beta_rowset.h"
50
#include "storage/rowset/rowset.h"
51
#include "storage/rowset/rowset_meta.h"
52
#include "storage/rowset/rowset_writer.h"
53
#include "storage/segment/segment.h"
54
#include "storage/segment/segment_writer.h"
55
#include "storage/storage_engine.h"
56
#include "storage/tablet/base_tablet.h"
57
#include "storage/tablet/tablet.h"
58
#include "storage/tablet/tablet_fwd.h"
59
#include "storage/tablet/tablet_meta.h"
60
#include "storage/tablet/tablet_reader.h"
61
#include "storage/types.h"
62
#include "storage/utils.h"
63
#include "util/slice.h"
64
65
namespace doris {
66
// Horizontal (row-wise) merge of `src_rowset_readers` into `dst_rowset_writer`.
//
// Schemas that carry cluster keys are rejected up front with an InternalError.
// All columns of `cur_tablet_schema` are read through a BlockReader and every
// block is appended to the writer, which is flushed once the input is drained.
// The loop also ends when the storage engine reports that it has stopped, in
// which case an INTERNAL_ERROR status is returned.
//
// When `stats_output` is non-null it receives the row counters and file-cache
// statistics of the reader. When it additionally carries a rowid_conversion,
// the row locations of each non-empty block are recorded into it.
Status Merger::vmerge_rowsets(BaseTabletSPtr tablet, ReaderType reader_type,
                              const TabletSchema& cur_tablet_schema,
                              const std::vector<RowsetReaderSharedPtr>& src_rowset_readers,
                              RowsetWriter* dst_rowset_writer, Statistics* stats_output) {
    if (!cur_tablet_schema.cluster_key_uids().empty()) {
        return Status::InternalError(
                "mow table with cluster keys does not support non vertical compaction");
    }

    const auto engine_stopped = [] {
        return ExecEnv::GetInstance()->storage_engine().stopped();
    };

    BlockReader reader;
    TabletReader::ReaderParams params;
    params.tablet = tablet;
    params.reader_type = reader_type;

    // Wrap every input rowset reader into a split of the read source.
    TabletReadSource source;
    source.rs_splits.reserve(src_rowset_readers.size());
    for (const auto& rs_reader : src_rowset_readers) {
        source.rs_splits.emplace_back(rs_reader);
    }
    source.fill_delete_predicates();
    params.set_read_source(std::move(source));

    params.version = dst_rowset_writer->version();

    // Read with a private copy of the schema that additionally contains the
    // columns referenced by delete predicates but missing from the current schema.
    TabletSchemaSPtr merged_schema = std::make_shared<TabletSchema>();
    merged_schema->copy_from(cur_tablet_schema);
    for (auto& del_pred_rs : params.delete_predicates) {
        merged_schema->merge_dropped_columns(*del_pred_rs->tablet_schema());
    }
    params.tablet_schema = merged_schema;

    if (!tablet->tablet_schema()->cluster_key_uids().empty()) {
        params.delete_bitmap = tablet->tablet_meta()->delete_bitmap_ptr();
    }
    // Binlog compaction takes precedence over the bitmap chosen above.
    if (params.reader_type == ReaderType::READER_BINLOG_COMPACTION) {
        params.delete_bitmap = tablet->tablet_meta()->binlog_delvec_ptr();
    }

    if (stats_output != nullptr && stats_output->rowid_conversion) {
        params.record_rowids = true;
        params.rowid_conversion = stats_output->rowid_conversion;
        stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id());
    }

    // Return every column of the current schema: 0, 1, ..., num_columns - 1.
    auto& columns = params.return_columns;
    columns.resize(cur_tablet_schema.num_columns());
    std::iota(columns.begin(), columns.end(), 0);
    params.origin_return_columns = &columns;
    RETURN_IF_ERROR(reader.init(params));

    Block block = cur_tablet_schema.create_block(params.return_columns);
    size_t rows_written = 0;
    bool reached_eof = false;
    for (;;) {
        if (reached_eof || engine_stopped()) {
            break;
        }

        const auto state = tablet->tablet_state();
        const bool tablet_usable = (state == TABLET_RUNNING) || (state == TABLET_NOTREADY);
        if (!tablet_usable) {
            tablet->clear_cache();
            return Status::Error<INTERNAL_ERROR>("tablet {} is not used any more",
                                                 tablet->tablet_id());
        }

        // Pull the next merged block and hand it to the writer.
        RETURN_NOT_OK_STATUS_WITH_WARN(reader.next_block_with_aggregation(&block, &reached_eof),
                                       "failed to read next block when merging rowsets of tablet " +
                                               std::to_string(tablet->tablet_id()));
        RETURN_NOT_OK_STATUS_WITH_WARN(dst_rowset_writer->add_block(&block),
                                       "failed to write block when merging rowsets of tablet " +
                                               std::to_string(tablet->tablet_id()));

        if (params.record_rowids && block.rows() > 0) {
            std::vector<uint32_t> segment_num_rows;
            RETURN_IF_ERROR(dst_rowset_writer->get_segment_num_rows(&segment_num_rows));
            stats_output->rowid_conversion->add(reader.current_block_row_locations(),
                                                segment_num_rows);
        }

        rows_written += block.rows();
        block.clear_column_data();
    }

    if (engine_stopped()) {
        return Status::Error<INTERNAL_ERROR>("tablet {} failed to do compaction, engine stopped",
                                             tablet->tablet_id());
    }

    if (stats_output != nullptr) {
        const auto& cache_stats = reader.stats().file_cache_stats;
        stats_output->output_rows = rows_written;
        stats_output->merged_rows = reader.merged_rows();
        stats_output->filtered_rows = reader.filtered_rows();
        stats_output->bytes_read_from_local = cache_stats.bytes_read_from_local;
        stats_output->bytes_read_from_remote = cache_stats.bytes_read_from_remote;
        stats_output->cached_bytes_total = cache_stats.bytes_write_into_cache;
        if (config::is_cloud_mode()) {
            stats_output->cloud_local_read_time = cache_stats.local_io_timer / 1000;
            stats_output->cloud_remote_read_time = cache_stats.remote_io_timer / 1000;
        }
    }

    RETURN_NOT_OK_STATUS_WITH_WARN(dst_rowset_writer->flush(),
                                   "failed to flush rowset when merging rowsets of tablet " +
                                           std::to_string(tablet->tablet_id()));

    return Status::OK();
}
171
172
// split columns into several groups, make sure all keys in one group
173
// unique_key should consider sequence&delete column
174
void Merger::vertical_split_columns(const TabletSchema& tablet_schema,
175
                                    std::vector<std::vector<uint32_t>>* column_groups,
176
                                    std::vector<uint32_t>* key_group_cluster_key_idxes,
177
10.2k
                                    int32_t num_columns_per_group) {
178
10.2k
    size_t num_key_cols = tablet_schema.num_key_columns();
179
10.2k
    size_t total_cols = tablet_schema.num_columns();
180
10.2k
    std::vector<uint32_t> key_columns;
181
47.1k
    for (auto i = 0; i < num_key_cols; ++i) {
182
36.9k
        key_columns.emplace_back(i);
183
36.9k
    }
184
    // in unique key, sequence & delete sign column should merge with key columns
185
10.2k
    int32_t sequence_col_idx = -1;
186
10.2k
    int32_t delete_sign_idx = -1;
187
    // in key column compaction, seq_col real index is _num_key_columns
188
    // and delete_sign column is _block->columns() - 1
189
10.2k
    if (tablet_schema.keys_type() == KeysType::UNIQUE_KEYS) {
190
5.80k
        if (tablet_schema.has_sequence_col()) {
191
86
            sequence_col_idx = tablet_schema.sequence_col_idx();
192
86
            key_columns.emplace_back(sequence_col_idx);
193
86
        }
194
5.80k
        delete_sign_idx = tablet_schema.field_index(DELETE_SIGN);
195
5.80k
        if (delete_sign_idx != -1) {
196
5.79k
            key_columns.emplace_back(delete_sign_idx);
197
5.79k
        }
198
5.80k
        if (!tablet_schema.cluster_key_uids().empty()) {
199
442
            for (const auto& cid : tablet_schema.cluster_key_uids()) {
200
442
                auto idx = tablet_schema.field_index(cid);
201
442
                DCHECK(idx >= 0) << "could not find cluster key column with unique_id=" << cid
202
0
                                 << " in tablet schema, table_id=" << tablet_schema.table_id();
203
442
                if (idx >= num_key_cols) {
204
213
                    key_columns.emplace_back(idx);
205
213
                }
206
442
            }
207
            // tablet schema unique ids: [1, 2, 5, 3, 6, 4], [1 2] is key columns
208
            // cluster key unique ids: [3, 1, 4]
209
            // the key_columns should be [0, 1, 3, 5]
210
            // the key_group_cluster_key_idxes should be [2, 1, 3]
211
441
            for (const auto& cid : tablet_schema.cluster_key_uids()) {
212
441
                auto idx = tablet_schema.field_index(cid);
213
2.53k
                for (auto i = 0; i < key_columns.size(); ++i) {
214
2.53k
                    if (idx == key_columns[i]) {
215
442
                        key_group_cluster_key_idxes->emplace_back(i);
216
442
                        break;
217
442
                    }
218
2.53k
                }
219
441
            }
220
149
        }
221
5.80k
    }
222
10.2k
    VLOG_NOTICE << "sequence_col_idx=" << sequence_col_idx
223
46
                << ", delete_sign_idx=" << delete_sign_idx;
224
    // for duplicate no keys
225
10.2k
    if (!key_columns.empty()) {
226
10.1k
        column_groups->emplace_back(key_columns);
227
10.1k
    }
228
229
10.2k
    std::vector<uint32_t> value_columns;
230
231
79.9k
    for (size_t i = num_key_cols; i < total_cols; ++i) {
232
69.7k
        if (i == sequence_col_idx || i == delete_sign_idx ||
233
69.7k
            key_columns.end() != std::find(key_columns.begin(), key_columns.end(), i)) {
234
6.07k
            continue;
235
6.07k
        }
236
237
63.6k
        if (!value_columns.empty() && value_columns.size() % num_columns_per_group == 0) {
238
6.91k
            column_groups->push_back(value_columns);
239
6.91k
            value_columns.clear();
240
6.91k
        }
241
63.6k
        value_columns.push_back(cast_set<uint32_t>(i));
242
63.6k
    }
243
244
10.2k
    if (!value_columns.empty()) {
245
9.68k
        column_groups->push_back(value_columns);
246
9.68k
    }
247
10.2k
}
248
249
// Compact one column group of `src_rowset_readers` into `dst_rowset_writer`.
//
// A VerticalBlockReader bound to `row_source_buf` reads the columns listed in
// `column_group`; each block is appended through RowsetWriter::add_columns and
// the group is finished with flush_columns(is_key). The loop also ends when the
// storage engine reports that it has stopped, in which case an INTERNAL_ERROR
// status is returned.
//
// `key_group_cluster_key_idxes` is taken by value and moved into the reader
// parameters. `batch_size`, `sample_info` and `enable_sparse_optimization` are
// forwarded to the reader unchanged.
//
// When `stats_output` is non-null:
//   - row counters (output/merged/filtered) are written only for the key group;
//   - file-cache statistics are written for every group, overwriting the
//     previous values held by `stats_output`;
//   - for the key group, if a rowid_conversion is attached, the row locations
//     of every non-empty block are recorded into it.
Status Merger::vertical_compact_one_group(
        BaseTabletSPtr tablet, ReaderType reader_type, const TabletSchema& tablet_schema,
        bool is_key, const std::vector<uint32_t>& column_group, RowSourcesBuffer* row_source_buf,
        const std::vector<RowsetReaderSharedPtr>& src_rowset_readers,
        RowsetWriter* dst_rowset_writer, uint32_t max_rows_per_segment, Statistics* stats_output,
        std::vector<uint32_t> key_group_cluster_key_idxes, int64_t batch_size,
        CompactionSampleInfo* sample_info, bool enable_sparse_optimization) {
    // build tablet reader
    VLOG_NOTICE << "vertical compact one group, max_rows_per_segment=" << max_rows_per_segment;
    VerticalBlockReader reader(row_source_buf);
    TabletReader::ReaderParams reader_params;
    reader_params.is_key_column_group = is_key;
    // The parameter is our own copy and is not used again, so hand it over
    // instead of copying the vector a second time.
    reader_params.key_group_cluster_key_idxes = std::move(key_group_cluster_key_idxes);
    reader_params.tablet = tablet;
    reader_params.reader_type = reader_type;
    reader_params.enable_sparse_optimization = enable_sparse_optimization;

    TabletReadSource read_source;
    read_source.rs_splits.reserve(src_rowset_readers.size());
    for (const RowsetReaderSharedPtr& rs_reader : src_rowset_readers) {
        read_source.rs_splits.emplace_back(rs_reader);
    }
    read_source.fill_delete_predicates();
    reader_params.set_read_source(std::move(read_source));

    reader_params.version = dst_rowset_writer->version();

    // Read with a private schema copy that also contains the columns referenced
    // by delete predicates but missing from `tablet_schema`.
    TabletSchemaSPtr merge_tablet_schema = std::make_shared<TabletSchema>();
    merge_tablet_schema->copy_from(tablet_schema);

    for (auto& del_pred_rs : reader_params.delete_predicates) {
        merge_tablet_schema->merge_dropped_columns(*del_pred_rs->tablet_schema());
    }

    reader_params.tablet_schema = merge_tablet_schema;
    bool has_cluster_key = false;
    if (!tablet->tablet_schema()->cluster_key_uids().empty()) {
        reader_params.delete_bitmap = tablet->tablet_meta()->delete_bitmap_ptr();
        has_cluster_key = true;
    }
    // Binlog compaction takes precedence over the bitmap chosen above.
    if (reader_params.reader_type == ReaderType::READER_BINLOG_COMPACTION) {
        reader_params.delete_bitmap = tablet->tablet_meta()->binlog_delvec_ptr();
    }

    // Row id conversion is only tracked while compacting the key group.
    if (is_key && stats_output && stats_output->rowid_conversion) {
        reader_params.record_rowids = true;
        reader_params.rowid_conversion = stats_output->rowid_conversion;
        stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id());
    }

    reader_params.return_columns = column_group;
    reader_params.origin_return_columns = &reader_params.return_columns;
    reader_params.batch_size = batch_size;
    RETURN_IF_ERROR(reader.init(reader_params, sample_info));

    Block block = tablet_schema.create_block(reader_params.return_columns);
    size_t output_rows = 0;
    bool eof = false;
    while (!eof && !ExecEnv::GetInstance()->storage_engine().stopped()) {
        auto tablet_state = tablet->tablet_state();
        if (tablet_state != TABLET_RUNNING && tablet_state != TABLET_NOTREADY) {
            tablet->clear_cache();
            return Status::Error<INTERNAL_ERROR>("tablet {} is not used any more",
                                                 tablet->tablet_id());
        }
        // Read one block from block reader
        RETURN_NOT_OK_STATUS_WITH_WARN(reader.next_block_with_aggregation(&block, &eof),
                                       "failed to read next block when merging rowsets of tablet " +
                                               std::to_string(tablet->tablet_id()));
        RETURN_NOT_OK_STATUS_WITH_WARN(
                dst_rowset_writer->add_columns(&block, column_group, is_key, max_rows_per_segment,
                                               has_cluster_key),
                "failed to write block when merging rowsets of tablet " +
                        std::to_string(tablet->tablet_id()));

        if (is_key && reader_params.record_rowids && block.rows() > 0) {
            std::vector<uint32_t> segment_num_rows;
            RETURN_IF_ERROR(dst_rowset_writer->get_segment_num_rows(&segment_num_rows));
            stats_output->rowid_conversion->add(reader.current_block_row_locations(),
                                                segment_num_rows);
        }
        output_rows += block.rows();
        block.clear_column_data();
    }
    if (ExecEnv::GetInstance()->storage_engine().stopped()) {
        return Status::Error<INTERNAL_ERROR>("tablet {} failed to do compaction, engine stopped",
                                             tablet->tablet_id());
    }

    if (stats_output != nullptr) {
        if (is_key) {
            stats_output->output_rows = output_rows;
            stats_output->merged_rows = reader.merged_rows();
            stats_output->filtered_rows = reader.filtered_rows();
        }
        stats_output->bytes_read_from_local = reader.stats().file_cache_stats.bytes_read_from_local;
        stats_output->bytes_read_from_remote =
                reader.stats().file_cache_stats.bytes_read_from_remote;
        stats_output->cached_bytes_total = reader.stats().file_cache_stats.bytes_write_into_cache;
        if (config::is_cloud_mode()) {
            stats_output->cloud_local_read_time =
                    reader.stats().file_cache_stats.local_io_timer / 1000;
            stats_output->cloud_remote_read_time =
                    reader.stats().file_cache_stats.remote_io_timer / 1000;
        }
    }
    RETURN_IF_ERROR(dst_rowset_writer->flush_columns(is_key));

    return Status::OK();
}
359
360
// for segcompaction
361
Status Merger::vertical_compact_one_group(
362
        int64_t tablet_id, ReaderType reader_type, const TabletSchema& tablet_schema, bool is_key,
363
        const std::vector<uint32_t>& column_group, RowSourcesBuffer* row_source_buf,
364
        VerticalBlockReader& src_block_reader, segment_v2::SegmentWriter& dst_segment_writer,
365
        Statistics* stats_output, uint64_t* index_size, KeyBoundsPB& key_bounds,
366
22
        SimpleRowIdConversion* rowid_conversion) {
367
    // TODO: record_rowids
368
22
    Block block = tablet_schema.create_block(column_group);
369
22
    size_t output_rows = 0;
370
22
    bool eof = false;
371
138
    while (!eof && !ExecEnv::GetInstance()->storage_engine().stopped()) {
372
        // Read one block from block reader
373
116
        RETURN_NOT_OK_STATUS_WITH_WARN(src_block_reader.next_block_with_aggregation(&block, &eof),
374
116
                                       "failed to read next block when merging rowsets of tablet " +
375
116
                                               std::to_string(tablet_id));
376
116
        if (!block.rows()) {
377
0
            break;
378
0
        }
379
116
        RETURN_NOT_OK_STATUS_WITH_WARN(dst_segment_writer.append_block(&block, 0, block.rows()),
380
116
                                       "failed to write block when merging rowsets of tablet " +
381
116
                                               std::to_string(tablet_id));
382
383
116
        if (is_key && rowid_conversion != nullptr) {
384
30
            rowid_conversion->add(src_block_reader.current_block_row_locations());
385
30
        }
386
116
        output_rows += block.rows();
387
116
        block.clear_column_data();
388
116
    }
389
22
    if (ExecEnv::GetInstance()->storage_engine().stopped()) {
390
0
        return Status::Error<INTERNAL_ERROR>("tablet {} failed to do compaction, engine stopped",
391
0
                                             tablet_id);
392
0
    }
393
394
22
    if (stats_output != nullptr) {
395
22
        if (is_key) {
396
11
            stats_output->output_rows = output_rows;
397
11
            stats_output->merged_rows = src_block_reader.merged_rows();
398
11
            stats_output->filtered_rows = src_block_reader.filtered_rows();
399
11
        }
400
22
        stats_output->bytes_read_from_local =
401
22
                src_block_reader.stats().file_cache_stats.bytes_read_from_local;
402
22
        stats_output->bytes_read_from_remote =
403
22
                src_block_reader.stats().file_cache_stats.bytes_read_from_remote;
404
22
        stats_output->cached_bytes_total =
405
22
                src_block_reader.stats().file_cache_stats.bytes_write_into_cache;
406
22
    }
407
408
    // segcompaction produce only one segment at once
409
22
    RETURN_IF_ERROR(dst_segment_writer.finalize_columns_data());
410
22
    RETURN_IF_ERROR(dst_segment_writer.finalize_columns_index(index_size));
411
412
22
    if (is_key) {
413
11
        Slice min_key = dst_segment_writer.min_encoded_key();
414
11
        Slice max_key = dst_segment_writer.max_encoded_key();
415
11
        DCHECK_LE(min_key.compare(max_key), 0);
416
11
        key_bounds.set_min_key(min_key.to_string());
417
11
        key_bounds.set_max_key(max_key.to_string());
418
11
    }
419
420
22
    return Status::OK();
421
22
}
422
423
int64_t estimate_batch_size(int group_index, BaseTabletSPtr tablet, int64_t way_cnt,
424
                            ReaderType reader_type, int64_t group_per_row_from_footer,
425
26.5k
                            bool footer_fallback) {
426
26.5k
    auto& sample_info_lock = tablet->get_sample_info_lock(reader_type);
427
26.5k
    auto& sample_infos = tablet->get_sample_infos(reader_type);
428
26.5k
    std::unique_lock<std::mutex> lock(sample_info_lock);
429
26.5k
    CompactionSampleInfo info = sample_infos[group_index];
430
26.5k
    if (way_cnt <= 0) {
431
6.86k
        LOG(INFO) << "estimate batch size for vertical compaction, tablet id: "
432
6.86k
                  << tablet->tablet_id() << " way cnt: " << way_cnt;
433
6.86k
        return 4096 - 32;
434
6.86k
    }
435
19.7k
    int64_t block_mem_limit = config::compaction_memory_bytes_limit / way_cnt;
436
19.7k
    if (tablet->last_compaction_status.is<ErrorCode::MEM_LIMIT_EXCEEDED>()) {
437
0
        block_mem_limit /= 4;
438
0
    }
439
440
19.7k
    int64_t group_data_size = 0;
441
19.7k
    if (info.group_data_size > 0 && info.bytes > 0 && info.rows > 0) {
442
0
        double smoothing_factor = 0.5;
443
0
        group_data_size =
444
0
                int64_t((cast_set<double>(info.group_data_size) * (1 - smoothing_factor)) +
445
0
                        (cast_set<double>(info.bytes / info.rows) * smoothing_factor));
446
0
        sample_infos[group_index].group_data_size = group_data_size;
447
19.7k
    } else if (info.group_data_size > 0 && (info.bytes <= 0 || info.rows <= 0)) {
448
0
        group_data_size = info.group_data_size;
449
19.7k
    } else if (info.group_data_size <= 0 && info.bytes > 0 && info.rows > 0) {
450
12.1k
        group_data_size = info.bytes / info.rows;
451
12.1k
        sample_infos[group_index].group_data_size = group_data_size;
452
12.1k
    } else {
453
        // No historical sampling data available.
454
        // Try to use raw_data_bytes from segment footer for a better estimate.
455
7.53k
        if (!footer_fallback && group_per_row_from_footer > 0) {
456
7.12k
            int64_t batch_size = block_mem_limit / group_per_row_from_footer;
457
7.12k
            int64_t res = std::max(std::min(batch_size, int64_t(4096 - 32)), int64_t(32L));
458
7.12k
            LOG(INFO) << "estimate batch size from footer for vertical compaction, tablet id: "
459
7.12k
                      << tablet->tablet_id()
460
7.12k
                      << " group_per_row_from_footer: " << group_per_row_from_footer
461
7.12k
                      << " way cnt: " << way_cnt << " batch size: " << res;
462
7.12k
            return res;
463
7.12k
        }
464
7.53k
        LOG(INFO) << "estimate batch size for vertical compaction, tablet id: "
465
413
                  << tablet->tablet_id() << " group data size: " << info.group_data_size
466
413
                  << " row num: " << info.rows << " consume bytes: " << info.bytes
467
413
                  << " footer_fallback: " << footer_fallback;
468
413
        return 1024 - 32;
469
7.53k
    }
470
471
12.1k
    if (group_data_size <= 0) {
472
0
        LOG(WARNING) << "estimate batch size for vertical compaction, tablet id: "
473
0
                     << tablet->tablet_id() << " unexpected group data size: " << group_data_size;
474
0
        return 4096 - 32;
475
0
    }
476
477
12.1k
    sample_infos[group_index].bytes = 0;
478
12.1k
    sample_infos[group_index].rows = 0;
479
480
12.1k
    int64_t batch_size = block_mem_limit / group_data_size;
481
12.1k
    int64_t res = std::max(std::min(batch_size, int64_t(4096 - 32)), int64_t(32L));
482
12.1k
    LOG(INFO) << "estimate batch size for vertical compaction, tablet id: " << tablet->tablet_id()
483
12.1k
              << " group data size: " << info.group_data_size << " row num: " << info.rows
484
12.1k
              << " consume bytes: " << info.bytes << " way cnt: " << way_cnt
485
12.1k
              << " batch size: " << res;
486
12.1k
    return res;
487
12.1k
}
488
489
// Vertically merges the source rowsets into dst_rowset_writer, one column group at a time.
// Steps to do vertical merge:
490
// 1. split columns into column groups
491
// 2. compact groups one by one; compacting the key group generates a row_source_buf,
492
// and this row_source_buf is then used to compact the value column groups
493
// 3. build output rowset
//
// stats_output may be nullptr. When it is set, its rowid_conversion is read up front and the
// whole struct is overwritten with the accumulated totals only if every step succeeds.
// progress_cb may be empty; when set it is called with (column_groups.size(), 0) before the
// first group and with (column_groups.size(), i + 1) after group i was compacted successfully.
// merge_way_num is forwarded to estimate_batch_size(); max_rows_per_segment is forwarded to
// vertical_compact_one_group().
// Returns the first non-OK Status hit by a RETURN_IF_ERROR below, otherwise Status::OK().
494
Status Merger::vertical_merge_rowsets(BaseTabletSPtr tablet, ReaderType reader_type,
495
                                      const TabletSchema& tablet_schema,
496
                                      const std::vector<RowsetReaderSharedPtr>& src_rowset_readers,
497
                                      RowsetWriter* dst_rowset_writer,
498
                                      uint32_t max_rows_per_segment, int64_t merge_way_num,
499
                                      Statistics* stats_output,
500
10.1k
                                      VerticalCompactionProgressCallback progress_cb) {
501
10.1k
    LOG(INFO) << "Start to do vertical compaction, tablet_id: " << tablet->tablet_id();
502
10.1k
    std::vector<std::vector<uint32_t>> column_groups;
503
10.1k
    std::vector<uint32_t> key_group_cluster_key_idxes;
504
    // If BE config vertical_compaction_num_columns_per_group has been modified from
505
    // its default value (5), use the BE config; otherwise use the tablet meta value.
    // NOTE: a BE config explicitly set to 5 is indistinguishable from the default here, so the
    // tablet meta value is used in that case as well.
506
10.1k
    constexpr int32_t default_num_columns_per_group = 5;
507
10.1k
    int32_t num_columns_per_group =
508
10.1k
            config::vertical_compaction_num_columns_per_group != default_num_columns_per_group
509
10.1k
                    ? config::vertical_compaction_num_columns_per_group
510
10.1k
                    : tablet->tablet_meta()->vertical_compaction_num_columns_per_group();
511
    // Debug point: for the configured tablet_id, compares num_columns_per_group with the
    // configured expected_value and logs FATAL on mismatch, INFO otherwise.
512
10.1k
    DBUG_EXECUTE_IF("Merger.vertical_merge_rowsets.check_num_columns_per_group", {
513
10.1k
        auto expected_value = DebugPoints::instance()->get_debug_param_or_default<int32_t>(
514
10.1k
                "Merger.vertical_merge_rowsets.check_num_columns_per_group", "expected_value", -1);
515
10.1k
        auto expected_tablet_id = DebugPoints::instance()->get_debug_param_or_default<int64_t>(
516
10.1k
                "Merger.vertical_merge_rowsets.check_num_columns_per_group", "tablet_id", -1);
517
10.1k
        if (expected_tablet_id != -1 && expected_tablet_id == tablet->tablet_id()) {
518
10.1k
            if (expected_value != -1 && expected_value != num_columns_per_group) {
519
10.1k
                LOG(FATAL) << "DEBUG_POINT CHECK FAILED: expected num_columns_per_group="
520
10.1k
                           << expected_value << " but got " << num_columns_per_group
521
10.1k
                           << " for tablet_id=" << tablet->tablet_id();
522
10.1k
            } else {
523
10.1k
                LOG(INFO) << "DEBUG_POINT CHECK PASSED: num_columns_per_group="
524
10.1k
                          << num_columns_per_group << ", tablet_id=" << tablet->tablet_id();
525
10.1k
            }
526
10.1k
        }
527
10.1k
    });
528
529
10.1k
    vertical_split_columns(tablet_schema, &column_groups, &key_group_cluster_key_idxes,
530
10.1k
                           num_columns_per_group);
531
    // Initial progress report: column_groups.size() groups in total, 0 of them done.
532
10.1k
    if (progress_cb) {
533
10.1k
        progress_cb(column_groups.size(), 0);
534
10.1k
    }
535
536
    // Calculate total rows for density calculation after compaction
537
10.1k
    int64_t total_rows = 0;
538
83.5k
    for (const auto& rs_reader : src_rowset_readers) {
539
83.5k
        total_rows += rs_reader->rowset()->rowset_meta()->num_rows();
540
83.5k
    }
541
542
    // Use historical density for sparse wide table optimization
543
    // density = (total_cells - null_cells) / total_cells, smaller means more sparse
544
    // When density <= threshold, enable sparse optimization
545
    // threshold = 0 means disable, 1 means always enable (default)
    // The density read here is the value stored in tablet->compaction_density at the end of
    // this function, i.e. it describes an earlier compaction, not the current input.
546
10.1k
    bool enable_sparse_optimization = false;
547
10.1k
    if (config::sparse_column_compaction_threshold_percent > 0 &&
548
10.2k
        tablet->keys_type() == KeysType::UNIQUE_KEYS) {
549
5.79k
        double density = tablet->compaction_density.load();
550
5.79k
        enable_sparse_optimization = density <= config::sparse_column_compaction_threshold_percent;
551
552
5.79k
        LOG(INFO) << "Vertical compaction sparse optimization check: tablet_id="
553
5.79k
                  << tablet->tablet_id() << ", density=" << density
554
5.79k
                  << ", threshold=" << config::sparse_column_compaction_threshold_percent
555
5.79k
                  << ", total_rows=" << total_rows
556
5.79k
                  << ", num_columns=" << tablet_schema.num_columns()
557
5.79k
                  << ", total_cells=" << total_rows * tablet_schema.num_columns()
558
5.79k
                  << ", enable_sparse_optimization=" << enable_sparse_optimization;
559
5.79k
    }
560
561
10.1k
    RowSourcesBuffer row_sources_buf(tablet->tablet_id(), dst_rowset_writer->context().tablet_path,
562
10.1k
                                     reader_type);
563
10.1k
    Merger::Statistics total_stats;
    // Seed the running totals with the caller's rowid_conversion; each group's stats object
    // copies it from total_stats further down.
564
10.2k
    if (stats_output != nullptr) {
565
10.2k
        total_stats.rowid_conversion = stats_output->rowid_conversion;
566
10.2k
    }
567
10.1k
    auto& sample_info_lock = tablet->get_sample_info_lock(reader_type);
568
10.1k
    auto& sample_infos = tablet->get_sample_infos(reader_type);
    // Make sure there is one sample slot per column group before any slot is indexed below.
569
10.1k
    {
570
10.1k
        std::unique_lock<std::mutex> lock(sample_info_lock);
571
10.1k
        sample_infos.resize(column_groups.size());
572
10.1k
    }
573
    // Collect per-column raw_data_bytes from segment footer for first-time batch size estimation.
574
    // raw_data_bytes is the original data size before encoding, close to runtime Block::bytes().
575
    // Only collect when needed: skip if manual batch_size override is set, or if ALL groups
576
    // already have historical sampling data. Use per-group granularity so that schema evolution
577
    // (new groups without history) still gets footer-based estimation.
578
10.1k
    struct ColumnRawSizeInfo {
579
10.1k
        int64_t total_raw_bytes = 0;
580
10.1k
        int64_t rows_with_data = 0;
581
10.1k
    };
582
10.1k
    std::unordered_map<int32_t, ColumnRawSizeInfo> column_raw_sizes;
583
10.1k
    bool need_footer_collection = false;
584
10.1k
    if (config::compaction_batch_size == -1) {
585
10.1k
        std::unique_lock<std::mutex> lock(sample_info_lock);
        // A single group without any recorded sample (all three fields non-positive) is
        // enough to trigger the footer collection.
586
19.2k
        for (const auto& info : sample_infos) {
587
19.2k
            if (info.group_data_size <= 0 && info.bytes <= 0 && info.rows <= 0) {
588
4.89k
                need_footer_collection = true;
589
4.89k
                break;
590
4.89k
            }
591
19.2k
        }
592
10.1k
    }
593
10.1k
    if (need_footer_collection) {
        // Best-effort pass: rowsets that are not BetaRowset, or whose segments fail to load,
        // are skipped and therefore contribute nothing to column_raw_sizes.
594
35.0k
        for (const auto& rs_reader : src_rowset_readers) {
595
35.0k
            auto beta_rowset = std::dynamic_pointer_cast<BetaRowset>(rs_reader->rowset());
596
35.0k
            if (!beta_rowset) {
597
0
                continue;
598
0
            }
599
35.0k
            std::vector<segment_v2::SegmentSharedPtr> segments;
600
35.0k
            auto st = beta_rowset->load_segments(&segments);
601
35.0k
            if (!st.ok()) {
602
0
                LOG(WARNING) << "Failed to load segments for footer raw_data_bytes collection"
603
0
                             << ", tablet_id: " << tablet->tablet_id()
604
0
                             << ", rowset_id: " << beta_rowset->rowset_id() << ", status: " << st;
605
0
                continue;
606
0
            }
607
35.0k
            for (const auto& segment : segments) {
608
13.6k
                int64_t row_count = segment->num_rows();
                // Keyed by column unique id: sums raw_data_bytes and the segment row count for
                // every column meta that carries raw_data_bytes.
609
13.6k
                auto collect_st = segment->traverse_column_meta_pbs(
610
132k
                        [&](const segment_v2::ColumnMetaPB& meta) {
611
132k
                            int32_t uid = meta.unique_id();
612
132k
                            if (uid >= 0 && meta.has_raw_data_bytes()) {
613
125k
                                auto& info = column_raw_sizes[uid];
614
125k
                                info.total_raw_bytes += meta.raw_data_bytes();
615
125k
                                info.rows_with_data += row_count;
616
125k
                            }
617
132k
                        });
                // NOTE(review): a traversal failure is only logged. If traverse_column_meta_pbs
                // can fail after visiting some columns (not visible here), the sizes already
                // accumulated for this segment stay in column_raw_sizes.
618
13.6k
                if (!collect_st.ok()) {
619
0
                    LOG(WARNING) << "Failed to traverse column meta for footer collection"
620
0
                                 << ", tablet_id: " << tablet->tablet_id()
621
0
                                 << ", status: " << collect_st;
622
0
                }
623
13.6k
            }
624
35.0k
        }
625
4.88k
    }
626
627
    // Pre-compute per-row estimate for each column group from footer data.
    // This loop also runs when no footer data was collected; column_raw_sizes is then empty.
628
10.1k
    std::vector<int64_t> group_per_row_from_footer(column_groups.size(), 0);
629
10.1k
    std::vector<bool> group_footer_fallback(column_groups.size(), false);
630
36.9k
    for (size_t i = 0; i < column_groups.size(); ++i) {
631
26.7k
        int64_t group_per_row = 0;
632
26.7k
        bool need_fallback = false;
633
44.9k
        for (uint32_t col_ordinal : column_groups[i]) {
634
44.9k
            const auto& col = tablet_schema.column(col_ordinal);
635
44.9k
            int32_t uid = col.unique_id();
636
637
            // Variant columns (root or subcolumn): raw_data_bytes is 0 (TODO in writer),
638
            // cannot estimate from footer, fallback to default for the entire group.
639
44.9k
            if (uid < 0 || col.is_variant_type()) {
640
1.03k
                need_fallback = true;
641
1.03k
                break;
642
1.03k
            }
643
644
            // Any column without footer data (e.g. legacy segments written before
645
            // raw_data_bytes existed) makes the group sample partial and unreliable.
646
            // Fall back to the default for the whole group instead of summing only
647
            // the columns we measured.
648
43.9k
            auto it = column_raw_sizes.find(uid);
649
43.9k
            if (it == column_raw_sizes.end() || it->second.rows_with_data <= 0) {
650
18.6k
                need_fallback = true;
651
18.6k
                break;
652
18.6k
            }
653
            // Integer division: the per-row average is truncated towards zero.
654
25.3k
            int64_t raw_per_row = it->second.total_raw_bytes / it->second.rows_with_data;
655
25.3k
            int64_t col_per_row = 0;
656
657
25.3k
            if (col.type() == FieldType::OLAP_FIELD_TYPE_ARRAY ||
658
25.3k
                col.type() == FieldType::OLAP_FIELD_TYPE_MAP ||
659
25.3k
                col.type() == FieldType::OLAP_FIELD_TYPE_STRUCT) {
660
                // Complex types: raw_data_bytes recursively aggregates sub-writers.
661
1.32k
                col_per_row = raw_per_row;
662
24.0k
            } else if (col.is_length_variable_type()) {
663
                // Variable-length scalar (VARCHAR/STRING/HLL/BITMAP/...): raw_per_row
664
                // is the average char payload across all rows; reader still pays an
665
                // 8-byte offset entry per row regardless of null-ness.
666
9.01k
                col_per_row = raw_per_row + 8;
667
9.01k
                if (col.is_nullable()) {
668
4.72k
                    col_per_row += 1; // null map
669
4.72k
                }
670
15.0k
            } else {
671
                // Fixed-width scalar (INT/BIGINT/DOUBLE/DATE/...).
672
                // raw_data_bytes only counts non-null payload (append_nulls() does
673
                // not advance the page builder), but FileColumnIterator::next_batch
674
                // still calls ColumnNullable::insert_many_defaults() for null runs,
675
                // which grows the nested PODArray by N * type_size. So the runtime
676
                // per-row footprint is at least type_size, no matter how sparse.
677
15.0k
                int64_t type_size = field_type_size(col.type());
678
15.0k
                col_per_row = std::max(raw_per_row, type_size);
679
15.0k
                if (col.is_nullable()) {
680
8.66k
                    col_per_row += 1; // null map
681
8.66k
                }
682
15.0k
            }
683
684
25.3k
            group_per_row += col_per_row;
685
25.3k
        }
        // On fallback group_per_row is only a partial sum; it is stored as-is next to the
        // fallback flag, and both are handed to estimate_batch_size() for this group.
686
26.7k
        group_per_row_from_footer[i] = group_per_row;
687
26.7k
        group_footer_fallback[i] = need_fallback;
688
26.7k
    }
689
690
    // compact group one by one
    // NOTE(review): `auto i = 0` deduces int while column_groups.size() is unsigned, so the
    // loop condition below is a signed/unsigned comparison.
691
36.9k
    for (auto i = 0; i < column_groups.size(); ++i) {
692
26.7k
        VLOG_NOTICE << "row source size: " << row_sources_buf.total_size();
        // The first column group is treated as the key group.
693
26.7k
        bool is_key = (i == 0);
        // A config::compaction_batch_size other than -1 overrides the estimate.
694
26.7k
        int64_t batch_size = config::compaction_batch_size != -1
695
26.7k
                                     ? config::compaction_batch_size
696
26.7k
                                     : estimate_batch_size(i, tablet, merge_way_num, reader_type,
697
26.6k
                                                           group_per_row_from_footer[i],
698
26.6k
                                                           group_footer_fallback[i]);
699
26.7k
        CompactionSampleInfo sample_info;
700
26.7k
        Merger::Statistics group_stats;
701
26.7k
        group_stats.rowid_conversion = total_stats.rowid_conversion;
        // Per-group stats are only passed down when the caller asked for stats.
702
18.4E
        Merger::Statistics* group_stats_ptr = stats_output != nullptr ? &group_stats : nullptr;
703
26.7k
        Status st = vertical_compact_one_group(
704
26.7k
                tablet, reader_type, tablet_schema, is_key, column_groups[i], &row_sources_buf,
705
26.7k
                src_rowset_readers, dst_rowset_writer, max_rows_per_segment, group_stats_ptr,
706
26.7k
                key_group_cluster_key_idxes, batch_size, &sample_info, enable_sparse_optimization);
        // The sample is recorded before st is checked, so it is stored even when the group
        // compaction failed.
707
26.7k
        {
708
26.7k
            std::unique_lock<std::mutex> lock(sample_info_lock);
709
26.7k
            sample_infos[i] = sample_info;
710
26.7k
        }
711
26.7k
        RETURN_IF_ERROR(st);
        // Read counters are summed over all groups; row counts and rowid_conversion are taken
        // from the key group only.
712
26.7k
        if (stats_output != nullptr) {
713
26.7k
            total_stats.bytes_read_from_local += group_stats.bytes_read_from_local;
714
26.7k
            total_stats.bytes_read_from_remote += group_stats.bytes_read_from_remote;
715
26.7k
            total_stats.cached_bytes_total += group_stats.cached_bytes_total;
716
26.7k
            total_stats.cloud_local_read_time += group_stats.cloud_local_read_time;
717
26.7k
            total_stats.cloud_remote_read_time += group_stats.cloud_remote_read_time;
718
26.7k
            if (is_key) {
719
10.1k
                total_stats.output_rows = group_stats.output_rows;
720
10.1k
                total_stats.merged_rows = group_stats.merged_rows;
721
10.1k
                total_stats.filtered_rows = group_stats.filtered_rows;
722
10.1k
                total_stats.rowid_conversion = group_stats.rowid_conversion;
723
10.1k
            }
724
26.7k
        }
725
26.7k
        if (progress_cb) {
726
26.4k
            progress_cb(column_groups.size(), i + 1);
727
26.4k
        }
        // row_sources_buf is flushed once, right after the key group, and rewound to its
        // beginning after every group.
728
26.7k
        if (is_key) {
729
10.2k
            RETURN_IF_ERROR(row_sources_buf.flush());
730
10.2k
        }
731
26.7k
        RETURN_IF_ERROR(row_sources_buf.seek_to_begin());
732
26.7k
    }
733
734
    // Calculate and store density for next compaction's sparse optimization threshold
735
    // density = (total_cells - total_null_count) / total_cells
736
    // Smaller density means more sparse
    // total_cells is derived from the source row count (total_rows), and nothing is stored
    // when it is not positive.
737
10.1k
    {
738
10.1k
        std::unique_lock<std::mutex> lock(sample_info_lock);
739
10.1k
        int64_t total_null_count = 0;
740
26.8k
        for (const auto& info : sample_infos) {
741
26.8k
            total_null_count += info.null_count;
742
26.8k
        }
743
10.1k
        int64_t total_cells = total_rows * tablet_schema.num_columns();
744
10.1k
        if (total_cells > 0) {
745
7.39k
            double density = static_cast<double>(total_cells - total_null_count) /
746
7.39k
                             static_cast<double>(total_cells);
747
7.39k
            tablet->compaction_density.store(density);
748
7.39k
            LOG(INFO) << "Vertical compaction density update: tablet_id=" << tablet->tablet_id()
749
7.39k
                      << ", total_cells=" << total_cells
750
7.39k
                      << ", total_null_count=" << total_null_count << ", density=" << density;
751
7.39k
        }
752
10.1k
    }
753
754
    // finish compact, build output rowset
755
10.1k
    VLOG_NOTICE << "finish compact groups";
756
10.1k
    RETURN_IF_ERROR(dst_rowset_writer->final_flush());
757
    // stats_output is only overwritten here, after every step above has succeeded.
758
10.2k
    if (stats_output != nullptr) {
759
10.2k
        *stats_output = total_stats;
760
10.2k
    }
761
762
10.1k
    return Status::OK();
763
10.1k
}
764
} // namespace doris