Coverage Report

Created: 2026-04-14 12:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/merger.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "storage/merger.h"
19
20
#include <gen_cpp/olap_file.pb.h>
21
#include <gen_cpp/types.pb.h>
22
#include <stddef.h>
23
#include <unistd.h>
24
25
#include <algorithm>
26
#include <iterator>
27
#include <memory>
28
#include <mutex>
29
#include <numeric>
30
#include <ostream>
31
#include <shared_mutex>
32
#include <string>
33
#include <utility>
34
#include <vector>
35
36
#include "cloud/config.h"
37
#include "common/config.h"
38
#include "common/logging.h"
39
#include "common/status.h"
40
#include "core/block/block.h"
41
#include "storage/iterator/block_reader.h"
42
#include "storage/iterator/vertical_block_reader.h"
43
#include "storage/iterator/vertical_merge_iterator.h"
44
#include "storage/iterators.h"
45
#include "storage/olap_common.h"
46
#include "storage/olap_define.h"
47
#include "storage/rowid_conversion.h"
48
#include "storage/rowset/rowset.h"
49
#include "storage/rowset/rowset_meta.h"
50
#include "storage/rowset/rowset_writer.h"
51
#include "storage/segment/segment_writer.h"
52
#include "storage/storage_engine.h"
53
#include "storage/tablet/base_tablet.h"
54
#include "storage/tablet/tablet.h"
55
#include "storage/tablet/tablet_fwd.h"
56
#include "storage/tablet/tablet_meta.h"
57
#include "storage/tablet/tablet_reader.h"
58
#include "storage/utils.h"
59
#include "util/slice.h"
60
61
namespace doris {
62
// Merge `src_rowset_readers` into `dst_rowset_writer` block by block (non-vertical compaction).
//
// All columns of `cur_tablet_schema` are read through one BlockReader; every block that is read
// is appended to the destination writer, which is flushed before returning.
// When `stats_output` is non-null it receives the row counters and file-cache statistics of the
// reader; when it also carries a rowid_conversion, the row locations of each non-empty block are
// recorded into it.
//
// Fails with InternalError for schemas that have cluster keys, with INTERNAL_ERROR when the
// tablet is neither TABLET_RUNNING nor TABLET_NOTREADY or when the storage engine is stopped,
// and otherwise propagates the first error reported by the reader or the writer.
Status Merger::vmerge_rowsets(BaseTabletSPtr tablet, ReaderType reader_type,
                              const TabletSchema& cur_tablet_schema,
                              const std::vector<RowsetReaderSharedPtr>& src_rowset_readers,
                              RowsetWriter* dst_rowset_writer, Statistics* stats_output) {
    if (!cur_tablet_schema.cluster_key_uids().empty()) {
        return Status::InternalError(
                "mow table with cluster keys does not support non vertical compaction");
    }

    BlockReader reader;
    TabletReader::ReaderParams params;
    params.tablet = tablet;
    params.reader_type = reader_type;

    // Hand every source rowset (and the delete predicates they carry) to the reader.
    TabletReadSource source;
    source.rs_splits.reserve(src_rowset_readers.size());
    for (const RowsetReaderSharedPtr& rowset_reader : src_rowset_readers) {
        source.rs_splits.emplace_back(rowset_reader);
    }
    source.fill_delete_predicates();
    params.set_read_source(std::move(source));

    params.version = dst_rowset_writer->version();

    // Read with a copy of the current schema, extended by the columns that only exist in the
    // schemas of the delete predicates (i.e. columns dropped from the latest schema).
    TabletSchemaSPtr merged_schema = std::make_shared<TabletSchema>();
    merged_schema->copy_from(cur_tablet_schema);
    for (auto& delete_pred : params.delete_predicates) {
        merged_schema->merge_dropped_columns(*delete_pred->tablet_schema());
    }
    params.tablet_schema = merged_schema;

    if (!tablet->tablet_schema()->cluster_key_uids().empty()) {
        params.delete_bitmap = tablet->tablet_meta()->delete_bitmap_ptr();
    }

    if (stats_output && stats_output->rowid_conversion) {
        params.record_rowids = true;
        params.rowid_conversion = stats_output->rowid_conversion;
        stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id());
    }

    // Return columns are simply 0 .. num_columns - 1.
    params.return_columns.resize(cur_tablet_schema.num_columns());
    std::iota(params.return_columns.begin(), params.return_columns.end(), 0);
    params.origin_return_columns = &params.return_columns;
    RETURN_IF_ERROR(reader.init(params));

    Block block = cur_tablet_schema.create_block(params.return_columns);
    size_t rows_written = 0;
    bool eof = false;
    while (!eof && !ExecEnv::GetInstance()->storage_engine().stopped()) {
        const auto state = tablet->tablet_state();
        const bool tablet_in_use = (state == TABLET_RUNNING || state == TABLET_NOTREADY);
        if (!tablet_in_use) {
            tablet->clear_cache();
            return Status::Error<INTERNAL_ERROR>("tablet {} is not used any more",
                                                 tablet->tablet_id());
        }

        // Pull the next block from the reader and push it to the writer.
        RETURN_NOT_OK_STATUS_WITH_WARN(reader.next_block_with_aggregation(&block, &eof),
                                       "failed to read next block when merging rowsets of tablet " +
                                               std::to_string(tablet->tablet_id()));
        RETURN_NOT_OK_STATUS_WITH_WARN(dst_rowset_writer->add_block(&block),
                                       "failed to write block when merging rowsets of tablet " +
                                               std::to_string(tablet->tablet_id()));

        const auto rows_in_block = block.rows();
        if (params.record_rowids && rows_in_block > 0) {
            // record_rowids is only set when stats_output and its rowid_conversion are non-null.
            std::vector<uint32_t> segment_num_rows;
            RETURN_IF_ERROR(dst_rowset_writer->get_segment_num_rows(&segment_num_rows));
            stats_output->rowid_conversion->add(reader.current_block_row_locations(),
                                                segment_num_rows);
        }

        rows_written += rows_in_block;
        block.clear_column_data();
    }

    if (ExecEnv::GetInstance()->storage_engine().stopped()) {
        return Status::Error<INTERNAL_ERROR>("tablet {} failed to do compaction, engine stopped",
                                             tablet->tablet_id());
    }

    if (stats_output != nullptr) {
        stats_output->output_rows = rows_written;
        stats_output->merged_rows = reader.merged_rows();
        stats_output->filtered_rows = reader.filtered_rows();

        const auto& cache_stats = reader.stats().file_cache_stats;
        stats_output->bytes_read_from_local = cache_stats.bytes_read_from_local;
        stats_output->bytes_read_from_remote = cache_stats.bytes_read_from_remote;
        stats_output->cached_bytes_total = cache_stats.bytes_write_into_cache;
        if (config::is_cloud_mode()) {
            stats_output->cloud_local_read_time = cache_stats.local_io_timer / 1000;
            stats_output->cloud_remote_read_time = cache_stats.remote_io_timer / 1000;
        }
    }

    RETURN_NOT_OK_STATUS_WITH_WARN(dst_rowset_writer->flush(),
                                   "failed to flush rowset when merging rowsets of tablet " +
                                           std::to_string(tablet->tablet_id()));

    return Status::OK();
}
164
165
// split columns into several groups, make sure all keys in one group
166
// unique_key should consider sequence&delete column
167
void Merger::vertical_split_columns(const TabletSchema& tablet_schema,
168
                                    std::vector<std::vector<uint32_t>>* column_groups,
169
                                    std::vector<uint32_t>* key_group_cluster_key_idxes,
170
8.33k
                                    int32_t num_columns_per_group) {
171
8.33k
    size_t num_key_cols = tablet_schema.num_key_columns();
172
8.33k
    size_t total_cols = tablet_schema.num_columns();
173
8.33k
    std::vector<uint32_t> key_columns;
174
43.3k
    for (auto i = 0; i < num_key_cols; ++i) {
175
35.0k
        key_columns.emplace_back(i);
176
35.0k
    }
177
    // in unique key, sequence & delete sign column should merge with key columns
178
8.33k
    int32_t sequence_col_idx = -1;
179
8.33k
    int32_t delete_sign_idx = -1;
180
    // in key column compaction, seq_col real index is _num_key_columns
181
    // and delete_sign column is _block->columns() - 1
182
8.33k
    if (tablet_schema.keys_type() == KeysType::UNIQUE_KEYS) {
183
4.10k
        if (tablet_schema.has_sequence_col()) {
184
94
            sequence_col_idx = tablet_schema.sequence_col_idx();
185
94
            key_columns.emplace_back(sequence_col_idx);
186
94
        }
187
4.10k
        delete_sign_idx = tablet_schema.field_index(DELETE_SIGN);
188
4.10k
        if (delete_sign_idx != -1) {
189
4.09k
            key_columns.emplace_back(delete_sign_idx);
190
4.09k
        }
191
4.10k
        if (!tablet_schema.cluster_key_uids().empty()) {
192
460
            for (const auto& cid : tablet_schema.cluster_key_uids()) {
193
460
                auto idx = tablet_schema.field_index(cid);
194
460
                DCHECK(idx >= 0) << "could not find cluster key column with unique_id=" << cid
195
0
                                 << " in tablet schema, table_id=" << tablet_schema.table_id();
196
460
                if (idx >= num_key_cols) {
197
217
                    key_columns.emplace_back(idx);
198
217
                }
199
460
            }
200
            // tablet schema unique ids: [1, 2, 5, 3, 6, 4], [1 2] is key columns
201
            // cluster key unique ids: [3, 1, 4]
202
            // the key_columns should be [0, 1, 3, 5]
203
            // the key_group_cluster_key_idxes should be [2, 1, 3]
204
461
            for (const auto& cid : tablet_schema.cluster_key_uids()) {
205
461
                auto idx = tablet_schema.field_index(cid);
206
2.68k
                for (auto i = 0; i < key_columns.size(); ++i) {
207
2.68k
                    if (idx == key_columns[i]) {
208
461
                        key_group_cluster_key_idxes->emplace_back(i);
209
461
                        break;
210
461
                    }
211
2.68k
                }
212
461
            }
213
155
        }
214
4.10k
    }
215
8.33k
    VLOG_NOTICE << "sequence_col_idx=" << sequence_col_idx
216
44
                << ", delete_sign_idx=" << delete_sign_idx;
217
    // for duplicate no keys
218
8.33k
    if (!key_columns.empty()) {
219
8.31k
        column_groups->emplace_back(key_columns);
220
8.31k
    }
221
222
8.33k
    std::vector<uint32_t> value_columns;
223
224
72.1k
    for (size_t i = num_key_cols; i < total_cols; ++i) {
225
63.7k
        if (i == sequence_col_idx || i == delete_sign_idx ||
226
63.7k
            key_columns.end() != std::find(key_columns.begin(), key_columns.end(), i)) {
227
4.39k
            continue;
228
4.39k
        }
229
230
59.3k
        if (!value_columns.empty() && value_columns.size() % num_columns_per_group == 0) {
231
7.15k
            column_groups->push_back(value_columns);
232
7.15k
            value_columns.clear();
233
7.15k
        }
234
59.3k
        value_columns.push_back(cast_set<uint32_t>(i));
235
59.3k
    }
236
237
8.33k
    if (!value_columns.empty()) {
238
7.71k
        column_groups->push_back(value_columns);
239
7.71k
    }
240
8.33k
}
241
242
// Compact one column group of a vertical compaction and write it to `dst_rowset_writer`.
//
// `column_group` lists the column indexes to read; `is_key` marks the key group. The columns
// are read from `src_rowset_readers` through a VerticalBlockReader built on `row_source_buf`
// and appended with RowsetWriter::add_columns(), then flushed with flush_columns().
// `batch_size`, `sample_info` and `enable_sparse_optimization` are forwarded to the reader.
//
// When `stats_output` is non-null it receives the file-cache statistics of this group and,
// for the key group only, the output/merged/filtered row counters. Row locations are recorded
// into stats_output->rowid_conversion for the key group only.
//
// Fails with INTERNAL_ERROR when the tablet is neither TABLET_RUNNING nor TABLET_NOTREADY or
// when the storage engine is stopped; otherwise propagates reader/writer errors.
Status Merger::vertical_compact_one_group(
        BaseTabletSPtr tablet, ReaderType reader_type, const TabletSchema& tablet_schema,
        bool is_key, const std::vector<uint32_t>& column_group, RowSourcesBuffer* row_source_buf,
        const std::vector<RowsetReaderSharedPtr>& src_rowset_readers,
        RowsetWriter* dst_rowset_writer, uint32_t max_rows_per_segment, Statistics* stats_output,
        std::vector<uint32_t> key_group_cluster_key_idxes, int64_t batch_size,
        CompactionSampleInfo* sample_info, bool enable_sparse_optimization) {
    // build tablet reader
    VLOG_NOTICE << "vertical compact one group, max_rows_per_segment=" << max_rows_per_segment;
    VerticalBlockReader reader(row_source_buf);
    TabletReader::ReaderParams reader_params;
    reader_params.is_key_column_group = is_key;
    // The parameter is taken by value and not used again, so move it instead of copying twice.
    reader_params.key_group_cluster_key_idxes = std::move(key_group_cluster_key_idxes);
    reader_params.tablet = tablet;
    reader_params.reader_type = reader_type;
    reader_params.enable_sparse_optimization = enable_sparse_optimization;

    TabletReadSource read_source;
    read_source.rs_splits.reserve(src_rowset_readers.size());
    for (const RowsetReaderSharedPtr& rs_reader : src_rowset_readers) {
        read_source.rs_splits.emplace_back(rs_reader);
    }
    read_source.fill_delete_predicates();
    reader_params.set_read_source(std::move(read_source));

    reader_params.version = dst_rowset_writer->version();

    // Read with a copy of the schema extended by the columns that only exist in the schemas
    // of the delete predicates.
    TabletSchemaSPtr merge_tablet_schema = std::make_shared<TabletSchema>();
    merge_tablet_schema->copy_from(tablet_schema);

    for (auto& del_pred_rs : reader_params.delete_predicates) {
        merge_tablet_schema->merge_dropped_columns(*del_pred_rs->tablet_schema());
    }

    reader_params.tablet_schema = merge_tablet_schema;
    bool has_cluster_key = false;
    if (!tablet->tablet_schema()->cluster_key_uids().empty()) {
        reader_params.delete_bitmap = tablet->tablet_meta()->delete_bitmap_ptr();
        has_cluster_key = true;
    }

    // Row id conversion is only recorded while compacting the key group.
    if (is_key && stats_output && stats_output->rowid_conversion) {
        reader_params.record_rowids = true;
        reader_params.rowid_conversion = stats_output->rowid_conversion;
        stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id());
    }

    reader_params.return_columns = column_group;
    reader_params.origin_return_columns = &reader_params.return_columns;
    reader_params.batch_size = batch_size;
    RETURN_IF_ERROR(reader.init(reader_params, sample_info));

    Block block = tablet_schema.create_block(reader_params.return_columns);
    size_t output_rows = 0;
    bool eof = false;
    while (!eof && !ExecEnv::GetInstance()->storage_engine().stopped()) {
        auto tablet_state = tablet->tablet_state();
        if (tablet_state != TABLET_RUNNING && tablet_state != TABLET_NOTREADY) {
            tablet->clear_cache();
            return Status::Error<INTERNAL_ERROR>("tablet {} is not used any more",
                                                 tablet->tablet_id());
        }
        // Read one block from block reader
        RETURN_NOT_OK_STATUS_WITH_WARN(reader.next_block_with_aggregation(&block, &eof),
                                       "failed to read next block when merging rowsets of tablet " +
                                               std::to_string(tablet->tablet_id()));
        RETURN_NOT_OK_STATUS_WITH_WARN(
                dst_rowset_writer->add_columns(&block, column_group, is_key, max_rows_per_segment,
                                               has_cluster_key),
                "failed to write block when merging rowsets of tablet " +
                        std::to_string(tablet->tablet_id()));

        if (is_key && reader_params.record_rowids && block.rows() > 0) {
            // record_rowids implies stats_output and its rowid_conversion are non-null.
            std::vector<uint32_t> segment_num_rows;
            RETURN_IF_ERROR(dst_rowset_writer->get_segment_num_rows(&segment_num_rows));
            stats_output->rowid_conversion->add(reader.current_block_row_locations(),
                                                segment_num_rows);
        }
        output_rows += block.rows();
        block.clear_column_data();
    }
    if (ExecEnv::GetInstance()->storage_engine().stopped()) {
        return Status::Error<INTERNAL_ERROR>("tablet {} failed to do compaction, engine stopped",
                                             tablet->tablet_id());
    }

    if (stats_output != nullptr) {
        // Row counters are only meaningful for the key group.
        if (is_key) {
            stats_output->output_rows = output_rows;
            stats_output->merged_rows = reader.merged_rows();
            stats_output->filtered_rows = reader.filtered_rows();
        }
        stats_output->bytes_read_from_local = reader.stats().file_cache_stats.bytes_read_from_local;
        stats_output->bytes_read_from_remote =
                reader.stats().file_cache_stats.bytes_read_from_remote;
        stats_output->cached_bytes_total = reader.stats().file_cache_stats.bytes_write_into_cache;
        if (config::is_cloud_mode()) {
            stats_output->cloud_local_read_time =
                    reader.stats().file_cache_stats.local_io_timer / 1000;
            stats_output->cloud_remote_read_time =
                    reader.stats().file_cache_stats.remote_io_timer / 1000;
        }
    }
    RETURN_IF_ERROR(dst_rowset_writer->flush_columns(is_key));

    return Status::OK();
}
349
350
// for segcompaction
351
Status Merger::vertical_compact_one_group(
352
        int64_t tablet_id, ReaderType reader_type, const TabletSchema& tablet_schema, bool is_key,
353
        const std::vector<uint32_t>& column_group, RowSourcesBuffer* row_source_buf,
354
        VerticalBlockReader& src_block_reader, segment_v2::SegmentWriter& dst_segment_writer,
355
        Statistics* stats_output, uint64_t* index_size, KeyBoundsPB& key_bounds,
356
22
        SimpleRowIdConversion* rowid_conversion) {
357
    // TODO: record_rowids
358
22
    Block block = tablet_schema.create_block(column_group);
359
22
    size_t output_rows = 0;
360
22
    bool eof = false;
361
138
    while (!eof && !ExecEnv::GetInstance()->storage_engine().stopped()) {
362
        // Read one block from block reader
363
116
        RETURN_NOT_OK_STATUS_WITH_WARN(src_block_reader.next_block_with_aggregation(&block, &eof),
364
116
                                       "failed to read next block when merging rowsets of tablet " +
365
116
                                               std::to_string(tablet_id));
366
116
        if (!block.rows()) {
367
0
            break;
368
0
        }
369
116
        RETURN_NOT_OK_STATUS_WITH_WARN(dst_segment_writer.append_block(&block, 0, block.rows()),
370
116
                                       "failed to write block when merging rowsets of tablet " +
371
116
                                               std::to_string(tablet_id));
372
373
116
        if (is_key && rowid_conversion != nullptr) {
374
30
            rowid_conversion->add(src_block_reader.current_block_row_locations());
375
30
        }
376
116
        output_rows += block.rows();
377
116
        block.clear_column_data();
378
116
    }
379
22
    if (ExecEnv::GetInstance()->storage_engine().stopped()) {
380
0
        return Status::Error<INTERNAL_ERROR>("tablet {} failed to do compaction, engine stopped",
381
0
                                             tablet_id);
382
0
    }
383
384
22
    if (stats_output != nullptr) {
385
22
        if (is_key) {
386
11
            stats_output->output_rows = output_rows;
387
11
            stats_output->merged_rows = src_block_reader.merged_rows();
388
11
            stats_output->filtered_rows = src_block_reader.filtered_rows();
389
11
        }
390
22
        stats_output->bytes_read_from_local =
391
22
                src_block_reader.stats().file_cache_stats.bytes_read_from_local;
392
22
        stats_output->bytes_read_from_remote =
393
22
                src_block_reader.stats().file_cache_stats.bytes_read_from_remote;
394
22
        stats_output->cached_bytes_total =
395
22
                src_block_reader.stats().file_cache_stats.bytes_write_into_cache;
396
22
    }
397
398
    // segcompaction produce only one segment at once
399
22
    RETURN_IF_ERROR(dst_segment_writer.finalize_columns_data());
400
22
    RETURN_IF_ERROR(dst_segment_writer.finalize_columns_index(index_size));
401
402
22
    if (is_key) {
403
11
        Slice min_key = dst_segment_writer.min_encoded_key();
404
11
        Slice max_key = dst_segment_writer.max_encoded_key();
405
11
        DCHECK_LE(min_key.compare(max_key), 0);
406
11
        key_bounds.set_min_key(min_key.to_string());
407
11
        key_bounds.set_max_key(max_key.to_string());
408
11
    }
409
410
22
    return Status::OK();
411
22
}
412
413
int64_t estimate_batch_size(int group_index, BaseTabletSPtr tablet, int64_t way_cnt,
414
22.9k
                            ReaderType reader_type) {
415
22.9k
    auto& sample_info_lock = tablet->get_sample_info_lock(reader_type);
416
22.9k
    auto& sample_infos = tablet->get_sample_infos(reader_type);
417
22.9k
    std::unique_lock<std::mutex> lock(sample_info_lock);
418
22.9k
    CompactionSampleInfo info = sample_infos[group_index];
419
22.9k
    if (way_cnt <= 0) {
420
7.11k
        LOG(INFO) << "estimate batch size for vertical compaction, tablet id: "
421
7.11k
                  << tablet->tablet_id() << " way cnt: " << way_cnt;
422
7.11k
        return 4096 - 32;
423
7.11k
    }
424
15.8k
    int64_t block_mem_limit = config::compaction_memory_bytes_limit / way_cnt;
425
15.8k
    if (tablet->last_compaction_status.is<ErrorCode::MEM_LIMIT_EXCEEDED>()) {
426
0
        block_mem_limit /= 4;
427
0
    }
428
429
15.8k
    int64_t group_data_size = 0;
430
15.8k
    if (info.group_data_size > 0 && info.bytes > 0 && info.rows > 0) {
431
0
        double smoothing_factor = 0.5;
432
0
        group_data_size =
433
0
                int64_t((cast_set<double>(info.group_data_size) * (1 - smoothing_factor)) +
434
0
                        (cast_set<double>(info.bytes / info.rows) * smoothing_factor));
435
0
        sample_infos[group_index].group_data_size = group_data_size;
436
15.8k
    } else if (info.group_data_size > 0 && (info.bytes <= 0 || info.rows <= 0)) {
437
0
        group_data_size = info.group_data_size;
438
15.8k
    } else if (info.group_data_size <= 0 && info.bytes > 0 && info.rows > 0) {
439
8.63k
        group_data_size = info.bytes / info.rows;
440
8.63k
        sample_infos[group_index].group_data_size = group_data_size;
441
8.63k
    } else {
442
7.22k
        LOG(INFO) << "estimate batch size for vertical compaction, tablet id: "
443
7.22k
                  << tablet->tablet_id() << " group data size: " << info.group_data_size
444
7.22k
                  << " row num: " << info.rows << " consume bytes: " << info.bytes;
445
7.22k
        return 1024 - 32;
446
7.22k
    }
447
448
8.63k
    if (group_data_size <= 0) {
449
0
        LOG(WARNING) << "estimate batch size for vertical compaction, tablet id: "
450
0
                     << tablet->tablet_id() << " unexpected group data size: " << group_data_size;
451
0
        return 4096 - 32;
452
0
    }
453
454
8.63k
    sample_infos[group_index].bytes = 0;
455
8.63k
    sample_infos[group_index].rows = 0;
456
457
8.63k
    int64_t batch_size = block_mem_limit / group_data_size;
458
8.63k
    int64_t res = std::max(std::min(batch_size, int64_t(4096 - 32)), int64_t(32L));
459
8.63k
    LOG(INFO) << "estimate batch size for vertical compaction, tablet id: " << tablet->tablet_id()
460
8.63k
              << " group data size: " << info.group_data_size << " row num: " << info.rows
461
8.63k
              << " consume bytes: " << info.bytes << " way cnt: " << way_cnt
462
8.63k
              << " batch size: " << res;
463
8.63k
    return res;
464
8.63k
}
465
466
// steps to do vertical merge:
467
// 1. split columns into column groups
468
// 2. compact groups one by one, generate a row_source_buf when compact key group
469
// and use this row_source_buf to compact value column groups
470
// 3. build output rowset
471
Status Merger::vertical_merge_rowsets(BaseTabletSPtr tablet, ReaderType reader_type,
472
                                      const TabletSchema& tablet_schema,
473
                                      const std::vector<RowsetReaderSharedPtr>& src_rowset_readers,
474
                                      RowsetWriter* dst_rowset_writer,
475
                                      uint32_t max_rows_per_segment, int64_t merge_way_num,
476
                                      Statistics* stats_output,
477
8.30k
                                      VerticalCompactionProgressCallback progress_cb) {
478
8.30k
    LOG(INFO) << "Start to do vertical compaction, tablet_id: " << tablet->tablet_id();
479
8.30k
    std::vector<std::vector<uint32_t>> column_groups;
480
8.30k
    std::vector<uint32_t> key_group_cluster_key_idxes;
481
    // If BE config vertical_compaction_num_columns_per_group has been modified from
482
    // its default value (5), use the BE config; otherwise use the tablet meta value.
483
8.30k
    constexpr int32_t default_num_columns_per_group = 5;
484
8.30k
    int32_t num_columns_per_group =
485
8.30k
            config::vertical_compaction_num_columns_per_group != default_num_columns_per_group
486
8.30k
                    ? config::vertical_compaction_num_columns_per_group
487
8.30k
                    : tablet->tablet_meta()->vertical_compaction_num_columns_per_group();
488
489
8.30k
    DBUG_EXECUTE_IF("Merger.vertical_merge_rowsets.check_num_columns_per_group", {
490
8.30k
        auto expected_value = DebugPoints::instance()->get_debug_param_or_default<int32_t>(
491
8.30k
                "Merger.vertical_merge_rowsets.check_num_columns_per_group", "expected_value", -1);
492
8.30k
        auto expected_tablet_id = DebugPoints::instance()->get_debug_param_or_default<int64_t>(
493
8.30k
                "Merger.vertical_merge_rowsets.check_num_columns_per_group", "tablet_id", -1);
494
8.30k
        if (expected_tablet_id != -1 && expected_tablet_id == tablet->tablet_id()) {
495
8.30k
            if (expected_value != -1 && expected_value != num_columns_per_group) {
496
8.30k
                LOG(FATAL) << "DEBUG_POINT CHECK FAILED: expected num_columns_per_group="
497
8.30k
                           << expected_value << " but got " << num_columns_per_group
498
8.30k
                           << " for tablet_id=" << tablet->tablet_id();
499
8.30k
            } else {
500
8.30k
                LOG(INFO) << "DEBUG_POINT CHECK PASSED: num_columns_per_group="
501
8.30k
                          << num_columns_per_group << ", tablet_id=" << tablet->tablet_id();
502
8.30k
            }
503
8.30k
        }
504
8.30k
    });
505
506
8.30k
    vertical_split_columns(tablet_schema, &column_groups, &key_group_cluster_key_idxes,
507
8.30k
                           num_columns_per_group);
508
509
8.30k
    if (progress_cb) {
510
8.21k
        progress_cb(column_groups.size(), 0);
511
8.21k
    }
512
513
    // Calculate total rows for density calculation after compaction
514
8.30k
    int64_t total_rows = 0;
515
63.2k
    for (const auto& rs_reader : src_rowset_readers) {
516
63.2k
        total_rows += rs_reader->rowset()->rowset_meta()->num_rows();
517
63.2k
    }
518
519
    // Use historical density for sparse wide table optimization
520
    // density = (total_cells - null_cells) / total_cells, smaller means more sparse
521
    // When density <= threshold, enable sparse optimization
522
    // threshold = 0 means disable, 1 means always enable (default)
523
8.30k
    bool enable_sparse_optimization = false;
524
8.30k
    if (config::sparse_column_compaction_threshold_percent > 0 &&
525
8.32k
        tablet->keys_type() == KeysType::UNIQUE_KEYS) {
526
4.09k
        double density = tablet->compaction_density.load();
527
4.09k
        enable_sparse_optimization = density <= config::sparse_column_compaction_threshold_percent;
528
529
4.09k
        LOG(INFO) << "Vertical compaction sparse optimization check: tablet_id="
530
4.09k
                  << tablet->tablet_id() << ", density=" << density
531
4.09k
                  << ", threshold=" << config::sparse_column_compaction_threshold_percent
532
4.09k
                  << ", total_rows=" << total_rows
533
4.09k
                  << ", num_columns=" << tablet_schema.num_columns()
534
4.09k
                  << ", total_cells=" << total_rows * tablet_schema.num_columns()
535
4.09k
                  << ", enable_sparse_optimization=" << enable_sparse_optimization;
536
4.09k
    }
537
538
8.30k
    RowSourcesBuffer row_sources_buf(tablet->tablet_id(), dst_rowset_writer->context().tablet_path,
539
8.30k
                                     reader_type);
540
8.30k
    Merger::Statistics total_stats;
541
8.32k
    if (stats_output != nullptr) {
542
8.32k
        total_stats.rowid_conversion = stats_output->rowid_conversion;
543
8.32k
    }
544
8.30k
    auto& sample_info_lock = tablet->get_sample_info_lock(reader_type);
545
8.30k
    auto& sample_infos = tablet->get_sample_infos(reader_type);
546
8.30k
    {
547
8.30k
        std::unique_lock<std::mutex> lock(sample_info_lock);
548
8.30k
        sample_infos.resize(column_groups.size());
549
8.30k
    }
550
    // compact group one by one
551
31.4k
    for (auto i = 0; i < column_groups.size(); ++i) {
552
23.1k
        VLOG_NOTICE << "row source size: " << row_sources_buf.total_size();
553
23.1k
        bool is_key = (i == 0);
554
23.1k
        int64_t batch_size = config::compaction_batch_size != -1
555
23.1k
                                     ? config::compaction_batch_size
556
23.1k
                                     : estimate_batch_size(i, tablet, merge_way_num, reader_type);
557
23.1k
        CompactionSampleInfo sample_info;
558
23.1k
        Merger::Statistics group_stats;
559
23.1k
        group_stats.rowid_conversion = total_stats.rowid_conversion;
560
18.4E
        Merger::Statistics* group_stats_ptr = stats_output != nullptr ? &group_stats : nullptr;
561
23.1k
        Status st = vertical_compact_one_group(
562
23.1k
                tablet, reader_type, tablet_schema, is_key, column_groups[i], &row_sources_buf,
563
23.1k
                src_rowset_readers, dst_rowset_writer, max_rows_per_segment, group_stats_ptr,
564
23.1k
                key_group_cluster_key_idxes, batch_size, &sample_info, enable_sparse_optimization);
565
23.1k
        {
566
23.1k
            std::unique_lock<std::mutex> lock(sample_info_lock);
567
23.1k
            sample_infos[i] = sample_info;
568
23.1k
        }
569
23.1k
        RETURN_IF_ERROR(st);
570
23.1k
        if (stats_output != nullptr) {
571
23.1k
            total_stats.bytes_read_from_local += group_stats.bytes_read_from_local;
572
23.1k
            total_stats.bytes_read_from_remote += group_stats.bytes_read_from_remote;
573
23.1k
            total_stats.cached_bytes_total += group_stats.cached_bytes_total;
574
23.1k
            total_stats.cloud_local_read_time += group_stats.cloud_local_read_time;
575
23.1k
            total_stats.cloud_remote_read_time += group_stats.cloud_remote_read_time;
576
23.1k
            if (is_key) {
577
8.31k
                total_stats.output_rows = group_stats.output_rows;
578
8.31k
                total_stats.merged_rows = group_stats.merged_rows;
579
8.31k
                total_stats.filtered_rows = group_stats.filtered_rows;
580
8.31k
                total_stats.rowid_conversion = group_stats.rowid_conversion;
581
8.31k
            }
582
23.1k
        }
583
23.1k
        if (progress_cb) {
584
22.8k
            progress_cb(column_groups.size(), i + 1);
585
22.8k
        }
586
23.1k
        if (is_key) {
587
8.32k
            RETURN_IF_ERROR(row_sources_buf.flush());
588
8.32k
        }
589
23.1k
        RETURN_IF_ERROR(row_sources_buf.seek_to_begin());
590
23.1k
    }
591
592
    // Calculate and store density for next compaction's sparse optimization threshold
593
    // density = (total_cells - total_null_count) / total_cells
594
    // Smaller density means more sparse
595
8.30k
    {
596
8.30k
        std::unique_lock<std::mutex> lock(sample_info_lock);
597
8.30k
        int64_t total_null_count = 0;
598
23.1k
        for (const auto& info : sample_infos) {
599
23.1k
            total_null_count += info.null_count;
600
23.1k
        }
601
8.30k
        int64_t total_cells = total_rows * tablet_schema.num_columns();
602
8.30k
        if (total_cells > 0) {
603
5.38k
            double density = static_cast<double>(total_cells - total_null_count) /
604
5.38k
                             static_cast<double>(total_cells);
605
5.38k
            tablet->compaction_density.store(density);
606
5.38k
            LOG(INFO) << "Vertical compaction density update: tablet_id=" << tablet->tablet_id()
607
5.38k
                      << ", total_cells=" << total_cells
608
5.38k
                      << ", total_null_count=" << total_null_count << ", density=" << density;
609
5.38k
        }
610
8.30k
    }
611
612
    // finish compact, build output rowset
613
8.30k
    VLOG_NOTICE << "finish compact groups";
614
8.30k
    RETURN_IF_ERROR(dst_rowset_writer->final_flush());
615
616
8.31k
    if (stats_output != nullptr) {
617
8.31k
        *stats_output = total_stats;
618
8.31k
    }
619
620
8.30k
    return Status::OK();
621
8.30k
}
622
} // namespace doris