Coverage Report

Created: 2025-04-21 11:57

/root/doris/be/src/olap/merger.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "olap/merger.h"
19
20
#include <gen_cpp/olap_file.pb.h>
21
#include <gen_cpp/types.pb.h>
22
#include <stddef.h>
23
24
#include <algorithm>
25
#include <iterator>
26
#include <memory>
27
#include <mutex>
28
#include <numeric>
29
#include <ostream>
30
#include <shared_mutex>
31
#include <string>
32
#include <utility>
33
#include <vector>
34
35
#include "common/config.h"
36
#include "common/logging.h"
37
#include "common/status.h"
38
#include "olap/iterators.h"
39
#include "olap/olap_common.h"
40
#include "olap/olap_define.h"
41
#include "olap/rowid_conversion.h"
42
#include "olap/rowset/rowset.h"
43
#include "olap/rowset/rowset_meta.h"
44
#include "olap/rowset/rowset_writer.h"
45
#include "olap/rowset/segment_v2/segment_writer.h"
46
#include "olap/storage_engine.h"
47
#include "olap/tablet.h"
48
#include "olap/tablet_fwd.h"
49
#include "olap/tablet_reader.h"
50
#include "olap/utils.h"
51
#include "util/slice.h"
52
#include "vec/core/block.h"
53
#include "vec/olap/block_reader.h"
54
#include "vec/olap/vertical_block_reader.h"
55
#include "vec/olap/vertical_merge_iterator.h"
56
57
namespace doris {
58
59
// Merge all source rowsets into dst_rowset_writer using a single BlockReader.
//
// tablet:             tablet being compacted; consulted for its schema, state and id.
// reader_type:        forwarded unchanged into the reader params.
// cur_tablet_schema:  schema that defines the returned columns and the working block.
// src_rowset_readers: inputs; one RowSetSplits entry is created per reader.
// dst_rowset_writer:  output writer; also supplies the version and the rowset id.
// stats_output:       optional. Receives output/merged/filtered row counts; when its
//                     rowid_conversion is set, row locations are recorded as well.
//
// Returns Status::OK() on success, or the first error produced while initialising the
// reader, reading, writing or flushing. An error is also returned when the tablet is in
// TABLET_SHUTDOWN state or the storage engine has been stopped.
Status Merger::vmerge_rowsets(TabletSharedPtr tablet, ReaderType reader_type,
                              TabletSchemaSPtr cur_tablet_schema,
                              const std::vector<RowsetReaderSharedPtr>& src_rowset_readers,
                              RowsetWriter* dst_rowset_writer, Statistics* stats_output) {
    vectorized::BlockReader reader;
    TabletReader::ReaderParams params;
    params.tablet = tablet;
    params.reader_type = reader_type;

    // Describe the inputs: one split per source rowset reader.
    TabletReader::ReadSource source;
    source.rs_splits.reserve(src_rowset_readers.size());
    for (const auto& src_reader : src_rowset_readers) {
        source.rs_splits.emplace_back(RowSetSplits(src_reader));
    }
    source.fill_delete_predicates();
    params.set_read_source(std::move(source));

    params.version = dst_rowset_writer->version();

    // Read with a private copy of the schema, extended with the columns that the delete
    // predicates still reference but that are no longer part of the latest schema.
    TabletSchemaSPtr merged_schema = std::make_shared<TabletSchema>();
    merged_schema->copy_from(*cur_tablet_schema);
    for (auto& delete_pred : params.delete_predicates) {
        merged_schema->merge_dropped_columns(*delete_pred->tablet_schema());
    }
    params.tablet_schema = merged_schema;

    const bool has_cluster_keys = !tablet->tablet_schema()->cluster_key_idxes().empty();
    if (has_cluster_keys) {
        params.delete_bitmap = &tablet->tablet_meta()->delete_bitmap();
    }

    if (stats_output != nullptr && stats_output->rowid_conversion != nullptr) {
        params.record_rowids = true;
        params.rowid_conversion = stats_output->rowid_conversion;
        stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id());
    }

    // Return every column of the current schema: ids 0 .. num_columns - 1.
    params.return_columns.resize(cur_tablet_schema->num_columns());
    std::iota(params.return_columns.begin(), params.return_columns.end(), 0);
    params.origin_return_columns = &params.return_columns;
    RETURN_IF_ERROR(reader.init(params));

    vectorized::Block block = cur_tablet_schema->create_block(params.return_columns);
    size_t rows_written = 0;
    bool reached_eof = false;
    while (!reached_eof && !StorageEngine::instance()->stopped()) {
        if (tablet->tablet_state() == TABLET_SHUTDOWN) {
            return Status::Error<INTERNAL_ERROR>("tablet {} is not used any more",
                                                 tablet->tablet_id());
        }

        // Pull the next block from the reader, then hand it to the writer.
        RETURN_NOT_OK_STATUS_WITH_WARN(reader.next_block_with_aggregation(&block, &reached_eof),
                                       "failed to read next block when merging rowsets of tablet " +
                                               std::to_string(tablet->tablet_id()));
        RETURN_NOT_OK_STATUS_WITH_WARN(dst_rowset_writer->add_block(&block),
                                       "failed to write block when merging rowsets of tablet " +
                                               std::to_string(tablet->tablet_id()));

        if (params.record_rowids && block.rows() > 0) {
            std::vector<uint32_t> rows_per_segment;
            RETURN_IF_ERROR(dst_rowset_writer->get_segment_num_rows(&rows_per_segment));
            stats_output->rowid_conversion->add(reader.current_block_row_locations(),
                                                rows_per_segment);
        }

        rows_written += block.rows();
        block.clear_column_data();
    }

    // The loop also ends when the engine stops; report that as a failure.
    if (StorageEngine::instance()->stopped()) {
        return Status::Error<INTERNAL_ERROR>("tablet {} failed to do compaction, engine stopped",
                                             tablet->tablet_id());
    }

    if (stats_output != nullptr) {
        stats_output->output_rows = rows_written;
        stats_output->merged_rows = reader.merged_rows();
        stats_output->filtered_rows = reader.filtered_rows();
    }

    RETURN_NOT_OK_STATUS_WITH_WARN(dst_rowset_writer->flush(),
                                   "failed to flush rowset when merging rowsets of tablet " +
                                           std::to_string(tablet->tablet_id()));

    return Status::OK();
}
144
145
// split columns into several groups, make sure all keys in one group
146
// unique_key should consider sequence&delete column
147
void Merger::vertical_split_columns(TabletSchemaSPtr tablet_schema,
148
59
                                    std::vector<std::vector<uint32_t>>* column_groups) {
149
59
    uint32_t num_key_cols = tablet_schema->num_key_columns();
150
59
    uint32_t total_cols = tablet_schema->num_columns();
151
59
    std::vector<uint32_t> key_columns;
152
125
    for (auto i = 0; i < num_key_cols; ++i) {
153
66
        key_columns.emplace_back(i);
154
66
    }
155
    // in unique key, sequence & delete sign column should merge with key columns
156
59
    int32_t sequence_col_idx = -1;
157
59
    int32_t delete_sign_idx = -1;
158
    // in key column compaction, seq_col real index is _num_key_columns
159
    // and delete_sign column is _block->columns() - 1
160
59
    if (tablet_schema->keys_type() == KeysType::UNIQUE_KEYS) {
161
39
        if (tablet_schema->has_sequence_col()) {
162
4
            sequence_col_idx = tablet_schema->sequence_col_idx();
163
4
            key_columns.emplace_back(sequence_col_idx);
164
4
        }
165
39
        delete_sign_idx = tablet_schema->field_index(DELETE_SIGN);
166
39
        if (delete_sign_idx != -1) {
167
33
            key_columns.emplace_back(delete_sign_idx);
168
33
        }
169
39
        if (!tablet_schema->cluster_key_idxes().empty()) {
170
0
            for (const auto& cid : tablet_schema->cluster_key_idxes()) {
171
0
                if (cid >= num_key_cols) {
172
0
                    key_columns.emplace_back(cid);
173
0
                }
174
0
            }
175
0
        }
176
39
    }
177
59
    VLOG_NOTICE << "sequence_col_idx=" << sequence_col_idx
178
0
                << ", delete_sign_idx=" << delete_sign_idx;
179
    // for duplicate no keys
180
59
    if (!key_columns.empty()) {
181
55
        column_groups->emplace_back(std::move(key_columns));
182
55
    }
183
59
    auto&& cluster_key_idxes = tablet_schema->cluster_key_idxes();
184
59
    std::vector<uint32_t> value_columns;
185
163
    for (uint32_t i = num_key_cols; i < total_cols; ++i) {
186
104
        if (i == sequence_col_idx || i == delete_sign_idx ||
187
104
            cluster_key_idxes.end() !=
188
67
                    std::find(cluster_key_idxes.begin(), cluster_key_idxes.end(), i)) {
189
37
            continue;
190
37
        }
191
192
67
        if (!value_columns.empty() &&
193
67
            value_columns.size() % config::vertical_compaction_num_columns_per_group == 0) {
194
0
            column_groups->push_back(value_columns);
195
0
            value_columns.clear();
196
0
        }
197
67
        value_columns.push_back(i);
198
67
    }
199
200
59
    if (!value_columns.empty()) {
201
59
        column_groups->push_back(value_columns);
202
59
    }
203
59
}
204
205
// Compact one column group of the source rowsets into dst_rowset_writer.
//
// tablet:               tablet being compacted; consulted for its schema and id.
// reader_type:          forwarded unchanged into the reader params.
// tablet_schema:        schema used for the merged read schema and the working block.
// is_key:               true when column_group is the key column group; only then are
//                       row ids recorded and stats_output filled.
// column_group:         column ids to read and write in this pass.
// row_source_buf:       buffer handed to the VerticalBlockReader.
// src_rowset_readers:   inputs; one RowSetSplits entry is created per reader.
// dst_rowset_writer:    output writer; also supplies the version and the rowset id.
// max_rows_per_segment: forwarded to RowsetWriter::add_columns.
// stats_output:         optional statistics sink, see is_key.
// key_group_cluster_key_idxes: copied into the reader params.
// batch_size:           copied into the reader params.
// sample_info:          forwarded to the reader's init().
//
// Returns Status::OK() on success, or the first error encountered. An error is also
// returned when the storage engine has been stopped.
Status Merger::vertical_compact_one_group(
        TabletSharedPtr tablet, ReaderType reader_type, TabletSchemaSPtr tablet_schema, bool is_key,
        const std::vector<uint32_t>& column_group, vectorized::RowSourcesBuffer* row_source_buf,
        const std::vector<RowsetReaderSharedPtr>& src_rowset_readers,
        RowsetWriter* dst_rowset_writer, int64_t max_rows_per_segment, Statistics* stats_output,
        std::vector<uint32_t> key_group_cluster_key_idxes, int64_t batch_size,
        CompactionSampleInfo* sample_info) {
    // build tablet reader
    VLOG_NOTICE << "vertical compact one group, max_rows_per_segment=" << max_rows_per_segment;
    vectorized::VerticalBlockReader reader(row_source_buf);
    TabletReader::ReaderParams params;
    params.is_key_column_group = is_key;
    params.key_group_cluster_key_idxes = key_group_cluster_key_idxes;
    params.tablet = tablet;
    params.reader_type = reader_type;

    // Describe the inputs: one split per source rowset reader.
    TabletReader::ReadSource source;
    source.rs_splits.reserve(src_rowset_readers.size());
    for (const auto& src_reader : src_rowset_readers) {
        source.rs_splits.emplace_back(RowSetSplits(src_reader));
    }
    source.fill_delete_predicates();
    params.set_read_source(std::move(source));

    params.version = dst_rowset_writer->version();

    // Read with a private copy of the schema, extended with the dropped columns that
    // the delete predicates' schemas still carry.
    TabletSchemaSPtr merged_schema = std::make_shared<TabletSchema>();
    merged_schema->copy_from(*tablet_schema);
    for (auto& delete_pred : params.delete_predicates) {
        merged_schema->merge_dropped_columns(*delete_pred->tablet_schema());
    }
    params.tablet_schema = merged_schema;

    const bool has_cluster_keys = !tablet->tablet_schema()->cluster_key_idxes().empty();
    if (has_cluster_keys) {
        params.delete_bitmap = &tablet->tablet_meta()->delete_bitmap();
    }

    if (is_key && stats_output != nullptr && stats_output->rowid_conversion != nullptr) {
        params.record_rowids = true;
        params.rowid_conversion = stats_output->rowid_conversion;
        stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id());
    }

    params.return_columns = column_group;
    params.origin_return_columns = &params.return_columns;
    params.batch_size = batch_size;
    RETURN_IF_ERROR(reader.init(params, sample_info));

    vectorized::Block block = tablet_schema->create_block(params.return_columns);
    size_t rows_written = 0;
    bool reached_eof = false;
    while (!reached_eof && !StorageEngine::instance()->stopped()) {
        // Pull the next block from the reader, then hand its columns to the writer.
        RETURN_NOT_OK_STATUS_WITH_WARN(reader.next_block_with_aggregation(&block, &reached_eof),
                                       "failed to read next block when merging rowsets of tablet " +
                                               std::to_string(tablet->tablet_id()));
        RETURN_NOT_OK_STATUS_WITH_WARN(
                dst_rowset_writer->add_columns(&block, column_group, is_key, max_rows_per_segment),
                "failed to write block when merging rowsets of tablet " +
                        std::to_string(tablet->tablet_id()));

        if (is_key && params.record_rowids && block.rows() > 0) {
            std::vector<uint32_t> rows_per_segment;
            RETURN_IF_ERROR(dst_rowset_writer->get_segment_num_rows(&rows_per_segment));
            stats_output->rowid_conversion->add(reader.current_block_row_locations(),
                                                rows_per_segment);
        }
        rows_written += block.rows();
        block.clear_column_data();
    }

    // The loop also ends when the engine stops; report that as a failure.
    if (StorageEngine::instance()->stopped()) {
        return Status::Error<INTERNAL_ERROR>("tablet {} failed to do compaction, engine stopped",
                                             tablet->tablet_id());
    }

    if (is_key && stats_output != nullptr) {
        stats_output->output_rows = rows_written;
        stats_output->merged_rows = reader.merged_rows();
        stats_output->filtered_rows = reader.filtered_rows();
    }
    RETURN_IF_ERROR(dst_rowset_writer->flush_columns(is_key));

    return Status::OK();
}
290
291
// for segcompaction
292
Status Merger::vertical_compact_one_group(
293
        TabletSharedPtr tablet, ReaderType reader_type, TabletSchemaSPtr tablet_schema, bool is_key,
294
        const std::vector<uint32_t>& column_group, vectorized::RowSourcesBuffer* row_source_buf,
295
        vectorized::VerticalBlockReader& src_block_reader,
296
        segment_v2::SegmentWriter& dst_segment_writer, Statistics* stats_output,
297
22
        uint64_t* index_size, KeyBoundsPB& key_bounds, SimpleRowIdConversion* rowid_conversion) {
298
    // TODO: record_rowids
299
22
    vectorized::Block block = tablet_schema->create_block(column_group);
300
22
    size_t output_rows = 0;
301
22
    bool eof = false;
302
138
    while (!eof && !StorageEngine::instance()->stopped()) {
303
        // Read one block from block reader
304
116
        RETURN_NOT_OK_STATUS_WITH_WARN(src_block_reader.next_block_with_aggregation(&block, &eof),
305
116
                                       "failed to read next block when merging rowsets of tablet " +
306
116
                                               std::to_string(tablet->tablet_id()));
307
116
        if (!block.rows()) {
308
0
            break;
309
0
        }
310
116
        RETURN_NOT_OK_STATUS_WITH_WARN(dst_segment_writer.append_block(&block, 0, block.rows()),
311
116
                                       "failed to write block when merging rowsets of tablet " +
312
116
                                               std::to_string(tablet->tablet_id()));
313
314
116
        if (is_key && rowid_conversion != nullptr) {
315
30
            rowid_conversion->add(src_block_reader.current_block_row_locations());
316
30
        }
317
116
        output_rows += block.rows();
318
116
        block.clear_column_data();
319
116
    }
320
22
    if (StorageEngine::instance()->stopped()) {
321
0
        return Status::Error<INTERNAL_ERROR>("tablet {} failed to do compaction, engine stopped",
322
0
                                             tablet->tablet_id());
323
0
    }
324
325
22
    if (is_key && stats_output != nullptr) {
326
11
        stats_output->output_rows = output_rows;
327
11
        stats_output->merged_rows = src_block_reader.merged_rows();
328
11
        stats_output->filtered_rows = src_block_reader.filtered_rows();
329
11
    }
330
331
    // segcompaction produce only one segment at once
332
22
    RETURN_IF_ERROR(dst_segment_writer.finalize_columns_data());
333
22
    RETURN_IF_ERROR(dst_segment_writer.finalize_columns_index(index_size));
334
335
22
    if (is_key) {
336
11
        Slice min_key = dst_segment_writer.min_encoded_key();
337
11
        Slice max_key = dst_segment_writer.max_encoded_key();
338
11
        DCHECK_LE(min_key.compare(max_key), 0);
339
11
        key_bounds.set_min_key(min_key.to_string());
340
11
        key_bounds.set_max_key(max_key.to_string());
341
11
    }
342
343
22
    return Status::OK();
344
22
}
345
346
92
int64_t estimate_batch_size(int group_index, BaseTabletSPtr tablet, int64_t way_cnt) {
347
92
    std::unique_lock<std::mutex> lock(tablet->sample_info_lock);
348
92
    CompactionSampleInfo info = tablet->sample_infos[group_index];
349
92
    if (way_cnt <= 0) {
350
0
        LOG(INFO) << "estimate batch size for vertical compaction, tablet id: "
351
0
                  << tablet->tablet_id() << " way cnt: " << way_cnt;
352
0
        return 4096 - 32;
353
0
    }
354
92
    int64_t block_mem_limit = config::compaction_memory_bytes_limit / way_cnt;
355
92
    if (tablet->last_compaction_status.is<ErrorCode::MEM_LIMIT_EXCEEDED>()) {
356
0
        block_mem_limit /= 4;
357
0
    }
358
359
92
    int64_t group_data_size = 0;
360
92
    if (info.group_data_size > 0 && info.bytes > 0 && info.rows > 0) {
361
0
        float smoothing_factor = 0.5;
362
0
        group_data_size = int64_t(info.group_data_size * (1 - smoothing_factor) +
363
0
                                  info.bytes / info.rows * smoothing_factor);
364
0
        tablet->sample_infos[group_index].group_data_size = group_data_size;
365
92
    } else if (info.group_data_size > 0 && (info.bytes <= 0 || info.rows <= 0)) {
366
0
        group_data_size = info.group_data_size;
367
92
    } else if (info.group_data_size <= 0 && info.bytes > 0 && info.rows > 0) {
368
1
        group_data_size = info.bytes / info.rows;
369
1
        tablet->sample_infos[group_index].group_data_size = group_data_size;
370
91
    } else {
371
91
        LOG(INFO) << "estimate batch size for vertical compaction, tablet id: "
372
91
                  << tablet->tablet_id() << " group data size: " << info.group_data_size
373
91
                  << " row num: " << info.rows << " consume bytes: " << info.bytes;
374
91
        return 1024 - 32;
375
91
    }
376
377
1
    if (group_data_size <= 0) {
378
0
        LOG(WARNING) << "estimate batch size for vertical compaction, tablet id: "
379
0
                     << tablet->tablet_id() << " unexpected group data size: " << group_data_size;
380
0
        return 4096 - 32;
381
0
    }
382
383
1
    tablet->sample_infos[group_index].bytes = 0;
384
1
    tablet->sample_infos[group_index].rows = 0;
385
386
1
    int64_t batch_size = block_mem_limit / group_data_size;
387
1
    int64_t res = std::max(std::min(batch_size, int64_t(4096 - 32)), int64_t(32L));
388
1
    LOG(INFO) << "estimate batch size for vertical compaction, tablet id: " << tablet->tablet_id()
389
1
              << " group data size: " << info.group_data_size << " row num: " << info.rows
390
1
              << " consume bytes: " << info.bytes << " way cnt: " << way_cnt
391
1
              << " batch size: " << res;
392
1
    return res;
393
1
}
394
395
// steps to do vertical merge:
396
// 1. split columns into column groups
397
// 2. compact groups one by one, generate a row_source_buf when compact key group
398
// and use this row_source_buf to compact value column groups
399
// 3. build output rowset
400
Status Merger::vertical_merge_rowsets(TabletSharedPtr tablet, ReaderType reader_type,
401
                                      TabletSchemaSPtr tablet_schema,
402
                                      const std::vector<RowsetReaderSharedPtr>& src_rowset_readers,
403
                                      RowsetWriter* dst_rowset_writer, int64_t max_rows_per_segment,
404
48
                                      int64_t merge_way_num, Statistics* stats_output) {
405
48
    LOG(INFO) << "Start to do vertical compaction, tablet_id: " << tablet->tablet_id();
406
48
    std::vector<std::vector<uint32_t>> column_groups;
407
48
    vertical_split_columns(tablet_schema, &column_groups);
408
409
48
    std::vector<uint32_t> key_group_cluster_key_idxes;
410
48
    _generate_key_group_cluster_key_idxes(tablet_schema, column_groups,
411
48
                                          key_group_cluster_key_idxes);
412
413
48
    vectorized::RowSourcesBuffer row_sources_buf(tablet->tablet_id(), tablet->tablet_path(),
414
48
                                                 reader_type);
415
48
    {
416
48
        std::unique_lock<std::mutex> lock(tablet->sample_info_lock);
417
48
        tablet->sample_infos.resize(column_groups.size(), {0, 0, 0});
418
48
    }
419
    // compact group one by one
420
140
    for (auto i = 0; i < column_groups.size(); ++i) {
421
92
        VLOG_NOTICE << "row source size: " << row_sources_buf.total_size();
422
92
        bool is_key = (i == 0);
423
92
        int64_t batch_size = config::compaction_batch_size != -1
424
92
                                     ? config::compaction_batch_size
425
92
                                     : estimate_batch_size(i, tablet, merge_way_num);
426
92
        CompactionSampleInfo sample_info;
427
92
        Status st = vertical_compact_one_group(
428
92
                tablet, reader_type, tablet_schema, is_key, column_groups[i], &row_sources_buf,
429
92
                src_rowset_readers, dst_rowset_writer, max_rows_per_segment, stats_output,
430
92
                key_group_cluster_key_idxes, batch_size, &sample_info);
431
92
        {
432
92
            std::unique_lock<std::mutex> lock(tablet->sample_info_lock);
433
92
            tablet->sample_infos[i] = sample_info;
434
92
        }
435
92
        RETURN_IF_ERROR(st);
436
92
        if (is_key) {
437
48
            RETURN_IF_ERROR(row_sources_buf.flush());
438
48
        }
439
92
        RETURN_IF_ERROR(row_sources_buf.seek_to_begin());
440
92
    }
441
442
    // finish compact, build output rowset
443
48
    VLOG_NOTICE << "finish compact groups";
444
48
    RETURN_IF_ERROR(dst_rowset_writer->final_flush());
445
446
48
    return Status::OK();
447
48
}
448
449
void Merger::_generate_key_group_cluster_key_idxes(
450
        TabletSchemaSPtr tablet_schema, std::vector<std::vector<uint32_t>>& column_groups,
451
48
        std::vector<uint32_t>& key_group_cluster_key_idxes) {
452
48
    if (column_groups.size() > 0 && !tablet_schema->cluster_key_idxes().empty()) {
453
0
        auto& key_column_group = column_groups[0];
454
0
        for (const auto& index_in_tablet_schema : tablet_schema->cluster_key_idxes()) {
455
0
            for (auto j = 0; j < key_column_group.size(); ++j) {
456
0
                auto cid = key_column_group[j];
457
0
                if (cid == index_in_tablet_schema) {
458
0
                    key_group_cluster_key_idxes.emplace_back(j);
459
0
                    break;
460
0
                }
461
0
            }
462
0
        }
463
0
    }
464
48
}
465
466
} // namespace doris