be/src/storage/schema_change/schema_change.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "storage/schema_change/schema_change.h" |
19 | | |
20 | | #include <gen_cpp/olap_file.pb.h> |
21 | | #include <glog/logging.h> |
22 | | #include <thrift/protocol/TDebugProtocol.h> |
23 | | |
24 | | #include <algorithm> |
25 | | #include <exception> |
26 | | #include <map> |
27 | | #include <memory> |
28 | | #include <mutex> |
29 | | #include <roaring/roaring.hh> |
30 | | #include <tuple> |
31 | | #include <utility> |
32 | | |
33 | | #include "agent/be_exec_version_manager.h" |
34 | | #include "cloud/cloud_schema_change_job.h" |
35 | | #include "cloud/config.h" |
36 | | #include "common/cast_set.h" |
37 | | #include "common/consts.h" |
38 | | #include "common/logging.h" |
39 | | #include "common/signal_handler.h" |
40 | | #include "common/status.h" |
41 | | #include "core/assert_cast.h" |
42 | | #include "core/block/block.h" |
43 | | #include "core/block/column_with_type_and_name.h" |
44 | | #include "core/column/column.h" |
45 | | #include "core/column/column_nullable.h" |
46 | | #include "exec/common/variant_util.h" |
47 | | #include "exprs/aggregate/aggregate_function.h" |
48 | | #include "exprs/aggregate/aggregate_function_reader.h" |
49 | | #include "exprs/vexpr.h" |
50 | | #include "exprs/vexpr_context.h" |
51 | | #include "information_schema/schema_metadata_name_ids_scanner.h" |
52 | | #include "io/fs/file_system.h" |
53 | | #include "io/io_common.h" |
54 | | #include "runtime/exec_env.h" |
55 | | #include "runtime/memory/mem_tracker.h" |
56 | | #include "runtime/runtime_state.h" |
57 | | #include "storage/data_dir.h" |
58 | | #include "storage/delete/delete_handler.h" |
59 | | #include "storage/field.h" |
60 | | #include "storage/index/inverted/inverted_index_desc.h" |
61 | | #include "storage/index/inverted/inverted_index_writer.h" |
62 | | #include "storage/iterator/olap_data_convertor.h" |
63 | | #include "storage/iterators.h" |
64 | | #include "storage/merger.h" |
65 | | #include "storage/olap_common.h" |
66 | | #include "storage/olap_define.h" |
67 | | #include "storage/rowset/beta_rowset.h" |
68 | | #include "storage/rowset/pending_rowset_helper.h" |
69 | | #include "storage/rowset/rowset_meta.h" |
70 | | #include "storage/rowset/rowset_reader_context.h" |
71 | | #include "storage/rowset/rowset_writer_context.h" |
72 | | #include "storage/schema.h" |
73 | | #include "storage/segment/column_reader.h" |
74 | | #include "storage/segment/segment.h" |
75 | | #include "storage/segment/segment_loader.h" |
76 | | #include "storage/storage_engine.h" |
77 | | #include "storage/tablet/base_tablet.h" |
78 | | #include "storage/tablet/tablet.h" |
79 | | #include "storage/tablet/tablet_fwd.h" |
80 | | #include "storage/tablet/tablet_manager.h" |
81 | | #include "storage/tablet/tablet_meta.h" |
82 | | #include "storage/tablet/tablet_schema.h" |
83 | | #include "storage/types.h" |
84 | | #include "storage/utils.h" |
85 | | #include "util/debug_points.h" |
86 | | #include "util/defer_op.h" |
87 | | #include "util/trace.h" |
88 | | |
89 | | namespace doris { |
90 | | |
91 | | class CollectionValue; |
92 | | |
93 | | using namespace ErrorCode; |
94 | | |
// Max number of rows accumulated in an output block before it is flushed to the
// rowset writer during schema change (also the copy-batch size in merge()).
constexpr int ALTER_TABLE_BATCH_SIZE = 4064;
96 | | |
// Merges several blocks produced during schema change into one key-ordered
// output, applying the tablet's key-model semantics:
//   - AGG_KEYS:    value columns of equal-key rows are combined with each
//                  column's aggregate function;
//   - UNIQUE_KEYS: only the last row of every equal-key run is kept (and rows
//                  are re-sorted by cluster key when the table defines one);
//   - DUP_KEYS:    every row is kept.
class MultiBlockMerger {
public:
    MultiBlockMerger(BaseTabletSPtr tablet) : _tablet(tablet), _cmp(*tablet) {}

    // Sorts all rows of `blocks` by key columns and writes the merged result
    // through `rowset_writer`. `merged_rows` is increased by the number of
    // input rows eliminated by aggregation/deduplication (input minus output).
    Status merge(const std::vector<std::unique_ptr<Block>>& blocks, RowsetWriter* rowset_writer,
                 uint64_t* merged_rows) {
        int rows = 0;
        for (const auto& block : blocks) {
            rows += block->rows();
        }
        if (!rows) {
            return Status::OK();
        }

        // Build a flat list of (block, row) references over all input blocks.
        // NOTE(review): the loop index is uint16_t because RowRef stores a
        // 16-bit position; this assumes every input block has <= 65535 rows —
        // TODO confirm upstream batch sizing guarantees this.
        std::vector<RowRef> row_refs;
        row_refs.reserve(rows);
        for (const auto& block : blocks) {
            for (uint16_t i = 0; i < block->rows(); i++) {
                row_refs.emplace_back(block.get(), i);
            }
        }
        // TODO: try to use pdqsort to replace std::sort
        // The block version is incremental.
        // stable_sort keeps input order among equal keys, so "last row wins"
        // below corresponds to the newest version of a key.
        std::stable_sort(row_refs.begin(), row_refs.end(), _cmp);

        auto finalized_block = _tablet->tablet_schema()->create_block();
        int columns = finalized_block.columns();
        // Count all input rows first; output rows are subtracted when flushed.
        *merged_rows += rows;

        if (_tablet->keys_type() == KeysType::AGG_KEYS) {
            auto tablet_schema = _tablet->tablet_schema();
            int key_number = cast_set<int>(_tablet->num_key_columns());

            // One aggregate function + one manually allocated state per value column.
            std::vector<AggregateFunctionPtr> agg_functions;
            std::vector<AggregateDataPtr> agg_places;

            for (int i = key_number; i < columns; i++) {
                try {
                    AggregateFunctionPtr function = tablet_schema->column(i).get_aggregate_function(
                            AGG_LOAD_SUFFIX, tablet_schema->column(i).get_be_exec_version());
                    if (!function) {
                        return Status::InternalError(
                                "could not find aggregate function on column {}, aggregation={}",
                                tablet_schema->column(i).name(),
                                tablet_schema->column(i).aggregation());
                    }
                    agg_functions.push_back(function);
                    // create aggregate data
                    auto* place = new char[function->size_of_data()];
                    function->create(place);
                    agg_places.push_back(place);
                } catch (...) {
                    // Destroy only the states fully set up so far, then rethrow;
                    // the DEFER below is not yet active at this point.
                    for (int j = 0; j < i - key_number; ++j) {
                        agg_functions[j]->destroy(agg_places[j]);
                        delete[] agg_places[j];
                    }
                    throw;
                }
            }

            // Release every aggregate state on all exit paths from here on.
            DEFER({
                for (int i = 0; i < columns - key_number; i++) {
                    agg_functions[i]->destroy(agg_places[i]);
                    delete[] agg_places[i];
                }
            });

            for (int i = 0; i < rows; i++) {
                auto row_ref = row_refs[i];
                // Fold this row's value columns into the running aggregate states.
                for (int j = key_number; j < columns; j++) {
                    const auto* column_ptr = row_ref.get_column(j).get();
                    agg_functions[j - key_number]->add(agg_places[j - key_number],
                                                       const_cast<const IColumn**>(&column_ptr),
                                                       row_ref.position, _arena);
                }

                // Last row overall, or next row has a different key (compare()
                // returns nonzero): finalize the current key's aggregates.
                if (i == rows - 1 || _cmp.compare(row_refs[i], row_refs[i + 1])) {
                    for (int j = 0; j < key_number; j++) {
                        finalized_block.get_by_position(j).column->assume_mutable()->insert_from(
                                *row_ref.get_column(j), row_ref.position);
                    }

                    for (int j = key_number; j < columns; j++) {
                        agg_functions[j - key_number]->insert_result_into(
                                agg_places[j - key_number],
                                finalized_block.get_by_position(j).column->assume_mutable_ref());
                        agg_functions[j - key_number]->reset(agg_places[j - key_number]);
                    }

                    // Flush on the final row or when the batch is full.
                    if (i == rows - 1 || finalized_block.rows() == ALTER_TABLE_BATCH_SIZE) {
                        *merged_rows -= finalized_block.rows();
                        RETURN_IF_ERROR(rowset_writer->add_block(&finalized_block));
                        finalized_block.clear_column_data();
                    }
                }
            }
        } else {
            std::vector<RowRef> pushed_row_refs;
            if (_tablet->keys_type() == KeysType::DUP_KEYS) {
                // Duplicate keys: keep everything, no dedup needed.
                std::swap(pushed_row_refs, row_refs);
            } else if (_tablet->keys_type() == KeysType::UNIQUE_KEYS) {
                // Unique keys: keep only the last row of each equal-key run.
                for (int i = 0; i < rows; i++) {
                    if (i == rows - 1 || _cmp.compare(row_refs[i], row_refs[i + 1])) {
                        pushed_row_refs.push_back(row_refs[i]);
                    }
                }
                if (!_tablet->tablet_schema()->cluster_key_uids().empty()) {
                    // Map cluster-key unique ids to column indexes in the schema.
                    std::vector<uint32_t> ids;
                    for (const auto& cid : _tablet->tablet_schema()->cluster_key_uids()) {
                        auto index = _tablet->tablet_schema()->field_index(cid);
                        if (index == -1) {
                            return Status::InternalError(
                                    "could not find cluster key column with unique_id=" +
                                    std::to_string(cid) + " in tablet schema");
                        }
                        ids.push_back(index);
                    }
                    // sort by cluster key
                    std::stable_sort(pushed_row_refs.begin(), pushed_row_refs.end(),
                                     ClusterKeyRowRefComparator(ids));
                }
            }

            // update real inserted row number
            rows = cast_set<int>(pushed_row_refs.size());
            *merged_rows -= rows;

            // Copy surviving rows out in batches of ALTER_TABLE_BATCH_SIZE.
            for (int i = 0; i < rows; i += ALTER_TABLE_BATCH_SIZE) {
                int limit = std::min(ALTER_TABLE_BATCH_SIZE, rows - i);

                for (int idx = 0; idx < columns; idx++) {
                    auto column = finalized_block.get_by_position(idx).column->assume_mutable();

                    for (int j = 0; j < limit; j++) {
                        auto row_ref = pushed_row_refs[i + j];
                        column->insert_from(*row_ref.get_column(idx), row_ref.position);
                    }
                }
                RETURN_IF_ERROR(rowset_writer->add_block(&finalized_block));
                finalized_block.clear_column_data();
            }
        }

        RETURN_IF_ERROR(rowset_writer->flush());
        return Status::OK();
    }

private:
    // Non-owning reference to one row inside a source block.
    struct RowRef {
        RowRef(Block* block_, uint16_t position_) : block(block_), position(position_) {}
        ColumnPtr get_column(int index) const { return block->get_by_position(index).column; }
        const Block* block;
        uint16_t position;
    };

    // Orders rows by the tablet's key-column prefix.
    struct RowRefComparator {
        RowRefComparator(const BaseTablet& tablet) : _num_columns(tablet.num_key_columns()) {}

        int compare(const RowRef& lhs, const RowRef& rhs) const {
            // Notice: does not compare sequence column for mow table
            // read from rowsets with delete bitmap, so there should be no duplicated keys
            return lhs.block->compare_at(lhs.position, rhs.position, _num_columns, *rhs.block, -1);
        }

        bool operator()(const RowRef& lhs, const RowRef& rhs) const {
            return compare(lhs, rhs) < 0;
        }

        const size_t _num_columns;
    };

    // Orders rows by an explicit list of (cluster key) column indexes.
    struct ClusterKeyRowRefComparator {
        ClusterKeyRowRefComparator(std::vector<uint32_t> columns) : _columns(columns) {}

        int compare(const RowRef& lhs, const RowRef& rhs) const {
            return lhs.block->compare_at(lhs.position, rhs.position, &_columns, *rhs.block, -1);
        }

        bool operator()(const RowRef& lhs, const RowRef& rhs) const {
            return compare(lhs, rhs) < 0;
        }

        const std::vector<uint32_t> _columns;
    };

    BaseTabletSPtr _tablet;
    RowRefComparator _cmp;
    Arena _arena; // scratch memory for aggregate add() calls
};
286 | | |
287 | | BlockChanger::BlockChanger(TabletSchemaSPtr tablet_schema, DescriptorTbl desc_tbl, |
288 | | std::shared_ptr<RuntimeState> state) |
289 | 0 | : _desc_tbl(std::move(desc_tbl)), _state(std::move(state)) { |
290 | 0 | CHECK(_state != nullptr); |
291 | 0 | _schema_mapping.resize(tablet_schema->num_columns()); |
292 | 0 | } |
293 | | |
294 | 0 | BlockChanger::~BlockChanger() { |
295 | 0 | _schema_mapping.clear(); |
296 | 0 | } |
297 | | |
298 | 0 | ColumnMapping* BlockChanger::get_mutable_column_mapping(size_t column_index) { |
299 | 0 | if (column_index >= _schema_mapping.size()) { |
300 | 0 | return nullptr; |
301 | 0 | } |
302 | | |
303 | 0 | return &(_schema_mapping[column_index]); |
304 | 0 | } |
305 | | |
// Converts one batch from the base schema (`ref_block`) into the new schema
// (`new_block`): applies the optional WHERE filter, evaluates per-column
// conversion expressions, fills defaults for newly added columns, and swaps
// unchanged columns over by reference.
Status BlockChanger::change_block(Block* ref_block, Block* new_block) const {
    // for old version request compatibility
    _state->set_desc_tbl(&_desc_tbl);
    _state->set_be_exec_version(_fe_compatible_version);
    RowDescriptor row_desc =
            RowDescriptor(_desc_tbl.get_tuple_descriptor(_desc_tbl.get_row_tuples()[0]));

    // Optional row filter (e.g. "new MV where ..."): drop non-matching rows
    // from ref_block before any conversion.
    if (_where_expr != nullptr) {
        VExprContextSPtr ctx = nullptr;
        RETURN_IF_ERROR(VExpr::create_expr_tree(*_where_expr, ctx));
        RETURN_IF_ERROR(ctx->prepare(_state.get(), row_desc));
        RETURN_IF_ERROR(ctx->open(_state.get()));

        RETURN_IF_ERROR(VExprContext::filter_block(ctx.get(), ref_block));
    }

    const int row_num = cast_set<int>(ref_block->rows());
    const int new_schema_cols_num = new_block->columns();

    // will be used for swapping ref_block[entry.first] and new_block[entry.second]
    std::list<std::pair<int, int>> swap_idx_list;
    for (int idx = 0; idx < new_schema_cols_num; idx++) {
        auto expr = _schema_mapping[idx].expr;
        if (expr != nullptr) {
            // Column produced by an expression (cast / computed column):
            // evaluate it into a temporary column of ref_block.
            VExprContextSPtr ctx;
            RETURN_IF_ERROR(VExpr::create_expr_tree(*expr, ctx));
            RETURN_IF_ERROR(ctx->prepare(_state.get(), row_desc));
            RETURN_IF_ERROR(ctx->open(_state.get()));

            int result_tmp_column_idx = -1;
            RETURN_IF_ERROR(ctx->execute(ref_block, &result_tmp_column_idx));
            auto& result_tmp_column_def = ref_block->get_by_position(result_tmp_column_idx);
            if (!result_tmp_column_def.column) {
                return Status::Error<ErrorCode::INTERNAL_ERROR>(
                        "result column={} is nullptr, input expr={}", result_tmp_column_def.name,
                        apache::thrift::ThriftDebugString(*expr));
            }
            // Expand a const column so it has one value per row.
            ref_block->replace_by_position_if_const(result_tmp_column_idx);

            if (result_tmp_column_def.column->size() != row_num) {
                return Status::Error<ErrorCode::INTERNAL_ERROR>(
                        "result size invalid, expect={}, real={}; input expr={}, block={}", row_num,
                        result_tmp_column_def.column->size(),
                        apache::thrift::ThriftDebugString(*expr), ref_block->dump_structure());
            }

            // Type sanity check. Note: because string-ness mismatch implies
            // lhs != rhs, this effectively only rejects string vs non-string
            // mismatches between expected and produced types.
            auto lhs = _schema_mapping[idx].new_column->get_vec_type()->get_primitive_type();
            auto rhs = result_tmp_column_def.type->get_primitive_type();
            if (is_string_type(lhs) != is_string_type(rhs) && lhs != rhs) {
                return Status::Error<ErrorCode::INTERNAL_ERROR>(
                        "result type invalid, expect={}, real={}; input expr={}, block={}",
                        _schema_mapping[idx].new_column->get_vec_type()->get_name(),
                        result_tmp_column_def.type->get_name(),
                        apache::thrift::ThriftDebugString(*expr), ref_block->dump_structure());
            }

            if (_type == SCHEMA_CHANGE) {
                // danger casts (expected to be rejected by upstream caller) may cause data to be null and result in data loss in schema change
                // for rollup, this check is unnecessary, and ref columns are not set in this case, it works on exprs

                // column_idx in base schema
                int32_t ref_column_idx = _schema_mapping[idx].ref_column_idx;
                DCHECK_GE(ref_column_idx, 0);
                auto& ref_column_def = ref_block->get_by_position(ref_column_idx);
                RETURN_IF_ERROR(
                        _check_cast_valid(ref_column_def.column, result_tmp_column_def.column));
            }
            swap_idx_list.emplace_back(result_tmp_column_idx, idx);
        } else if (_schema_mapping[idx].ref_column_idx < 0) {
            // new column, write default value
            const auto& value = _schema_mapping[idx].default_value;
            auto column = new_block->get_by_position(idx).column->assume_mutable();
            if (value.is_null()) {
                DCHECK(column->is_nullable());
                column->insert_many_defaults(row_num);
            } else {
                column = column->convert_to_predicate_column_if_dictionary();
                column->insert_duplicate_fields(value, row_num);
            }
        } else {
            // same type, just swap column
            swap_idx_list.emplace_back(_schema_mapping[idx].ref_column_idx, idx);
        }
    }

    // Move every converted/unchanged column from ref_block into new_block,
    // reconciling nullability differences between source and target.
    for (auto it : swap_idx_list) {
        auto& ref_col = ref_block->get_by_position(it.first).column;
        auto& new_col = new_block->get_by_position(it.second).column;

        bool ref_col_nullable = ref_col->is_nullable();
        bool new_col_nullable = new_col->is_nullable();

        if (ref_col_nullable != new_col_nullable) {
            // not nullable to nullable
            if (new_col_nullable) {
                auto* new_nullable_col =
                        assert_cast<ColumnNullable*>(new_col->assume_mutable().get());

                // Wrap the source column and mark every row non-null
                // (resize_fill appends default/zero null-map entries).
                new_nullable_col->change_nested_column(ref_col);
                new_nullable_col->get_null_map_data().resize_fill(ref_col->size());
            } else {
                // nullable to not nullable:
                // suppose column `c_phone` is originally varchar(16) NOT NULL,
                // then do schema change `alter table test modify column c_phone int not null`,
                // the cast expr of schema change is `CastExpr(CAST String to Nullable(Int32))`,
                // so need to handle nullable to not nullable here
                auto* ref_nullable_col =
                        assert_cast<ColumnNullable*>(ref_col->assume_mutable().get());

                new_col = ref_nullable_col->get_nested_column_ptr();
            }
        } else {
            new_block->get_by_position(it.second).column =
                    ref_block->get_by_position(it.first).column;
        }
    }
    return Status::OK();
}
424 | | |
425 | | // This check can prevent schema-change from causing data loss after type cast |
426 | 0 | Status BlockChanger::_check_cast_valid(ColumnPtr input_column, ColumnPtr output_column) { |
427 | 0 | if (input_column->size() != output_column->size()) { |
428 | 0 | return Status::InternalError( |
429 | 0 | "column size is changed, input_column_size={}, output_column_size={}; " |
430 | 0 | "input_column={}", |
431 | 0 | input_column->size(), output_column->size(), input_column->get_name()); |
432 | 0 | } |
433 | 0 | DCHECK_EQ(input_column->size(), output_column->size()) |
434 | 0 | << "length check should have done before calling this function!"; |
435 | |
|
436 | 0 | if (input_column->is_nullable() != output_column->is_nullable()) { |
437 | 0 | if (input_column->is_nullable()) { |
438 | 0 | const auto* ref_null_map = check_and_get_column<ColumnNullable>(input_column.get()) |
439 | 0 | ->get_null_map_column() |
440 | 0 | .get_data() |
441 | 0 | .data(); |
442 | |
|
443 | 0 | bool is_changed = false; |
444 | 0 | for (size_t i = 0; i < input_column->size(); i++) { |
445 | 0 | is_changed |= ref_null_map[i]; |
446 | 0 | } |
447 | 0 | if (is_changed) { |
448 | 0 | return Status::DataQualityError( |
449 | 0 | "some null data is changed to not null, intput_column={}", |
450 | 0 | input_column->get_name()); |
451 | 0 | } |
452 | 0 | } else { |
453 | 0 | const auto& null_map_column = check_and_get_column<ColumnNullable>(output_column.get()) |
454 | 0 | ->get_null_map_column(); |
455 | 0 | const auto& nested_column = |
456 | 0 | check_and_get_column<ColumnNullable>(output_column.get())->get_nested_column(); |
457 | 0 | const auto* new_null_map = null_map_column.get_data().data(); |
458 | |
|
459 | 0 | if (null_map_column.size() != output_column->size()) { |
460 | 0 | return Status::InternalError( |
461 | 0 | "null_map_column size mismatch output_column_size, " |
462 | 0 | "null_map_column_size={}, output_column_size={}; input_column={}", |
463 | 0 | null_map_column.size(), output_column->size(), input_column->get_name()); |
464 | 0 | } |
465 | | |
466 | 0 | if (nested_column.size() != output_column->size()) { |
467 | 0 | return Status::InternalError( |
468 | 0 | "nested_column size is changed, nested_column_size={}, " |
469 | 0 | "ouput_column_size={}; input_column={}", |
470 | 0 | nested_column.size(), output_column->size(), input_column->get_name()); |
471 | 0 | } |
472 | | |
473 | 0 | bool is_changed = false; |
474 | 0 | for (size_t i = 0; i < input_column->size(); i++) { |
475 | 0 | is_changed |= new_null_map[i]; |
476 | 0 | } |
477 | 0 | if (is_changed) { |
478 | 0 | return Status::DataQualityError( |
479 | 0 | "some not null data is changed to null, intput_column={}", |
480 | 0 | input_column->get_name()); |
481 | 0 | } |
482 | 0 | } |
483 | 0 | } |
484 | | |
485 | 0 | if (input_column->is_nullable() && output_column->is_nullable()) { |
486 | 0 | const auto* ref_null_map = check_and_get_column<ColumnNullable>(input_column.get()) |
487 | 0 | ->get_null_map_column() |
488 | 0 | .get_data() |
489 | 0 | .data(); |
490 | 0 | const auto* new_null_map = check_and_get_column<ColumnNullable>(output_column.get()) |
491 | 0 | ->get_null_map_column() |
492 | 0 | .get_data() |
493 | 0 | .data(); |
494 | |
|
495 | 0 | bool is_changed = false; |
496 | 0 | for (size_t i = 0; i < input_column->size(); i++) { |
497 | 0 | is_changed |= (ref_null_map[i] != new_null_map[i]); |
498 | 0 | } |
499 | 0 | if (is_changed) { |
500 | 0 | return Status::DataQualityError( |
501 | 0 | "null map is changed after calculation, input_column={}", |
502 | 0 | input_column->get_name()); |
503 | 0 | } |
504 | 0 | } |
505 | 0 | return Status::OK(); |
506 | 0 | } |
507 | | |
// Converts one rowset by hard-linking its files into the new tablet instead of
// rewriting data; usable only when the schema change leaves stored data
// byte-compatible. For merge-on-write unique tables, also re-keys the source
// rowset's delete-bitmap entries to the new rowset id.
Status LinkedSchemaChange::process(RowsetReaderSharedPtr rowset_reader, RowsetWriter* rowset_writer,
                                   BaseTabletSPtr new_tablet, BaseTabletSPtr base_tablet,
                                   TabletSchemaSPtr base_tablet_schema,
                                   TabletSchemaSPtr new_tablet_schema) {
    Status status = rowset_writer->add_rowset_for_linked_schema_change(rowset_reader->rowset());
    if (!status) {
        LOG(WARNING) << "fail to convert rowset."
                     << ", new_tablet=" << new_tablet->tablet_id()
                     << ", version=" << rowset_writer->version().first << "-"
                     << rowset_writer->version().second << ", error status " << status;
        return status;
    }
    // copy delete bitmap to new tablet.
    if (new_tablet->keys_type() == UNIQUE_KEYS && new_tablet->enable_unique_key_merge_on_write()) {
        DeleteBitmap origin_delete_bitmap(base_tablet->tablet_id());
        // Extract all entries of the source rowset (every segment id / version).
        base_tablet->tablet_meta()->delete_bitmap().subset(
                {rowset_reader->rowset()->rowset_id(), 0, 0},
                {rowset_reader->rowset()->rowset_id(), UINT32_MAX, INT64_MAX},
                &origin_delete_bitmap);
        for (auto& iter : origin_delete_bitmap.delete_bitmap) {
            // Re-insert each bitmap under the new rowset id, preserving the
            // original segment id and version components of the key.
            int ret = new_tablet->tablet_meta()->delete_bitmap().set(
                    {rowset_writer->rowset_id(), std::get<1>(iter.first), std::get<2>(iter.first)},
                    iter.second);
            DCHECK(ret == 1);
        }
    }
    return Status::OK();
}
536 | | |
537 | | Status next_batch(RowsetReaderSharedPtr rowset_reader, Block* input_block, |
538 | 0 | std::vector<bool>& row_same_bit) { |
539 | 0 | Status st; |
540 | 0 | if (rowset_reader->is_merge_iterator()) { |
541 | 0 | row_same_bit.clear(); |
542 | 0 | BlockWithSameBit block_with_same_bit = {.block = input_block, .same_bit = row_same_bit}; |
543 | 0 | st = rowset_reader->next_batch(&block_with_same_bit); |
544 | | // todo: use row_same_bit to clean some useless row |
545 | 0 | } else { |
546 | 0 | st = rowset_reader->next_batch(input_block); |
547 | 0 | } |
548 | 0 | return st; |
549 | 0 | } |
550 | | |
// Direct (non-sorting) schema change: stream batches from the base rowset,
// convert each through the BlockChanger, and append straight to the output
// rowset writer. Valid when the conversion preserves key order.
Status VSchemaChangeDirectly::_inner_process(RowsetReaderSharedPtr rowset_reader,
                                             RowsetWriter* rowset_writer, BaseTabletSPtr new_tablet,
                                             TabletSchemaSPtr base_tablet_schema,
                                             TabletSchemaSPtr new_tablet_schema) {
    bool eof = false;
    do {
        auto new_block = Block::create_unique(new_tablet_schema->create_block());
        // create_block() skips dropped columns (from light-weight schema change).
        // Dropped columns are only needed for delete predicate evaluation, which
        // SegmentIterator handles internally — it creates temporary columns for
        // predicate columns not present in the block (via `i >= block->columns()`
        // guard in _init_current_block). If dropped columns were included here,
        // the block would have more columns than VMergeIterator's output_schema
        // expects, causing DCHECK failures in copy_rows.
        auto ref_block = Block::create_unique(base_tablet_schema->create_block());

        Status st = next_batch(rowset_reader, ref_block.get(), _row_same_bit);
        if (!st) {
            if (st.is<ErrorCode::END_OF_FILE>()) {
                // EOF may still deliver a final partial batch: process it and
                // then terminate; an empty EOF batch ends the loop immediately.
                if (ref_block->rows() == 0) {
                    break;
                } else {
                    eof = true;
                }
            } else {
                return st;
            }
        }

        RETURN_IF_ERROR(_changer.change_block(ref_block.get(), new_block.get()));
        RETURN_IF_ERROR(rowset_writer->add_block(new_block.get()));
    } while (!eof);

    RETURN_IF_ERROR(rowset_writer->flush());
    return Status::OK();
}
587 | | |
588 | | VBaseSchemaChangeWithSorting::VBaseSchemaChangeWithSorting(const BlockChanger& changer, |
589 | | size_t memory_limitation) |
590 | 0 | : _changer(changer), |
591 | 0 | _memory_limitation(memory_limitation), |
592 | 0 | _temp_delta_versions(Version::mock()) { |
593 | 0 | _mem_tracker = std::make_unique<MemTracker>( |
594 | 0 | fmt::format("VSchemaChangeWithSorting:changer={}", std::to_string(int64_t(&changer)))); |
595 | 0 | } |
596 | | |
// Sorting schema change: convert batches, buffer them until the memory budget
// is reached, internally sort each buffered batch group into a temporary
// rowset, then externally merge all temporary rowsets into the final output.
Status VBaseSchemaChangeWithSorting::_inner_process(RowsetReaderSharedPtr rowset_reader,
                                                    RowsetWriter* rowset_writer,
                                                    BaseTabletSPtr new_tablet,
                                                    TabletSchemaSPtr base_tablet_schema,
                                                    TabletSchemaSPtr new_tablet_schema) {
    // for internal sorting
    std::vector<std::unique_ptr<Block>> blocks;

    RowsetSharedPtr rowset = rowset_reader->rowset();
    SegmentsOverlapPB segments_overlap = rowset->rowset_meta()->segments_overlap();
    int64_t newest_write_timestamp = rowset->newest_write_timestamp();
    _temp_delta_versions.first = _temp_delta_versions.second;
    _src_rowsets.clear(); // init _src_rowsets
    // Flushes the currently buffered blocks into one sorted temporary rowset
    // and releases their memory from the tracker. No-op when nothing buffered.
    auto create_rowset = [&]() -> Status {
        if (blocks.empty()) {
            return Status::OK();
        }

        auto rowset = DORIS_TRY(_internal_sorting(
                blocks, Version(_temp_delta_versions.second, _temp_delta_versions.second + 1),
                newest_write_timestamp, new_tablet, BETA_ROWSET, segments_overlap,
                new_tablet_schema));
        _src_rowsets.push_back(std::move(rowset));
        for (auto& block : blocks) {
            _mem_tracker->release(block->allocated_bytes());
        }
        blocks.clear();

        // increase temp version
        // (each temp rowset occupies the version range [second, second + 1])
        _temp_delta_versions.second += 2;
        return Status::OK();
    };

    auto new_block = Block::create_unique(new_tablet_schema->create_block());

    bool eof = false;
    do {
        // create_block() skips dropped columns (from light-weight schema change).
        // Dropped columns are only needed for delete predicate evaluation, which
        // SegmentIterator handles internally — it creates temporary columns for
        // predicate columns not present in the block (via `i >= block->columns()`
        // guard in _init_current_block). If dropped columns were included here,
        // the block would have more columns than VMergeIterator's output_schema
        // expects, causing DCHECK failures in copy_rows.
        auto ref_block = Block::create_unique(base_tablet_schema->create_block());
        Status st = next_batch(rowset_reader, ref_block.get(), _row_same_bit);
        if (!st) {
            if (st.is<ErrorCode::END_OF_FILE>()) {
                // EOF may still carry a final partial batch; process it, then stop.
                if (ref_block->rows() == 0) {
                    break;
                } else {
                    eof = true;
                }
            } else {
                return st;
            }
        }

        RETURN_IF_ERROR(_changer.change_block(ref_block.get(), new_block.get()));

        constexpr double HOLD_BLOCK_MEMORY_RATE =
                0.66; // Reserve some memory for use by other parts of this job
        // Spill the buffer when the next block would exceed the hard limit,
        // when the soft (66%) threshold is crossed, or when forced by debug point.
        if (_mem_tracker->consumption() + new_block->allocated_bytes() > _memory_limitation ||
            cast_set<double>(_mem_tracker->consumption()) >
                    cast_set<double>(_memory_limitation) * HOLD_BLOCK_MEMORY_RATE ||
            DebugPoints::instance()->is_enable(
                    "VBaseSchemaChangeWithSorting._inner_process.create_rowset")) {
            RETURN_IF_ERROR(create_rowset());

            // If even a freshly emptied buffer cannot hold one block, the
            // configured limit is unusably small.
            if (_mem_tracker->consumption() + new_block->allocated_bytes() > _memory_limitation) {
                return Status::Error<INVALID_ARGUMENT>(
                        "Memory limitation is too small for Schema Change. _memory_limitation={}, "
                        "new_block->allocated_bytes()={}, consumption={}",
                        _memory_limitation, new_block->allocated_bytes(),
                        _mem_tracker->consumption());
            }
        }
        _mem_tracker->consume(new_block->allocated_bytes());

        // move unique ptr
        blocks.push_back(Block::create_unique(new_tablet_schema->create_block()));
        swap(blocks.back(), new_block);
    } while (!eof);

    // Flush whatever remains in the buffer.
    RETURN_IF_ERROR(create_rowset());

    if (_src_rowsets.empty()) {
        // No data at all: still flush so an empty output rowset is produced.
        RETURN_IF_ERROR(rowset_writer->flush());
    } else {
        RETURN_IF_ERROR(
                _external_sorting(_src_rowsets, rowset_writer, new_tablet, new_tablet_schema));
    }

    return Status::OK();
}
692 | | |
693 | | Result<RowsetSharedPtr> VBaseSchemaChangeWithSorting::_internal_sorting( |
694 | | const std::vector<std::unique_ptr<Block>>& blocks, const Version& version, |
695 | | int64_t newest_write_timestamp, BaseTabletSPtr new_tablet, RowsetTypePB new_rowset_type, |
696 | 0 | SegmentsOverlapPB segments_overlap, TabletSchemaSPtr new_tablet_schema) { |
697 | 0 | uint64_t merged_rows = 0; |
698 | 0 | MultiBlockMerger merger(new_tablet); |
699 | 0 | RowsetWriterContext context; |
700 | 0 | context.version = version; |
701 | 0 | context.rowset_state = VISIBLE; |
702 | 0 | context.segments_overlap = segments_overlap; |
703 | 0 | context.tablet_schema = new_tablet_schema; |
704 | 0 | context.newest_write_timestamp = newest_write_timestamp; |
705 | 0 | context.write_type = DataWriteType::TYPE_SCHEMA_CHANGE; |
706 | 0 | context.allow_packed_file = false; |
707 | 0 | std::unique_ptr<RowsetWriter> rowset_writer; |
708 | | // TODO(plat1ko): Use monad op |
709 | 0 | if (auto result = new_tablet->create_rowset_writer(context, false); !result.has_value()) |
710 | 0 | [[unlikely]] { |
711 | 0 | return unexpected(std::move(result).error()); |
712 | 0 | } else { |
713 | 0 | rowset_writer = std::move(result).value(); |
714 | 0 | } |
715 | 0 | RETURN_IF_ERROR_RESULT(merger.merge(blocks, rowset_writer.get(), &merged_rows)); |
716 | 0 | _add_merged_rows(merged_rows); |
717 | 0 | RowsetSharedPtr rowset; |
718 | 0 | RETURN_IF_ERROR_RESULT(rowset_writer->build(rowset)); |
719 | 0 | return rowset; |
720 | 0 | } |
721 | | |
722 | | Result<RowsetSharedPtr> VLocalSchemaChangeWithSorting::_internal_sorting( |
723 | | const std::vector<std::unique_ptr<Block>>& blocks, const Version& version, |
724 | | int64_t newest_write_timestamp, BaseTabletSPtr new_tablet, RowsetTypePB new_rowset_type, |
725 | 0 | SegmentsOverlapPB segments_overlap, TabletSchemaSPtr new_tablet_schema) { |
726 | 0 | uint64_t merged_rows = 0; |
727 | 0 | MultiBlockMerger merger(new_tablet); |
728 | 0 | RowsetWriterContext context; |
729 | 0 | context.version = version; |
730 | 0 | context.rowset_state = VISIBLE; |
731 | 0 | context.segments_overlap = segments_overlap; |
732 | 0 | context.tablet_schema = new_tablet_schema; |
733 | 0 | context.newest_write_timestamp = newest_write_timestamp; |
734 | 0 | context.write_type = DataWriteType::TYPE_SCHEMA_CHANGE; |
735 | 0 | context.allow_packed_file = false; |
736 | 0 | std::unique_ptr<RowsetWriter> rowset_writer; |
737 | | // TODO(plat1ko): Use monad op |
738 | 0 | if (auto result = new_tablet->create_rowset_writer(context, false); !result.has_value()) |
739 | 0 | [[unlikely]] { |
740 | 0 | return unexpected(std::move(result).error()); |
741 | 0 | } else { |
742 | 0 | rowset_writer = std::move(result).value(); |
743 | 0 | } |
744 | 0 | auto guard = _local_storage_engine.pending_local_rowsets().add(context.rowset_id); |
745 | 0 | _pending_rs_guards.push_back(std::move(guard)); |
746 | 0 | RETURN_IF_ERROR_RESULT(merger.merge(blocks, rowset_writer.get(), &merged_rows)); |
747 | 0 | _add_merged_rows(merged_rows); |
748 | 0 | RowsetSharedPtr rowset; |
749 | 0 | RETURN_IF_ERROR_RESULT(rowset_writer->build(rowset)); |
750 | 0 | return rowset; |
751 | 0 | } |
752 | | |
753 | | Status VBaseSchemaChangeWithSorting::_external_sorting(std::vector<RowsetSharedPtr>& src_rowsets, |
754 | | RowsetWriter* rowset_writer, |
755 | | BaseTabletSPtr new_tablet, |
756 | 0 | TabletSchemaSPtr new_tablet_schema) { |
757 | 0 | std::vector<RowsetReaderSharedPtr> rs_readers; |
758 | 0 | for (auto& rowset : src_rowsets) { |
759 | 0 | RowsetReaderSharedPtr rs_reader; |
760 | 0 | RETURN_IF_ERROR(rowset->create_reader(&rs_reader)); |
761 | 0 | rs_readers.push_back(rs_reader); |
762 | 0 | } |
763 | | |
764 | 0 | Merger::Statistics stats; |
765 | 0 | if (!new_tablet_schema->cluster_key_uids().empty()) { |
766 | | // schema change read rowsets with delete bitmap, so there should be no duplicated keys |
767 | | // RETURN_IF_ERROR(Compaction::update_delete_bitmap()); |
768 | 0 | int64_t way_num = 0; |
769 | 0 | int64_t input_rowsets_data_size = 0; |
770 | 0 | int64_t input_row_num = 0; |
771 | 0 | for (auto& rowset : src_rowsets) { |
772 | 0 | way_num += rowset->rowset_meta()->get_merge_way_num(); |
773 | 0 | input_rowsets_data_size += rowset->data_disk_size(); |
774 | 0 | input_row_num += rowset->num_rows(); |
775 | 0 | } |
776 | 0 | int64_t avg_segment_rows = config::vertical_compaction_max_segment_size / |
777 | 0 | (input_rowsets_data_size / (input_row_num + 1) + 1); |
778 | 0 | RETURN_IF_ERROR(Merger::vertical_merge_rowsets( |
779 | 0 | new_tablet, ReaderType::READER_ALTER_TABLE, *new_tablet_schema, rs_readers, |
780 | 0 | rowset_writer, cast_set<uint32_t>(avg_segment_rows), way_num, &stats)); |
781 | 0 | } else { |
782 | 0 | RETURN_IF_ERROR(Merger::vmerge_rowsets(new_tablet, ReaderType::READER_ALTER_TABLE, |
783 | 0 | *new_tablet_schema, rs_readers, rowset_writer, |
784 | 0 | &stats)); |
785 | 0 | } |
786 | 0 | _add_merged_rows(stats.merged_rows); |
787 | 0 | _add_filtered_rows(stats.filtered_rows); |
788 | 0 | return Status::OK(); |
789 | 0 | } |
790 | | |
791 | | Status VLocalSchemaChangeWithSorting::_inner_process(RowsetReaderSharedPtr rowset_reader, |
792 | | RowsetWriter* rowset_writer, |
793 | | BaseTabletSPtr new_tablet, |
794 | | TabletSchemaSPtr base_tablet_schema, |
795 | 0 | TabletSchemaSPtr new_tablet_schema) { |
796 | 0 | Defer defer {[&]() { |
797 | | // remove the intermediate rowsets generated by internal sorting |
798 | 0 | for (auto& row_set : _src_rowsets) { |
799 | 0 | _local_storage_engine.add_unused_rowset(row_set); |
800 | 0 | } |
801 | 0 | }}; |
802 | 0 | _pending_rs_guards.clear(); |
803 | 0 | return VBaseSchemaChangeWithSorting::_inner_process(rowset_reader, rowset_writer, new_tablet, |
804 | 0 | base_tablet_schema, new_tablet_schema); |
805 | 0 | } |
806 | | |
807 | 0 | Status SchemaChangeJob::process_alter_tablet(const TAlterTabletReqV2& request) { |
808 | 0 | if (!request.__isset.desc_tbl) { |
809 | 0 | return Status::Error<INVALID_ARGUMENT>( |
810 | 0 | "desc_tbl is not set. Maybe the FE version is not equal to the BE " |
811 | 0 | "version."); |
812 | 0 | } |
813 | 0 | if (_base_tablet == nullptr) { |
814 | 0 | return Status::Error<TABLE_NOT_FOUND>("fail to find base tablet. base_tablet={}", |
815 | 0 | request.base_tablet_id); |
816 | 0 | } |
817 | 0 | if (_new_tablet == nullptr) { |
818 | 0 | return Status::Error<TABLE_NOT_FOUND>("fail to find new tablet. new_tablet={}", |
819 | 0 | request.new_tablet_id); |
820 | 0 | } |
821 | | |
822 | 0 | LOG(INFO) << "begin to do request alter tablet: base_tablet_id=" << request.base_tablet_id |
823 | 0 | << ", new_tablet_id=" << request.new_tablet_id |
824 | 0 | << ", alter_version=" << request.alter_version; |
825 | | |
826 | | // Lock schema_change_lock util schema change info is stored in tablet header |
827 | 0 | static constexpr long TRY_LOCK_TIMEOUT = 30; |
828 | 0 | std::unique_lock schema_change_lock(_base_tablet->get_schema_change_lock(), std::defer_lock); |
829 | 0 | bool owns_lock = schema_change_lock.try_lock_for(std::chrono::seconds(TRY_LOCK_TIMEOUT)); |
830 | |
|
831 | 0 | if (!owns_lock) { |
832 | 0 | return Status::Error<TRY_LOCK_FAILED>( |
833 | 0 | "Failed to obtain schema change lock, there might be inverted index being " |
834 | 0 | "built or cooldown runnning on base_tablet={}", |
835 | 0 | request.base_tablet_id); |
836 | 0 | } |
837 | | |
838 | 0 | Status res = _do_process_alter_tablet(request); |
839 | 0 | LOG(INFO) << "finished alter tablet process, res=" << res; |
840 | 0 | DBUG_EXECUTE_IF("SchemaChangeJob::process_alter_tablet.leave.sleep", { sleep(5); }); |
841 | 0 | return res; |
842 | 0 | } |
843 | | |
844 | | SchemaChangeJob::SchemaChangeJob(StorageEngine& local_storage_engine, |
845 | | const TAlterTabletReqV2& request, const std::string& job_id) |
846 | 0 | : _local_storage_engine(local_storage_engine) { |
847 | 0 | _base_tablet = _local_storage_engine.tablet_manager()->get_tablet(request.base_tablet_id); |
848 | 0 | _new_tablet = _local_storage_engine.tablet_manager()->get_tablet(request.new_tablet_id); |
849 | 0 | if (_base_tablet && _new_tablet) { |
850 | 0 | _base_tablet_schema = std::make_shared<TabletSchema>(); |
851 | 0 | _base_tablet_schema->update_tablet_columns(*_base_tablet->tablet_schema(), request.columns); |
852 | | // The request only include column info, do not include bitmap or bloomfilter index info, |
853 | | // So we also need to copy index info from the real base tablet |
854 | 0 | _base_tablet_schema->update_index_info_from(*_base_tablet->tablet_schema()); |
855 | | // During a schema change, the extracted columns of a variant should not be included in the tablet schema. |
856 | | // This is because the schema change for a variant needs to ignore the extracted columns. |
857 | | // Otherwise, the schema types in different rowsets might be inconsistent. When performing a schema change, |
858 | | // the complete variant is constructed by reading all the sub-columns of the variant. |
859 | 0 | _new_tablet_schema = _new_tablet->tablet_schema()->copy_without_variant_extracted_columns(); |
860 | 0 | } |
861 | 0 | _job_id = job_id; |
862 | 0 | } |
863 | | |
// In the past, schema change and rollup created a new tablet and waited for txns that
// started before the task to finish. That wait cost a lot of time and made the task very
// difficult to understand.
// In alter task v2, FE calls BE to create the tablet and sends an alter task to BE to
// convert historical data. The admin should upgrade all BEs and then upgrade FE.
// The old code should be deleted after the upgrade is finished.
869 | 0 | Status SchemaChangeJob::_do_process_alter_tablet(const TAlterTabletReqV2& request) { |
870 | 0 | DBUG_EXECUTE_IF("SchemaChangeJob._do_process_alter_tablet.sleep", { sleep(10); }) |
871 | 0 | Status res; |
872 | 0 | signal::tablet_id = _base_tablet->get_table_id(); |
873 | | |
874 | | // check if tablet's state is not_ready, if it is ready, it means the tablet already finished |
875 | | // check whether the tablet's max continuous version == request.version |
876 | 0 | if (_new_tablet->tablet_state() != TABLET_NOTREADY) { |
877 | 0 | res = _validate_alter_result(request); |
878 | 0 | LOG(INFO) << "tablet's state=" << _new_tablet->tablet_state() |
879 | 0 | << " the convert job already finished, check its version" |
880 | 0 | << " res=" << res; |
881 | 0 | return res; |
882 | 0 | } |
883 | 0 | _new_tablet->set_alter_failed(false); |
884 | 0 | Defer defer([this] { |
885 | | // if tablet state is not TABLET_RUNNING when return, indicates that alter has failed. |
886 | 0 | if (_new_tablet->tablet_state() != TABLET_RUNNING) { |
887 | 0 | _new_tablet->set_alter_failed(true); |
888 | 0 | } |
889 | 0 | }); |
890 | |
|
891 | 0 | LOG(INFO) << "finish to validate alter tablet request. begin to convert data from base tablet " |
892 | 0 | "to new tablet" |
893 | 0 | << " base_tablet=" << _base_tablet->tablet_id() |
894 | 0 | << " new_tablet=" << _new_tablet->tablet_id(); |
895 | |
|
896 | 0 | std::shared_lock base_migration_rlock(_base_tablet->get_migration_lock(), std::try_to_lock); |
897 | 0 | if (!base_migration_rlock.owns_lock()) { |
898 | 0 | return Status::Error<TRY_LOCK_FAILED>( |
899 | 0 | "SchemaChangeJob::_do_process_alter_tablet get lock failed"); |
900 | 0 | } |
901 | 0 | std::shared_lock new_migration_rlock(_new_tablet->get_migration_lock(), std::try_to_lock); |
902 | 0 | if (!new_migration_rlock.owns_lock()) { |
903 | 0 | return Status::Error<TRY_LOCK_FAILED>( |
904 | 0 | "SchemaChangeJob::_do_process_alter_tablet get lock failed"); |
905 | 0 | } |
906 | | |
907 | 0 | std::vector<Version> versions_to_be_changed; |
908 | 0 | int64_t end_version = -1; |
909 | | // reader_context is stack variables, it's lifetime should keep the same |
910 | | // with rs_readers |
911 | 0 | RowsetReaderContext reader_context; |
912 | 0 | std::vector<RowSetSplits> rs_splits; |
913 | | // delete handlers for new tablet |
914 | 0 | DeleteHandler delete_handler; |
915 | 0 | std::vector<ColumnId> return_columns; |
916 | | |
917 | | // Use tablet schema directly from base tablet, they are the newest schema, not contain |
918 | | // dropped column during light weight schema change. |
919 | | // But the tablet schema in base tablet maybe not the latest from FE, so that if fe pass through |
920 | | // a tablet schema, then use request schema. |
921 | | // |
922 | | // return_columns does NOT include dropped columns. It is computed here BEFORE |
923 | | // merge_dropped_columns() appends dropped columns to _base_tablet_schema below. |
924 | | // This means return_columns only covers the original (non-dropped) columns. |
925 | | // |
926 | | // This is important because: |
927 | | // - BetaRowsetReader builds _output_schema from return_columns, which determines the |
928 | | // number of columns in ref_block (via create_block() which also skips dropped cols). |
929 | | // - VMergeIterator's copy_rows iterates over _output_schema columns, so ref_block |
930 | | // must match _output_schema exactly. |
931 | | // - Dropped columns are only needed for delete predicate evaluation, and SegmentIterator |
932 | | // handles them internally (creates temporary columns for predicate columns not present |
933 | | // in the block via `i >= block->columns()` guard in _init_current_block). |
934 | | // |
935 | | // Example: table has columns [k1, v1, v2], then DROP COLUMN v1, then |
936 | | // DELETE FROM t WHERE v1 = 'x' was issued before the drop. |
937 | | // - _base_tablet_schema after merge_dropped_columns: [k1, v2, v1(DROPPED)] |
938 | | // - return_columns (computed before merge): [0, 1] → [k1, v2] |
939 | | // - _output_schema / ref_block columns: [k1, v2] (2 columns) |
940 | | // - SegmentIterator reads v1 internally for delete predicate, but does not |
941 | | // output it to ref_block. copy_rows only iterates 2 columns — no OOB access. |
942 | 0 | size_t num_cols = |
943 | 0 | request.columns.empty() ? _base_tablet_schema->num_columns() : request.columns.size(); |
944 | 0 | return_columns.resize(num_cols); |
945 | 0 | for (int i = 0; i < num_cols; ++i) { |
946 | 0 | return_columns[i] = i; |
947 | 0 | } |
948 | 0 | std::vector<uint32_t> cluster_key_idxes; |
949 | |
|
950 | 0 | DBUG_EXECUTE_IF("SchemaChangeJob::_do_process_alter_tablet.block", DBUG_BLOCK); |
951 | | |
952 | | // begin to find deltas to convert from base tablet to new tablet so that |
953 | | // obtain base tablet and new tablet's push lock and header write lock to prevent loading data |
954 | 0 | { |
955 | 0 | std::lock_guard base_tablet_lock(_base_tablet->get_push_lock()); |
956 | 0 | std::lock_guard new_tablet_lock(_new_tablet->get_push_lock()); |
957 | 0 | std::lock_guard base_tablet_wlock(_base_tablet->get_header_lock()); |
958 | 0 | SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); |
959 | 0 | std::lock_guard<std::shared_mutex> new_tablet_wlock(_new_tablet->get_header_lock()); |
960 | |
|
961 | 0 | do { |
962 | 0 | RowsetSharedPtr max_rowset; |
963 | | // get history data to be converted and it will check if there is hold in base tablet |
964 | 0 | res = _get_versions_to_be_changed(&versions_to_be_changed, &max_rowset); |
965 | 0 | if (!res) { |
966 | 0 | LOG(WARNING) << "fail to get version to be changed. res=" << res; |
967 | 0 | break; |
968 | 0 | } |
969 | | |
970 | 0 | DBUG_EXECUTE_IF("SchemaChangeJob.process_alter_tablet.alter_fail", { |
971 | 0 | res = Status::InternalError( |
972 | 0 | "inject alter tablet failed. base_tablet={}, new_tablet={}", |
973 | 0 | request.base_tablet_id, request.new_tablet_id); |
974 | 0 | LOG(WARNING) << "inject error. res=" << res; |
975 | 0 | break; |
976 | 0 | }); |
977 | | |
978 | | // should check the max_version >= request.alter_version, if not the convert is useless |
979 | 0 | if (max_rowset == nullptr || max_rowset->end_version() < request.alter_version) { |
980 | 0 | res = Status::InternalError( |
981 | 0 | "base tablet's max version={} is less than request version={}", |
982 | 0 | (max_rowset == nullptr ? 0 : max_rowset->end_version()), |
983 | 0 | request.alter_version); |
984 | 0 | break; |
985 | 0 | } |
986 | | // before calculating version_to_be_changed, |
987 | | // remove all data from new tablet, prevent to rewrite data(those double pushed when wait) |
988 | 0 | LOG(INFO) << "begin to remove all data before end version from new tablet to prevent " |
989 | 0 | "rewrite." |
990 | 0 | << " new_tablet=" << _new_tablet->tablet_id() |
991 | 0 | << ", end_version=" << max_rowset->end_version(); |
992 | 0 | std::vector<RowsetSharedPtr> rowsets_to_delete; |
993 | 0 | std::vector<std::pair<Version, RowsetSharedPtr>> version_rowsets; |
994 | 0 | _new_tablet->acquire_version_and_rowsets(&version_rowsets); |
995 | 0 | std::sort(version_rowsets.begin(), version_rowsets.end(), |
996 | 0 | [](const std::pair<Version, RowsetSharedPtr>& l, |
997 | 0 | const std::pair<Version, RowsetSharedPtr>& r) { |
998 | 0 | return l.first.first < r.first.first; |
999 | 0 | }); |
1000 | 0 | for (auto& pair : version_rowsets) { |
1001 | 0 | if (pair.first.second <= max_rowset->end_version()) { |
1002 | 0 | rowsets_to_delete.push_back(pair.second); |
1003 | 0 | } else if (pair.first.first <= max_rowset->end_version()) { |
1004 | | // If max version is [X-10] and new tablet has version [7-9][10-12], |
1005 | | // we only can remove [7-9] from new tablet. If we add [X-10] to new tablet, it will has version |
1006 | | // cross: [X-10] [10-12]. |
1007 | | // So, we should return OLAP_ERR_VERSION_ALREADY_MERGED for fast fail. |
1008 | 0 | return Status::Error<VERSION_ALREADY_MERGED>( |
1009 | 0 | "New tablet has a version {} crossing base tablet's max_version={}", |
1010 | 0 | pair.first.to_string(), max_rowset->end_version()); |
1011 | 0 | } |
1012 | 0 | } |
1013 | 0 | std::vector<RowsetSharedPtr> empty_vec; |
1014 | 0 | RETURN_IF_ERROR(_new_tablet->delete_rowsets(rowsets_to_delete, false)); |
1015 | | // inherit cumulative_layer_point from base_tablet |
1016 | | // check if new_tablet.ce_point > base_tablet.ce_point? |
1017 | 0 | _new_tablet->set_cumulative_layer_point(-1); |
1018 | | // save tablet meta |
1019 | 0 | _new_tablet->save_meta(); |
1020 | 0 | for (auto& rowset : rowsets_to_delete) { |
1021 | | // do not call rowset.remove directly, using gc thread to delete it |
1022 | 0 | _local_storage_engine.add_unused_rowset(rowset); |
1023 | 0 | } |
1024 | | |
1025 | | // init one delete handler |
1026 | 0 | for (auto& version : versions_to_be_changed) { |
1027 | 0 | end_version = std::max(end_version, version.second); |
1028 | 0 | } |
1029 | | |
1030 | | // acquire data sources correspond to history versions |
1031 | 0 | RETURN_IF_ERROR( |
1032 | 0 | _base_tablet->capture_rs_readers_unlocked(versions_to_be_changed, &rs_splits)); |
1033 | 0 | if (rs_splits.empty()) { |
1034 | 0 | res = Status::Error<ALTER_DELTA_DOES_NOT_EXISTS>( |
1035 | 0 | "fail to acquire all data sources. version_num={}, data_source_num={}", |
1036 | 0 | versions_to_be_changed.size(), rs_splits.size()); |
1037 | 0 | break; |
1038 | 0 | } |
1039 | 0 | std::vector<RowsetMetaSharedPtr> del_preds; |
1040 | 0 | for (auto&& split : rs_splits) { |
1041 | 0 | const auto& rs_meta = split.rs_reader->rowset()->rowset_meta(); |
1042 | 0 | if (!rs_meta->has_delete_predicate() || rs_meta->start_version() > end_version) { |
1043 | 0 | continue; |
1044 | 0 | } |
1045 | 0 | _base_tablet_schema->merge_dropped_columns(*rs_meta->tablet_schema()); |
1046 | 0 | del_preds.push_back(rs_meta); |
1047 | 0 | } |
1048 | 0 | res = delete_handler.init(_base_tablet_schema, del_preds, end_version); |
1049 | 0 | if (!res) { |
1050 | 0 | LOG(WARNING) << "init delete handler failed. base_tablet=" |
1051 | 0 | << _base_tablet->tablet_id() << ", end_version=" << end_version; |
1052 | 0 | break; |
1053 | 0 | } |
1054 | | |
1055 | 0 | reader_context.reader_type = ReaderType::READER_ALTER_TABLE; |
1056 | 0 | reader_context.tablet_schema = _base_tablet_schema; |
1057 | 0 | reader_context.need_ordered_result = true; |
1058 | 0 | reader_context.delete_handler = &delete_handler; |
1059 | 0 | reader_context.return_columns = &return_columns; |
1060 | 0 | reader_context.sequence_id_idx = reader_context.tablet_schema->sequence_col_idx(); |
1061 | 0 | reader_context.is_unique = _base_tablet->keys_type() == UNIQUE_KEYS; |
1062 | 0 | reader_context.batch_size = ALTER_TABLE_BATCH_SIZE; |
1063 | 0 | reader_context.delete_bitmap = _base_tablet->tablet_meta()->delete_bitmap_ptr(); |
1064 | 0 | reader_context.version = Version(0, end_version); |
1065 | 0 | if (!_base_tablet_schema->cluster_key_uids().empty()) { |
1066 | 0 | for (const auto& uid : _base_tablet_schema->cluster_key_uids()) { |
1067 | 0 | cluster_key_idxes.emplace_back(_base_tablet_schema->field_index(uid)); |
1068 | 0 | } |
1069 | 0 | reader_context.read_orderby_key_columns = &cluster_key_idxes; |
1070 | 0 | reader_context.is_unique = false; |
1071 | 0 | reader_context.sequence_id_idx = -1; |
1072 | 0 | } |
1073 | 0 | for (auto& rs_split : rs_splits) { |
1074 | 0 | res = rs_split.rs_reader->init(&reader_context); |
1075 | 0 | if (!res) { |
1076 | 0 | LOG(WARNING) << "failed to init rowset reader: " << _base_tablet->tablet_id(); |
1077 | 0 | break; |
1078 | 0 | } |
1079 | 0 | } |
1080 | 0 | } while (false); |
1081 | 0 | } |
1082 | | |
1083 | 0 | do { |
1084 | 0 | if (!res) { |
1085 | 0 | break; |
1086 | 0 | } |
1087 | 0 | SchemaChangeParams sc_params; |
1088 | |
|
1089 | 0 | if (request.__isset.query_globals && request.__isset.query_options) { |
1090 | 0 | sc_params.runtime_state = |
1091 | 0 | std::make_shared<RuntimeState>(request.query_options, request.query_globals); |
1092 | 0 | } else { |
1093 | | // for old version request compatibility |
1094 | 0 | sc_params.runtime_state = std::make_shared<RuntimeState>(); |
1095 | 0 | } |
1096 | |
|
1097 | 0 | RETURN_IF_ERROR( |
1098 | 0 | DescriptorTbl::create(&sc_params.pool, request.desc_tbl, &sc_params.desc_tbl)); |
1099 | 0 | sc_params.ref_rowset_readers.reserve(rs_splits.size()); |
1100 | 0 | for (RowSetSplits& split : rs_splits) { |
1101 | 0 | sc_params.ref_rowset_readers.emplace_back(split.rs_reader); |
1102 | 0 | } |
1103 | 0 | sc_params.delete_handler = &delete_handler; |
1104 | 0 | sc_params.be_exec_version = request.be_exec_version; |
1105 | 0 | DCHECK(request.__isset.alter_tablet_type); |
1106 | 0 | switch (request.alter_tablet_type) { |
1107 | 0 | case TAlterTabletType::SCHEMA_CHANGE: |
1108 | 0 | sc_params.alter_tablet_type = AlterTabletType::SCHEMA_CHANGE; |
1109 | 0 | break; |
1110 | 0 | case TAlterTabletType::ROLLUP: |
1111 | 0 | sc_params.alter_tablet_type = AlterTabletType::ROLLUP; |
1112 | 0 | break; |
1113 | 0 | case TAlterTabletType::MIGRATION: |
1114 | 0 | sc_params.alter_tablet_type = AlterTabletType::MIGRATION; |
1115 | 0 | break; |
1116 | 0 | } |
1117 | 0 | if (request.__isset.materialized_view_params) { |
1118 | 0 | for (auto item : request.materialized_view_params) { |
1119 | 0 | AlterMaterializedViewParam mv_param; |
1120 | 0 | mv_param.column_name = item.column_name; |
1121 | |
|
1122 | 0 | if (item.__isset.mv_expr) { |
1123 | 0 | mv_param.expr = std::make_shared<TExpr>(item.mv_expr); |
1124 | 0 | } |
1125 | 0 | sc_params.materialized_params_map.insert( |
1126 | 0 | std::make_pair(to_lower(item.column_name), mv_param)); |
1127 | 0 | } |
1128 | 0 | } |
1129 | 0 | { |
1130 | 0 | std::lock_guard<std::shared_mutex> wrlock(_mutex); |
1131 | 0 | _tablet_ids_in_converting.insert(_new_tablet->tablet_id()); |
1132 | 0 | } |
1133 | 0 | int64_t real_alter_version = 0; |
1134 | 0 | sc_params.enable_unique_key_merge_on_write = |
1135 | 0 | _new_tablet->enable_unique_key_merge_on_write(); |
1136 | 0 | res = _convert_historical_rowsets(sc_params, &real_alter_version); |
1137 | 0 | { |
1138 | 0 | std::lock_guard<std::shared_mutex> wrlock(_mutex); |
1139 | 0 | _tablet_ids_in_converting.erase(_new_tablet->tablet_id()); |
1140 | 0 | } |
1141 | 0 | if (!res) { |
1142 | 0 | break; |
1143 | 0 | } |
1144 | | |
1145 | 0 | DCHECK_GE(real_alter_version, request.alter_version); |
1146 | |
|
1147 | 0 | if (_new_tablet->keys_type() == UNIQUE_KEYS && |
1148 | 0 | _new_tablet->enable_unique_key_merge_on_write()) { |
1149 | 0 | res = _calc_delete_bitmap_for_mow_table(real_alter_version); |
1150 | 0 | if (!res) { |
1151 | 0 | break; |
1152 | 0 | } |
1153 | 0 | } else { |
1154 | | // set state to ready |
1155 | 0 | std::lock_guard<std::shared_mutex> new_wlock(_new_tablet->get_header_lock()); |
1156 | 0 | SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); |
1157 | 0 | res = _new_tablet->set_tablet_state(TabletState::TABLET_RUNNING); |
1158 | 0 | if (!res) { |
1159 | 0 | break; |
1160 | 0 | } |
1161 | 0 | _new_tablet->save_meta(); |
1162 | 0 | } |
1163 | 0 | } while (false); |
1164 | | |
1165 | 0 | if (res) { |
1166 | | // _validate_alter_result should be outside the above while loop. |
1167 | | // to avoid requiring the header lock twice. |
1168 | 0 | res = _validate_alter_result(request); |
1169 | 0 | } |
1170 | | |
1171 | | // if failed convert history data, then just remove the new tablet |
1172 | 0 | if (!res) { |
1173 | 0 | LOG(WARNING) << "failed to alter tablet. base_tablet=" << _base_tablet->tablet_id() |
1174 | 0 | << ", drop new_tablet=" << _new_tablet->tablet_id(); |
1175 | | // do not drop the new tablet and its data. GC thread will |
1176 | 0 | } |
1177 | |
|
1178 | 0 | return res; |
1179 | 0 | } |
1180 | | |
1181 | 0 | bool SchemaChangeJob::tablet_in_converting(int64_t tablet_id) { |
1182 | 0 | std::shared_lock rdlock(_mutex); |
1183 | 0 | return _tablet_ids_in_converting.find(tablet_id) != _tablet_ids_in_converting.end(); |
1184 | 0 | } |
1185 | | |
1186 | | Status SchemaChangeJob::_get_versions_to_be_changed(std::vector<Version>* versions_to_be_changed, |
1187 | 0 | RowsetSharedPtr* max_rowset) { |
1188 | 0 | RowsetSharedPtr rowset = _base_tablet->get_rowset_with_max_version(); |
1189 | 0 | if (rowset == nullptr) { |
1190 | 0 | return Status::Error<ALTER_DELTA_DOES_NOT_EXISTS>("Tablet has no version. base_tablet={}", |
1191 | 0 | _base_tablet->tablet_id()); |
1192 | 0 | } |
1193 | 0 | *max_rowset = rowset; |
1194 | |
|
1195 | 0 | *versions_to_be_changed = DORIS_TRY(_base_tablet->capture_consistent_versions_unlocked( |
1196 | 0 | Version(0, rowset->version().second), {})); |
1197 | 0 | return Status::OK(); |
1198 | 0 | } |
1199 | | |
// Converts every historical rowset captured in `sc_params.ref_rowset_readers`
// into the new tablet's schema, registering each converted rowset on the new
// tablet as it completes.
// The `real_alter_version` parameter indicates that the version of [0-real_alter_version] is
// converted from a base tablet, only used for the mow table now.
// All exit paths (success and failure) funnel through `process_alter_exit`,
// which persists the new tablet's meta and, on success, verifies version
// integrity up to the max converted version.
Status SchemaChangeJob::_convert_historical_rowsets(const SchemaChangeParams& sc_params,
                                                    int64_t* real_alter_version) {
    LOG(INFO) << "begin to convert historical rowsets for new_tablet from base_tablet."
              << " base_tablet=" << _base_tablet->tablet_id()
              << ", new_tablet=" << _new_tablet->tablet_id() << ", job_id=" << _job_id;

    // find end version
    int64_t end_version = -1;
    for (const auto& ref_rowset_reader : sc_params.ref_rowset_readers) {
        if (ref_rowset_reader->version().second > end_version) {
            end_version = ref_rowset_reader->version().second;
        }
    }

    // Add filter information in change, and filter column information will be set in parse_request
    // And filter some data every time the row block changes
    BlockChanger changer(_new_tablet_schema, *sc_params.desc_tbl, sc_params.runtime_state);

    bool sc_sorting = false;
    bool sc_directly = false;

    // a.Parse the Alter request and convert it into an internal representation
    Status res = parse_request(sc_params, _base_tablet_schema.get(), _new_tablet_schema.get(),
                               &changer, &sc_sorting, &sc_directly);
    LOG(INFO) << "schema change type, sc_sorting: " << sc_sorting
              << ", sc_directly: " << sc_directly << ", base_tablet=" << _base_tablet->tablet_id()
              << ", new_tablet=" << _new_tablet->tablet_id();

    // Common exit path used by both error returns and the normal return below.
    // Captures `res`/`end_version` by reference, so it reflects the latest state.
    auto process_alter_exit = [&]() -> Status {
        {
            // save tablet meta here because rowset meta is not saved during add rowset
            std::lock_guard new_wlock(_new_tablet->get_header_lock());
            SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD);
            _new_tablet->save_meta();
        }
        if (res) {
            Version test_version(0, end_version);
            res = _new_tablet->check_version_integrity(test_version);
        }

        LOG(INFO) << "finish converting rowsets for new_tablet from base_tablet. "
                  << "base_tablet=" << _base_tablet->tablet_id()
                  << ", new_tablet=" << _new_tablet->tablet_id();
        return res;
    };

    if (!res) {
        LOG(WARNING) << "failed to parse the request. res=" << res;
        return process_alter_exit();
    }

    // Linked (no-op) schema change cannot produce a materialized view rollup.
    if (!sc_sorting && !sc_directly && sc_params.alter_tablet_type == AlterTabletType::ROLLUP) {
        res = Status::Error<SCHEMA_SCHEMA_INVALID>(
                "Don't support to add materialized view by linked schema change");
        return process_alter_exit();
    }

    // b. Generate historical data converter
    auto sc_procedure = _get_sc_procedure(
            changer, sc_sorting, sc_directly,
            _local_storage_engine.memory_limitation_bytes_per_thread_for_schema_change());

    DBUG_EXECUTE_IF("SchemaChangeJob::_convert_historical_rowsets.block", DBUG_BLOCK);

    // c.Convert historical data
    // Once a version collision (PUSH_VERSION_ALREADY_EXIST) is tolerated,
    // real_alter_version stops advancing — see the !have_failure_rowset check below.
    bool have_failure_rowset = false;
    for (const auto& rs_reader : sc_params.ref_rowset_readers) {
        // set status for monitor
        // As long as there is a new_table as running, ref table is set as running
        // NOTE If the first sub_table fails first, it will continue to go as normal here
        // When tablet create new rowset writer, it may change rowset type, in this case
        // linked schema change will not be used.
        RowsetWriterContext context;
        context.version = rs_reader->version();
        context.rowset_state = VISIBLE;
        context.segments_overlap = rs_reader->rowset()->rowset_meta()->segments_overlap();
        context.tablet_schema = _new_tablet_schema;
        context.newest_write_timestamp = rs_reader->newest_write_timestamp();
        context.allow_packed_file = false;

        // Source rowset may live on remote storage; propagate its storage
        // resource so the writer can resolve paths.
        if (!rs_reader->rowset()->is_local()) {
            auto maybe_resource = rs_reader->rowset()->rowset_meta()->remote_storage_resource();
            if (!maybe_resource) {
                return maybe_resource.error();
            }
            context.storage_resource = *maybe_resource.value();
        }

        context.write_type = DataWriteType::TYPE_SCHEMA_CHANGE;
        // TODO if support VerticalSegmentWriter, also need to handle cluster key primary key index
        bool vertical = false;
        if (sc_sorting && !_new_tablet->tablet_schema()->cluster_key_uids().empty()) {
            // see VBaseSchemaChangeWithSorting::_external_sorting
            vertical = true;
        }
        auto result = _new_tablet->create_rowset_writer(context, vertical);
        if (!result.has_value()) {
            res = Status::Error<ROWSET_BUILDER_INIT>("create_rowset_writer failed, reason={}",
                                                     result.error().to_string());
            return process_alter_exit();
        }
        auto rowset_writer = std::move(result).value();
        // NOTE(review): presumably the guard keeps the in-flight rowset id
        // registered so GC won't remove its files mid-conversion — confirm
        // against StorageEngine::add_pending_rowset.
        auto pending_rs_guard = _local_storage_engine.add_pending_rowset(context);

        if (res = sc_procedure->process(rs_reader, rowset_writer.get(), _new_tablet, _base_tablet,
                                        _base_tablet_schema, _new_tablet_schema);
            !res) {
            LOG(WARNING) << "failed to process the version."
                         << " version=" << rs_reader->version().first << "-"
                         << rs_reader->version().second << ", " << res.to_string();
            return process_alter_exit();
        }
        // Add the new version of the data to the header
        // In order to prevent the occurrence of deadlock, we must first lock the old table, and then lock the new table
        std::lock_guard lock(_new_tablet->get_push_lock());
        RowsetSharedPtr new_rowset;
        if (!(res = rowset_writer->build(new_rowset)).ok()) {
            LOG(WARNING) << "failed to build rowset, exit alter process";
            return process_alter_exit();
        }
        res = _new_tablet->add_rowset(new_rowset);
        if (res.is<PUSH_VERSION_ALREADY_EXIST>()) {
            // A concurrent load already produced this version on the new tablet;
            // discard our converted rowset and keep going.
            LOG(WARNING) << "version already exist, version revert occurred. "
                         << "tablet=" << _new_tablet->tablet_id() << ", version='"
                         << rs_reader->version().first << "-" << rs_reader->version().second;
            _local_storage_engine.add_unused_rowset(new_rowset);
            have_failure_rowset = true;
            res = Status::OK();
        } else if (!res) {
            LOG(WARNING) << "failed to register new version. "
                         << " tablet=" << _new_tablet->tablet_id()
                         << ", version=" << rs_reader->version().first << "-"
                         << rs_reader->version().second;
            _local_storage_engine.add_unused_rowset(new_rowset);
            return process_alter_exit();
        } else {
            VLOG_NOTICE << "register new version. tablet=" << _new_tablet->tablet_id()
                        << ", version=" << rs_reader->version().first << "-"
                        << rs_reader->version().second;
        }
        if (!have_failure_rowset) {
            *real_alter_version = rs_reader->version().second;
        }

        VLOG_TRACE << "succeed to convert a history version."
                   << " version=" << rs_reader->version().first << "-"
                   << rs_reader->version().second;
    }

    // XXX:The SchemaChange state should not be canceled at this time, because the new Delta has to be converted to the old and new Schema version
    return process_alter_exit();
}
1354 | | |
// Lower-cased name of the hidden "__DORIS_WHERE_SIGN__" column; used below as the
// key into materialized_params_map to pick up an optional WHERE predicate that
// filters rows during a materialized-view schema change.
static const std::string WHERE_SIGN_LOWER = to_lower("__DORIS_WHERE_SIGN__");
1356 | | |
1357 | | // @static |
1358 | | // Analyze the mapping of the column and the mapping of the filter key |
1359 | | Status SchemaChangeJob::parse_request(const SchemaChangeParams& sc_params, |
1360 | | TabletSchema* base_tablet_schema, |
1361 | | TabletSchema* new_tablet_schema, BlockChanger* changer, |
1362 | 0 | bool* sc_sorting, bool* sc_directly) { |
1363 | 0 | changer->set_type(sc_params.alter_tablet_type); |
1364 | 0 | changer->set_compatible_version(sc_params.be_exec_version); |
1365 | |
|
1366 | 0 | const std::unordered_map<std::string, AlterMaterializedViewParam>& materialized_function_map = |
1367 | 0 | sc_params.materialized_params_map; |
1368 | 0 | DescriptorTbl desc_tbl = *sc_params.desc_tbl; |
1369 | | |
1370 | | // set column mapping |
1371 | 0 | for (size_t i = 0, new_schema_size = new_tablet_schema->num_columns(); i < new_schema_size; |
1372 | 0 | ++i) { |
1373 | 0 | const TabletColumn& new_column = new_tablet_schema->column(i); |
1374 | 0 | const std::string& column_name_lower = to_lower(new_column.name()); |
1375 | 0 | ColumnMapping* column_mapping = changer->get_mutable_column_mapping(i); |
1376 | 0 | column_mapping->new_column = &new_column; |
1377 | |
|
1378 | 0 | column_mapping->ref_column_idx = base_tablet_schema->field_index(new_column.name()); |
1379 | |
|
1380 | 0 | if (materialized_function_map.find(column_name_lower) != materialized_function_map.end()) { |
1381 | 0 | auto mv_param = materialized_function_map.find(column_name_lower)->second; |
1382 | 0 | column_mapping->expr = mv_param.expr; |
1383 | 0 | if (column_mapping->expr != nullptr) { |
1384 | 0 | continue; |
1385 | 0 | } |
1386 | 0 | } |
1387 | | |
1388 | 0 | if (column_mapping->ref_column_idx >= 0) { |
1389 | 0 | continue; |
1390 | 0 | } |
1391 | | |
1392 | 0 | if (sc_params.alter_tablet_type == ROLLUP) { |
1393 | 0 | std::string materialized_function_map_str; |
1394 | 0 | for (auto str : materialized_function_map) { |
1395 | 0 | if (!materialized_function_map_str.empty()) { |
1396 | 0 | materialized_function_map_str += ','; |
1397 | 0 | } |
1398 | 0 | materialized_function_map_str += str.first; |
1399 | 0 | } |
1400 | 0 | return Status::InternalError( |
1401 | 0 | "referenced column was missing. [column={},materialized_function_map={}]", |
1402 | 0 | new_column.name(), materialized_function_map_str); |
1403 | 0 | } |
1404 | | |
1405 | 0 | if (new_column.name().find("__doris_shadow_") == 0) { |
1406 | | // Should delete in the future, just a protection for bug. |
1407 | 0 | LOG(INFO) << "a shadow column is encountered " << new_column.name(); |
1408 | 0 | return Status::InternalError("failed due to operate on shadow column"); |
1409 | 0 | } |
1410 | | // Newly added column go here |
1411 | 0 | column_mapping->ref_column_idx = -1; |
1412 | |
|
1413 | 0 | if (i < base_tablet_schema->num_short_key_columns()) { |
1414 | 0 | *sc_directly = true; |
1415 | 0 | } |
1416 | 0 | RETURN_IF_ERROR( |
1417 | 0 | _init_column_mapping(column_mapping, new_column, new_column.default_value())); |
1418 | | |
1419 | 0 | LOG(INFO) << "A column with default value will be added after schema changing. " |
1420 | 0 | << "column=" << new_column.name() |
1421 | 0 | << ", default_value=" << new_column.default_value(); |
1422 | 0 | } |
1423 | | |
1424 | 0 | if (materialized_function_map.contains(WHERE_SIGN_LOWER)) { |
1425 | 0 | changer->set_where_expr(materialized_function_map.find(WHERE_SIGN_LOWER)->second.expr); |
1426 | 0 | } |
1427 | | |
1428 | | // If the reference sequence of the Key column is out of order, it needs to be reordered |
1429 | 0 | int num_default_value = 0; |
1430 | |
|
1431 | 0 | for (int i = 0, new_schema_size = cast_set<int>(new_tablet_schema->num_key_columns()); |
1432 | 0 | i < new_schema_size; ++i) { |
1433 | 0 | ColumnMapping* column_mapping = changer->get_mutable_column_mapping(i); |
1434 | |
|
1435 | 0 | if (!column_mapping->has_reference()) { |
1436 | 0 | num_default_value++; |
1437 | 0 | continue; |
1438 | 0 | } |
1439 | | |
1440 | 0 | if (column_mapping->ref_column_idx != i - num_default_value) { |
1441 | 0 | *sc_sorting = true; |
1442 | 0 | return Status::OK(); |
1443 | 0 | } |
1444 | 0 | } |
1445 | | |
1446 | 0 | if (base_tablet_schema->keys_type() != new_tablet_schema->keys_type()) { |
1447 | | // only when base table is dup and mv is agg |
1448 | | // the rollup job must be reagg. |
1449 | 0 | *sc_sorting = true; |
1450 | 0 | return Status::OK(); |
1451 | 0 | } |
1452 | | |
1453 | | // If the sort of key has not been changed but the new keys num is less then base's, |
1454 | | // the new table should be re agg. |
1455 | | // So we also need to set sc_sorting = true. |
1456 | | // A, B, C are keys(sort keys), D is value |
1457 | | // followings need resort: |
1458 | | // old keys: A B C D |
1459 | | // new keys: A B |
1460 | 0 | if (new_tablet_schema->keys_type() != KeysType::DUP_KEYS && |
1461 | 0 | new_tablet_schema->num_key_columns() < base_tablet_schema->num_key_columns()) { |
1462 | | // this is a table with aggregate key type, and num of key columns in new schema |
1463 | | // is less, which means the data in new tablet should be more aggregated. |
1464 | | // so we use sorting schema change to sort and merge the data. |
1465 | 0 | *sc_sorting = true; |
1466 | 0 | return Status::OK(); |
1467 | 0 | } |
1468 | | |
1469 | 0 | if (sc_params.alter_tablet_type == ROLLUP) { |
1470 | 0 | *sc_directly = true; |
1471 | 0 | return Status::OK(); |
1472 | 0 | } |
1473 | | |
1474 | 0 | if (sc_params.enable_unique_key_merge_on_write && |
1475 | 0 | new_tablet_schema->num_key_columns() > base_tablet_schema->num_key_columns()) { |
1476 | 0 | *sc_directly = true; |
1477 | 0 | return Status::OK(); |
1478 | 0 | } |
1479 | | |
1480 | 0 | if (base_tablet_schema->num_short_key_columns() != new_tablet_schema->num_short_key_columns()) { |
1481 | | // the number of short_keys changed, can't do linked schema change |
1482 | 0 | *sc_directly = true; |
1483 | 0 | return Status::OK(); |
1484 | 0 | } |
1485 | | |
1486 | 0 | if (!sc_params.delete_handler->empty()) { |
1487 | | // there exists delete condition in header, can't do linked schema change |
1488 | 0 | *sc_directly = true; |
1489 | 0 | return Status::OK(); |
1490 | 0 | } |
1491 | | |
1492 | | // if new tablet enable row store, or new tablet has different row store columns |
1493 | 0 | if ((!base_tablet_schema->exist_column(BeConsts::ROW_STORE_COL) && |
1494 | 0 | new_tablet_schema->exist_column(BeConsts::ROW_STORE_COL)) || |
1495 | 0 | !std::equal(new_tablet_schema->row_columns_uids().begin(), |
1496 | 0 | new_tablet_schema->row_columns_uids().end(), |
1497 | 0 | base_tablet_schema->row_columns_uids().begin(), |
1498 | 0 | base_tablet_schema->row_columns_uids().end())) { |
1499 | 0 | *sc_directly = true; |
1500 | 0 | } |
1501 | |
|
1502 | 0 | for (size_t i = 0; i < new_tablet_schema->num_columns(); ++i) { |
1503 | 0 | ColumnMapping* column_mapping = changer->get_mutable_column_mapping(i); |
1504 | 0 | if (column_mapping->expr != nullptr) { |
1505 | 0 | *sc_directly = true; |
1506 | 0 | return Status::OK(); |
1507 | 0 | } else if (column_mapping->ref_column_idx >= 0) { |
1508 | | // index changed |
1509 | 0 | if (variant_util::has_schema_index_diff(new_tablet_schema, base_tablet_schema, |
1510 | 0 | cast_set<int32_t>(i), |
1511 | 0 | column_mapping->ref_column_idx)) { |
1512 | 0 | *sc_directly = true; |
1513 | 0 | return Status::OK(); |
1514 | 0 | } |
1515 | 0 | } |
1516 | 0 | } |
1517 | | |
1518 | | // if rs_reader has remote files, link schema change is not supported, |
1519 | | // use directly schema change instead. |
1520 | 0 | if (!(*sc_directly) && !(*sc_sorting)) { |
1521 | | // check has remote rowset |
1522 | | // work for cloud and cold storage |
1523 | 0 | for (const auto& rs_reader : sc_params.ref_rowset_readers) { |
1524 | 0 | if (!rs_reader->rowset()->is_local()) { |
1525 | 0 | *sc_directly = true; |
1526 | 0 | break; |
1527 | 0 | } |
1528 | 0 | } |
1529 | 0 | } |
1530 | |
|
1531 | 0 | return Status::OK(); |
1532 | 0 | } |
1533 | | |
1534 | | Status SchemaChangeJob::_init_column_mapping(ColumnMapping* column_mapping, |
1535 | | const TabletColumn& column_schema, |
1536 | 0 | const std::string& value) { |
1537 | 0 | auto t = StorageFieldFactory::create(column_schema); |
1538 | 0 | Defer defer([t]() { delete t; }); |
1539 | 0 | if (t == nullptr) { |
1540 | 0 | return Status::Uninitialized("Unsupport field creation of {}", column_schema.name()); |
1541 | 0 | } |
1542 | | |
1543 | 0 | if (!column_schema.is_nullable() || value.length() != 0) { |
1544 | 0 | RETURN_IF_ERROR(column_schema.get_vec_type()->get_serde()->from_fe_string( |
1545 | 0 | value, column_mapping->default_value)); |
1546 | 0 | } |
1547 | | |
1548 | 0 | return Status::OK(); |
1549 | 0 | } |
1550 | | |
1551 | 0 | Status SchemaChangeJob::_validate_alter_result(const TAlterTabletReqV2& request) { |
1552 | 0 | Version max_continuous_version = {-1, 0}; |
1553 | 0 | _new_tablet->max_continuous_version_from_beginning(&max_continuous_version); |
1554 | 0 | LOG(INFO) << "find max continuous version of tablet=" << _new_tablet->tablet_id() |
1555 | 0 | << ", start_version=" << max_continuous_version.first |
1556 | 0 | << ", end_version=" << max_continuous_version.second; |
1557 | 0 | if (max_continuous_version.second < request.alter_version) { |
1558 | 0 | return Status::InternalError("result version={} is less than request version={}", |
1559 | 0 | max_continuous_version.second, request.alter_version); |
1560 | 0 | } |
1561 | | |
1562 | 0 | std::vector<std::pair<Version, RowsetSharedPtr>> version_rowsets; |
1563 | 0 | { |
1564 | 0 | std::shared_lock rdlock(_new_tablet->get_header_lock()); |
1565 | 0 | _new_tablet->acquire_version_and_rowsets(&version_rowsets); |
1566 | 0 | } |
1567 | 0 | for (auto& pair : version_rowsets) { |
1568 | 0 | RowsetSharedPtr rowset = pair.second; |
1569 | 0 | if (!rowset->check_file_exist()) { |
1570 | 0 | return Status::Error<NOT_FOUND>( |
1571 | 0 | "SchemaChangeJob::_validate_alter_result meet invalid rowset"); |
1572 | 0 | } |
1573 | 0 | } |
1574 | 0 | return Status::OK(); |
1575 | 0 | } |
1576 | | |
1577 | | // For unique with merge-on-write table, should process delete bitmap here. |
1578 | | // 1. During double write, the newly imported rowsets does not calculate |
1579 | | // delete bitmap and publish successfully. |
1580 | | // 2. After conversion, calculate delete bitmap for the rowsets imported |
1581 | | // during double write. During this period, new data can still be imported |
1582 | | // witout calculating delete bitmap and publish successfully. |
1583 | | // 3. Block the new publish, calculate the delete bitmap of the |
1584 | | // incremental rowsets. |
1585 | | // 4. Switch the tablet status to TABLET_RUNNING. The newly imported |
1586 | | // data will calculate delete bitmap. |
1587 | 0 | Status SchemaChangeJob::_calc_delete_bitmap_for_mow_table(int64_t alter_version) { |
1588 | 0 | DBUG_EXECUTE_IF("SchemaChangeJob._calc_delete_bitmap_for_mow_table.random_failed", { |
1589 | 0 | if (rand() % 100 < (100 * dp->param("percent", 0.1))) { |
1590 | 0 | LOG_WARNING("SchemaChangeJob._calc_delete_bitmap_for_mow_table.random_failed"); |
1591 | 0 | return Status::InternalError("debug schema change calc delete bitmap random failed"); |
1592 | 0 | } |
1593 | 0 | }); |
1594 | | |
1595 | | // can't do compaction when calc delete bitmap, if the rowset being calculated does |
1596 | | // a compaction, it may cause the delete bitmap to be missed. |
1597 | 0 | std::lock_guard base_compaction_lock(_new_tablet->get_base_compaction_lock()); |
1598 | 0 | std::lock_guard cumu_compaction_lock(_new_tablet->get_cumulative_compaction_lock()); |
1599 | | |
1600 | | // step 2 |
1601 | 0 | int64_t max_version = _new_tablet->max_version().second; |
1602 | 0 | std::vector<RowsetSharedPtr> rowsets; |
1603 | 0 | if (alter_version < max_version) { |
1604 | 0 | LOG(INFO) << "alter table for unique with merge-on-write, calculate delete bitmap of " |
1605 | 0 | << "double write rowsets for version: " << alter_version + 1 << "-" << max_version |
1606 | 0 | << " new_tablet=" << _new_tablet->tablet_id(); |
1607 | 0 | std::shared_lock rlock(_new_tablet->get_header_lock()); |
1608 | 0 | auto ret = DORIS_TRY(_new_tablet->capture_consistent_rowsets_unlocked( |
1609 | 0 | {alter_version + 1, max_version}, CaptureRowsetOps {})); |
1610 | 0 | rowsets = std::move(ret.rowsets); |
1611 | 0 | } |
1612 | 0 | for (auto rowset_ptr : rowsets) { |
1613 | 0 | std::lock_guard rwlock(_new_tablet->get_rowset_update_lock()); |
1614 | 0 | std::shared_lock rlock(_new_tablet->get_header_lock()); |
1615 | 0 | RETURN_IF_ERROR(Tablet::update_delete_bitmap_without_lock(_new_tablet, rowset_ptr)); |
1616 | 0 | } |
1617 | | |
1618 | | // step 3 |
1619 | 0 | std::lock_guard rwlock(_new_tablet->get_rowset_update_lock()); |
1620 | 0 | std::lock_guard new_wlock(_new_tablet->get_header_lock()); |
1621 | 0 | SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); |
1622 | 0 | int64_t new_max_version = _new_tablet->max_version_unlocked(); |
1623 | 0 | rowsets.clear(); |
1624 | 0 | if (max_version < new_max_version) { |
1625 | 0 | LOG(INFO) << "alter table for unique with merge-on-write, calculate delete bitmap of " |
1626 | 0 | << "incremental rowsets for version: " << max_version + 1 << "-" |
1627 | 0 | << new_max_version << " new_tablet=" << _new_tablet->tablet_id(); |
1628 | 0 | auto ret = DORIS_TRY(_new_tablet->capture_consistent_rowsets_unlocked( |
1629 | 0 | {max_version + 1, new_max_version}, CaptureRowsetOps {})); |
1630 | 0 | rowsets = std::move(ret.rowsets); |
1631 | 0 | } |
1632 | 0 | for (auto&& rowset_ptr : rowsets) { |
1633 | 0 | RETURN_IF_ERROR(Tablet::update_delete_bitmap_without_lock(_new_tablet, rowset_ptr)); |
1634 | 0 | } |
1635 | | // step 4 |
1636 | 0 | RETURN_IF_ERROR(_new_tablet->set_tablet_state(TabletState::TABLET_RUNNING)); |
1637 | 0 | _new_tablet->save_meta(); |
1638 | 0 | return Status::OK(); |
1639 | 0 | } |
1640 | | |
1641 | | } // namespace doris |