Coverage Report

Created: 2026-04-01 16:13

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/tablet/tablet_schema.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "storage/tablet/tablet_schema.h"
19
20
#include <gen_cpp/Descriptors_types.h>
21
#include <gen_cpp/olap_file.pb.h>
22
#include <glog/logging.h>
23
#include <google/protobuf/io/coded_stream.h>
24
#include <google/protobuf/io/zero_copy_stream.h>
25
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
26
27
#include <algorithm>
28
#include <cctype>
29
// IWYU pragma: no_include <bits/std_abs.h>
30
#include <cmath> // IWYU pragma: keep
31
#include <memory>
32
#include <ostream>
33
#include <vector>
34
35
#include "common/compiler_util.h" // IWYU pragma: keep
36
#include "common/consts.h"
37
#include "common/status.h"
38
#include "core/block/block.h"
39
#include "core/column/column_nothing.h"
40
#include "core/data_type/data_type.h"
41
#include "core/data_type/data_type_factory.hpp"
42
#include "core/string_ref.h"
43
#include "exec/common/hex.h"
44
#include "exprs/aggregate/aggregate_function_simple_factory.h"
45
#include "exprs/aggregate/aggregate_function_state_union.h"
46
#include "storage/index/inverted/analyzer/analyzer.h"
47
#include "storage/index/inverted/inverted_index_parser.h"
48
#include "storage/olap_common.h"
49
#include "storage/olap_define.h"
50
#include "storage/tablet/tablet_column_object_pool.h"
51
#include "storage/tablet/tablet_meta.h"
52
#include "storage/tablet_info.h"
53
#include "storage/types.h"
54
#include "storage/utils.h"
55
#include "util/json/path_in_data.h"
56
57
namespace doris {
58
#include "common/compile_check_begin.h"
59
7.48M
FieldType TabletColumn::get_field_type_by_type(PrimitiveType primitiveType) {
60
7.48M
    switch (primitiveType) {
61
0
    case PrimitiveType::INVALID_TYPE:
62
0
        return FieldType::OLAP_FIELD_TYPE_UNKNOWN;
63
0
    case PrimitiveType::TYPE_NULL:
64
0
        return FieldType::OLAP_FIELD_TYPE_NONE;
65
248k
    case PrimitiveType::TYPE_BOOLEAN:
66
248k
        return FieldType::OLAP_FIELD_TYPE_BOOL;
67
492
    case PrimitiveType::TYPE_TINYINT:
68
492
        return FieldType::OLAP_FIELD_TYPE_TINYINT;
69
531
    case PrimitiveType::TYPE_SMALLINT:
70
531
        return FieldType::OLAP_FIELD_TYPE_SMALLINT;
71
134k
    case PrimitiveType::TYPE_INT:
72
134k
        return FieldType::OLAP_FIELD_TYPE_INT;
73
4.13M
    case PrimitiveType::TYPE_BIGINT:
74
4.13M
        return FieldType::OLAP_FIELD_TYPE_BIGINT;
75
90.6k
    case PrimitiveType::TYPE_LARGEINT:
76
90.6k
        return FieldType::OLAP_FIELD_TYPE_LARGEINT;
77
615
    case PrimitiveType::TYPE_FLOAT:
78
615
        return FieldType::OLAP_FIELD_TYPE_FLOAT;
79
2.71M
    case PrimitiveType::TYPE_DOUBLE:
80
2.71M
        return FieldType::OLAP_FIELD_TYPE_DOUBLE;
81
0
    case PrimitiveType::TYPE_VARCHAR:
82
0
        return FieldType::OLAP_FIELD_TYPE_VARCHAR;
83
0
    case PrimitiveType::TYPE_DATE:
84
0
        return FieldType::OLAP_FIELD_TYPE_DATE;
85
0
    case PrimitiveType::TYPE_DATETIME:
86
0
        return FieldType::OLAP_FIELD_TYPE_DATETIME;
87
0
    case PrimitiveType::TYPE_BINARY:
88
0
        return FieldType::OLAP_FIELD_TYPE_UNKNOWN; // Not implemented
89
0
    case PrimitiveType::TYPE_CHAR:
90
0
        return FieldType::OLAP_FIELD_TYPE_CHAR;
91
0
    case PrimitiveType::TYPE_STRUCT:
92
0
        return FieldType::OLAP_FIELD_TYPE_STRUCT;
93
0
    case PrimitiveType::TYPE_ARRAY:
94
0
        return FieldType::OLAP_FIELD_TYPE_ARRAY;
95
0
    case PrimitiveType::TYPE_MAP:
96
0
        return FieldType::OLAP_FIELD_TYPE_MAP;
97
0
    case PrimitiveType::TYPE_HLL:
98
0
        return FieldType::OLAP_FIELD_TYPE_HLL;
99
0
    case PrimitiveType::TYPE_DECIMALV2:
100
0
        return FieldType::OLAP_FIELD_TYPE_UNKNOWN; // Not implemented
101
0
    case PrimitiveType::TYPE_BITMAP:
102
0
        return FieldType::OLAP_FIELD_TYPE_BITMAP;
103
0
    case PrimitiveType::TYPE_STRING:
104
0
        return FieldType::OLAP_FIELD_TYPE_STRING;
105
0
    case PrimitiveType::TYPE_QUANTILE_STATE:
106
0
        return FieldType::OLAP_FIELD_TYPE_QUANTILE_STATE;
107
0
    case PrimitiveType::TYPE_DATEV2:
108
0
        return FieldType::OLAP_FIELD_TYPE_DATEV2;
109
0
    case PrimitiveType::TYPE_DATETIMEV2:
110
0
        return FieldType::OLAP_FIELD_TYPE_DATETIMEV2;
111
0
    case PrimitiveType::TYPE_TIMESTAMPTZ:
112
0
        return FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ;
113
0
    case PrimitiveType::TYPE_TIMEV2:
114
0
        return FieldType::OLAP_FIELD_TYPE_TIMEV2;
115
626
    case PrimitiveType::TYPE_DECIMAL32:
116
626
        return FieldType::OLAP_FIELD_TYPE_DECIMAL32;
117
628
    case PrimitiveType::TYPE_DECIMAL64:
118
628
        return FieldType::OLAP_FIELD_TYPE_DECIMAL64;
119
270k
    case PrimitiveType::TYPE_DECIMAL128I:
120
270k
        return FieldType::OLAP_FIELD_TYPE_DECIMAL128I;
121
611
    case PrimitiveType::TYPE_DECIMAL256:
122
611
        return FieldType::OLAP_FIELD_TYPE_DECIMAL256;
123
0
    case PrimitiveType::TYPE_JSONB:
124
0
        return FieldType::OLAP_FIELD_TYPE_JSONB;
125
0
    case PrimitiveType::TYPE_VARIANT:
126
0
        return FieldType::OLAP_FIELD_TYPE_VARIANT;
127
0
    case PrimitiveType::TYPE_IPV4:
128
0
        return FieldType::OLAP_FIELD_TYPE_IPV4;
129
0
    case PrimitiveType::TYPE_IPV6:
130
0
        return FieldType::OLAP_FIELD_TYPE_IPV6;
131
0
    case PrimitiveType::TYPE_AGG_STATE:
132
0
        return FieldType::OLAP_FIELD_TYPE_AGG_STATE;
133
0
    default:
134
0
        return FieldType::OLAP_FIELD_TYPE_UNKNOWN;
135
7.48M
    }
136
7.48M
}
137
138
16.6M
PrimitiveType TabletColumn::get_primitive_type_by_field_type(FieldType type) {
139
16.6M
    static const PrimitiveType mapping[] = {
140
16.6M
            /*  0 */ PrimitiveType::INVALID_TYPE,
141
16.6M
            /*  1 OLAP_FIELD_TYPE_TINYINT           */ PrimitiveType::TYPE_TINYINT,
142
16.6M
            /*  2 OLAP_FIELD_TYPE_UNSIGNED_TINYINT  */ PrimitiveType::INVALID_TYPE,
143
16.6M
            /*  3 OLAP_FIELD_TYPE_SMALLINT          */ PrimitiveType::TYPE_SMALLINT,
144
16.6M
            /*  4 OLAP_FIELD_TYPE_UNSIGNED_SMALLINT */ PrimitiveType::INVALID_TYPE,
145
16.6M
            /*  5 OLAP_FIELD_TYPE_INT               */ PrimitiveType::TYPE_INT,
146
16.6M
            /*  6 OLAP_FIELD_TYPE_UNSIGNED_INT      */ PrimitiveType::INVALID_TYPE,
147
16.6M
            /*  7 OLAP_FIELD_TYPE_BIGINT            */ PrimitiveType::TYPE_BIGINT,
148
16.6M
            /*  8 OLAP_FIELD_TYPE_UNSIGNED_BIGINT   */ PrimitiveType::INVALID_TYPE,
149
16.6M
            /*  9 OLAP_FIELD_TYPE_LARGEINT          */ PrimitiveType::TYPE_LARGEINT,
150
16.6M
            /* 10 OLAP_FIELD_TYPE_FLOAT             */ PrimitiveType::TYPE_FLOAT,
151
16.6M
            /* 11 OLAP_FIELD_TYPE_DOUBLE            */ PrimitiveType::TYPE_DOUBLE,
152
16.6M
            /* 12 OLAP_FIELD_TYPE_DISCRETE_DOUBLE   */ PrimitiveType::INVALID_TYPE,
153
16.6M
            /* 13 OLAP_FIELD_TYPE_CHAR              */ PrimitiveType::TYPE_CHAR,
154
16.6M
            /* 14 OLAP_FIELD_TYPE_DATE              */ PrimitiveType::TYPE_DATE,
155
16.6M
            /* 15 OLAP_FIELD_TYPE_DATETIME          */ PrimitiveType::TYPE_DATETIME,
156
16.6M
            /* 16 OLAP_FIELD_TYPE_DECIMAL           */ PrimitiveType::INVALID_TYPE,
157
16.6M
            /* 17 OLAP_FIELD_TYPE_VARCHAR           */ PrimitiveType::TYPE_VARCHAR,
158
16.6M
            /* 18 OLAP_FIELD_TYPE_STRUCT            */ PrimitiveType::TYPE_STRUCT,
159
16.6M
            /* 19 OLAP_FIELD_TYPE_ARRAY             */ PrimitiveType::TYPE_ARRAY,
160
16.6M
            /* 20 OLAP_FIELD_TYPE_MAP               */ PrimitiveType::TYPE_MAP,
161
16.6M
            /* 21 OLAP_FIELD_TYPE_UNKNOWN           */ PrimitiveType::INVALID_TYPE,
162
16.6M
            /* 22 OLAP_FIELD_TYPE_NONE              */ PrimitiveType::TYPE_NULL,
163
16.6M
            /* 23 OLAP_FIELD_TYPE_HLL               */ PrimitiveType::TYPE_HLL,
164
16.6M
            /* 24 OLAP_FIELD_TYPE_BOOL              */ PrimitiveType::TYPE_BOOLEAN,
165
16.6M
            /* 25 OLAP_FIELD_TYPE_BITMAP            */ PrimitiveType::TYPE_BITMAP,
166
16.6M
            /* 26 OLAP_FIELD_TYPE_STRING            */ PrimitiveType::TYPE_STRING,
167
16.6M
            /* 27 OLAP_FIELD_TYPE_QUANTILE_STATE    */ PrimitiveType::TYPE_QUANTILE_STATE,
168
16.6M
            /* 28 OLAP_FIELD_TYPE_DATEV2            */ PrimitiveType::TYPE_DATEV2,
169
16.6M
            /* 29 OLAP_FIELD_TYPE_DATETIMEV2        */ PrimitiveType::TYPE_DATETIMEV2,
170
16.6M
            /* 30 OLAP_FIELD_TYPE_TIMEV2            */ PrimitiveType::TYPE_TIMEV2,
171
16.6M
            /* 31 OLAP_FIELD_TYPE_DECIMAL32         */ PrimitiveType::TYPE_DECIMAL32,
172
16.6M
            /* 32 OLAP_FIELD_TYPE_DECIMAL64         */ PrimitiveType::TYPE_DECIMAL64,
173
16.6M
            /* 33 OLAP_FIELD_TYPE_DECIMAL128I       */ PrimitiveType::TYPE_DECIMAL128I,
174
16.6M
            /* 34 OLAP_FIELD_TYPE_JSONB             */ PrimitiveType::TYPE_JSONB,
175
16.6M
            /* 35 OLAP_FIELD_TYPE_VARIANT           */ PrimitiveType::TYPE_VARIANT,
176
16.6M
            /* 36 OLAP_FIELD_TYPE_AGG_STATE         */ PrimitiveType::TYPE_AGG_STATE,
177
16.6M
            /* 37 OLAP_FIELD_TYPE_DECIMAL256        */ PrimitiveType::TYPE_DECIMAL256,
178
16.6M
            /* 38 OLAP_FIELD_TYPE_IPV4              */ PrimitiveType::TYPE_IPV4,
179
16.6M
            /* 39 OLAP_FIELD_TYPE_IPV6              */ PrimitiveType::TYPE_IPV6,
180
16.6M
            /* 40 OLAP_FIELD_TYPE_TIMESTAMPTZ       */ PrimitiveType::TYPE_TIMESTAMPTZ,
181
16.6M
    };
182
183
16.6M
    int idx = static_cast<int>(type);
184
16.6M
    return mapping[idx];
185
16.6M
}
186
187
22.3M
FieldType TabletColumn::get_field_type_by_string(const std::string& type_str) {
188
22.3M
    std::string upper_type_str = type_str;
189
22.3M
    std::transform(type_str.begin(), type_str.end(), upper_type_str.begin(),
190
141M
                   [](auto c) { return std::toupper(c); });
191
22.3M
    FieldType type;
192
193
22.3M
    if (0 == upper_type_str.compare("TINYINT")) {
194
1.35M
        type = FieldType::OLAP_FIELD_TYPE_TINYINT;
195
21.0M
    } else if (0 == upper_type_str.compare("SMALLINT")) {
196
518k
        type = FieldType::OLAP_FIELD_TYPE_SMALLINT;
197
20.5M
    } else if (0 == upper_type_str.compare("INT")) {
198
2.84M
        type = FieldType::OLAP_FIELD_TYPE_INT;
199
17.6M
    } else if (0 == upper_type_str.compare("BIGINT")) {
200
2.33M
        type = FieldType::OLAP_FIELD_TYPE_BIGINT;
201
15.3M
    } else if (0 == upper_type_str.compare("LARGEINT")) {
202
562k
        type = FieldType::OLAP_FIELD_TYPE_LARGEINT;
203
14.7M
    } else if (0 == upper_type_str.compare("UNSIGNED_TINYINT")) {
204
0
        type = FieldType::OLAP_FIELD_TYPE_UNSIGNED_TINYINT;
205
14.7M
    } else if (0 == upper_type_str.compare("UNSIGNED_SMALLINT")) {
206
0
        type = FieldType::OLAP_FIELD_TYPE_UNSIGNED_SMALLINT;
207
14.7M
    } else if (0 == upper_type_str.compare("UNSIGNED_INT")) {
208
0
        type = FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT;
209
14.7M
    } else if (0 == upper_type_str.compare("UNSIGNED_BIGINT")) {
210
0
        type = FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT;
211
14.7M
    } else if (0 == upper_type_str.compare("IPV4")) {
212
30.6k
        type = FieldType::OLAP_FIELD_TYPE_IPV4;
213
14.7M
    } else if (0 == upper_type_str.compare("IPV6")) {
214
30.9k
        type = FieldType::OLAP_FIELD_TYPE_IPV6;
215
14.7M
    } else if (0 == upper_type_str.compare("FLOAT")) {
216
444k
        type = FieldType::OLAP_FIELD_TYPE_FLOAT;
217
14.2M
    } else if (0 == upper_type_str.compare("DISCRETE_DOUBLE")) {
218
0
        type = FieldType::OLAP_FIELD_TYPE_DISCRETE_DOUBLE;
219
14.2M
    } else if (0 == upper_type_str.compare("DOUBLE")) {
220
596k
        type = FieldType::OLAP_FIELD_TYPE_DOUBLE;
221
13.6M
    } else if (0 == upper_type_str.compare("CHAR")) {
222
575k
        type = FieldType::OLAP_FIELD_TYPE_CHAR;
223
13.0M
    } else if (0 == upper_type_str.compare("DATE")) {
224
4.40k
        type = FieldType::OLAP_FIELD_TYPE_DATE;
225
13.0M
    } else if (0 == upper_type_str.compare("DATEV2")) {
226
1.55M
        type = FieldType::OLAP_FIELD_TYPE_DATEV2;
227
11.5M
    } else if (0 == upper_type_str.compare("DATETIMEV2")) {
228
1.28M
        type = FieldType::OLAP_FIELD_TYPE_DATETIMEV2;
229
10.2M
    } else if (0 == upper_type_str.compare("DATETIME")) {
230
8.21k
        type = FieldType::OLAP_FIELD_TYPE_DATETIME;
231
10.2M
    } else if (0 == upper_type_str.compare("TIMESTAMPTZ")) {
232
83.2k
        type = FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ;
233
10.1M
    } else if (0 == upper_type_str.compare("DECIMAL32")) {
234
409k
        type = FieldType::OLAP_FIELD_TYPE_DECIMAL32;
235
9.74M
    } else if (0 == upper_type_str.compare("DECIMAL64")) {
236
851k
        type = FieldType::OLAP_FIELD_TYPE_DECIMAL64;
237
8.89M
    } else if (0 == upper_type_str.compare("DECIMAL128I")) {
238
603k
        type = FieldType::OLAP_FIELD_TYPE_DECIMAL128I;
239
8.28M
    } else if (0 == upper_type_str.compare("DECIMAL256")) {
240
69.2k
        type = FieldType::OLAP_FIELD_TYPE_DECIMAL256;
241
8.21M
    } else if (0 == upper_type_str.compare(0, 7, "DECIMAL")) {
242
24.4k
        type = FieldType::OLAP_FIELD_TYPE_DECIMAL;
243
8.19M
    } else if (0 == upper_type_str.compare(0, 7, "VARCHAR")) {
244
4.06M
        type = FieldType::OLAP_FIELD_TYPE_VARCHAR;
245
4.13M
    } else if (0 == upper_type_str.compare("STRING")) {
246
960k
        type = FieldType::OLAP_FIELD_TYPE_STRING;
247
3.17M
    } else if (0 == upper_type_str.compare("JSONB")) {
248
234k
        type = FieldType::OLAP_FIELD_TYPE_JSONB;
249
2.93M
    } else if (0 == upper_type_str.compare("VARIANT")) {
250
65.6k
        type = FieldType::OLAP_FIELD_TYPE_VARIANT;
251
2.87M
    } else if (0 == upper_type_str.compare("BOOLEAN")) {
252
425k
        type = FieldType::OLAP_FIELD_TYPE_BOOL;
253
2.44M
    } else if (0 == upper_type_str.compare(0, 3, "HLL")) {
254
55.2k
        type = FieldType::OLAP_FIELD_TYPE_HLL;
255
2.39M
    } else if (0 == upper_type_str.compare("STRUCT")) {
256
76.8k
        type = FieldType::OLAP_FIELD_TYPE_STRUCT;
257
2.31M
    } else if (0 == upper_type_str.compare("LIST")) {
258
0
        type = FieldType::OLAP_FIELD_TYPE_ARRAY;
259
2.31M
    } else if (0 == upper_type_str.compare("MAP")) {
260
722k
        type = FieldType::OLAP_FIELD_TYPE_MAP;
261
1.59M
    } else if (0 == upper_type_str.compare("OBJECT")) {
262
46.2k
        type = FieldType::OLAP_FIELD_TYPE_BITMAP;
263
1.54M
    } else if (0 == upper_type_str.compare("BITMAP")) {
264
21.0k
        type = FieldType::OLAP_FIELD_TYPE_BITMAP;
265
1.54M
    } else if (0 == upper_type_str.compare("ARRAY")) {
266
1.54M
        type = FieldType::OLAP_FIELD_TYPE_ARRAY;
267
18.4E
    } else if (0 == upper_type_str.compare("QUANTILE_STATE")) {
268
49.4k
        type = FieldType::OLAP_FIELD_TYPE_QUANTILE_STATE;
269
18.4E
    } else if (0 == upper_type_str.compare("AGG_STATE")) {
270
24.9k
        type = FieldType::OLAP_FIELD_TYPE_AGG_STATE;
271
18.4E
    } else {
272
18.4E
        LOG(WARNING) << "invalid type string. [type='" << type_str << "']";
273
18.4E
        type = FieldType::OLAP_FIELD_TYPE_UNKNOWN;
274
18.4E
    }
275
276
22.3M
    return type;
277
22.3M
}
278
279
21.8M
FieldAggregationMethod TabletColumn::get_aggregation_type_by_string(const std::string& str) {
280
21.8M
    std::string upper_str = str;
281
21.8M
    std::transform(str.begin(), str.end(), upper_str.begin(),
282
92.0M
                   [](auto c) { return std::toupper(c); });
283
21.8M
    FieldAggregationMethod aggregation_type;
284
285
21.8M
    if (0 == upper_str.compare("NONE")) {
286
20.2M
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_NONE;
287
20.2M
    } else if (0 == upper_str.compare("SUM")) {
288
552k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_SUM;
289
1.03M
    } else if (0 == upper_str.compare("MIN")) {
290
14.1k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_MIN;
291
1.02M
    } else if (0 == upper_str.compare("MAX")) {
292
38.7k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_MAX;
293
985k
    } else if (0 == upper_str.compare("REPLACE")) {
294
779k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE;
295
779k
    } else if (0 == upper_str.compare("REPLACE_IF_NOT_NULL")) {
296
104k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE_IF_NOT_NULL;
297
104k
    } else if (0 == upper_str.compare("HLL_UNION")) {
298
53.3k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_HLL_UNION;
299
58.7k
    } else if (0 == upper_str.compare("BITMAP_UNION")) {
300
58.7k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_BITMAP_UNION;
301
18.4E
    } else if (0 == upper_str.compare("QUANTILE_UNION")) {
302
49.0k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_QUANTILE_UNION;
303
18.4E
    } else if (!upper_str.empty()) {
304
11.3k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_GENERIC;
305
18.4E
    } else {
306
18.4E
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_UNKNOWN;
307
18.4E
    }
308
309
21.8M
    return aggregation_type;
310
21.8M
}
311
312
30.9M
std::string TabletColumn::get_string_by_field_type(FieldType type) {
313
30.9M
    switch (type) {
314
1.67M
    case FieldType::OLAP_FIELD_TYPE_TINYINT:
315
1.67M
        return "TINYINT";
316
317
0
    case FieldType::OLAP_FIELD_TYPE_UNSIGNED_TINYINT:
318
0
        return "UNSIGNED_TINYINT";
319
320
822k
    case FieldType::OLAP_FIELD_TYPE_SMALLINT:
321
822k
        return "SMALLINT";
322
323
0
    case FieldType::OLAP_FIELD_TYPE_UNSIGNED_SMALLINT:
324
0
        return "UNSIGNED_SMALLINT";
325
326
3.77M
    case FieldType::OLAP_FIELD_TYPE_INT:
327
3.77M
        return "INT";
328
329
0
    case FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT:
330
0
        return "UNSIGNED_INT";
331
332
3.41M
    case FieldType::OLAP_FIELD_TYPE_BIGINT:
333
3.41M
        return "BIGINT";
334
335
954k
    case FieldType::OLAP_FIELD_TYPE_LARGEINT:
336
954k
        return "LARGEINT";
337
338
0
    case FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT:
339
0
        return "UNSIGNED_BIGINT";
340
341
26.1k
    case FieldType::OLAP_FIELD_TYPE_IPV4:
342
26.1k
        return "IPV4";
343
344
27.0k
    case FieldType::OLAP_FIELD_TYPE_IPV6:
345
27.0k
        return "IPV6";
346
347
712k
    case FieldType::OLAP_FIELD_TYPE_FLOAT:
348
712k
        return "FLOAT";
349
350
813k
    case FieldType::OLAP_FIELD_TYPE_DOUBLE:
351
813k
        return "DOUBLE";
352
353
0
    case FieldType::OLAP_FIELD_TYPE_DISCRETE_DOUBLE:
354
0
        return "DISCRETE_DOUBLE";
355
356
815k
    case FieldType::OLAP_FIELD_TYPE_CHAR:
357
815k
        return "CHAR";
358
359
5.81k
    case FieldType::OLAP_FIELD_TYPE_DATE:
360
5.81k
        return "DATE";
361
362
2.14M
    case FieldType::OLAP_FIELD_TYPE_DATEV2:
363
2.14M
        return "DATEV2";
364
365
9.41k
    case FieldType::OLAP_FIELD_TYPE_DATETIME:
366
9.41k
        return "DATETIME";
367
368
1.97M
    case FieldType::OLAP_FIELD_TYPE_DATETIMEV2:
369
1.97M
        return "DATETIMEV2";
370
371
207k
    case FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ:
372
207k
        return "TIMESTAMPTZ";
373
374
19.3k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL:
375
19.3k
        return "DECIMAL";
376
377
646k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL32:
378
646k
        return "DECIMAL32";
379
380
827k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL64:
381
827k
        return "DECIMAL64";
382
383
925k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL128I:
384
925k
        return "DECIMAL128I";
385
386
65.1k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL256:
387
65.1k
        return "DECIMAL256";
388
389
5.37M
    case FieldType::OLAP_FIELD_TYPE_VARCHAR:
390
5.37M
        return "VARCHAR";
391
392
468k
    case FieldType::OLAP_FIELD_TYPE_JSONB:
393
468k
        return "JSONB";
394
395
94.6k
    case FieldType::OLAP_FIELD_TYPE_VARIANT:
396
94.6k
        return "VARIANT";
397
398
1.65M
    case FieldType::OLAP_FIELD_TYPE_STRING:
399
1.65M
        return "STRING";
400
401
679k
    case FieldType::OLAP_FIELD_TYPE_BOOL:
402
679k
        return "BOOLEAN";
403
404
113k
    case FieldType::OLAP_FIELD_TYPE_HLL:
405
113k
        return "HLL";
406
407
100k
    case FieldType::OLAP_FIELD_TYPE_STRUCT:
408
100k
        return "STRUCT";
409
410
1.92M
    case FieldType::OLAP_FIELD_TYPE_ARRAY:
411
1.92M
        return "ARRAY";
412
413
537k
    case FieldType::OLAP_FIELD_TYPE_MAP:
414
537k
        return "MAP";
415
416
127k
    case FieldType::OLAP_FIELD_TYPE_BITMAP:
417
127k
        return "OBJECT";
418
101k
    case FieldType::OLAP_FIELD_TYPE_QUANTILE_STATE:
419
101k
        return "QUANTILE_STATE";
420
11.7k
    case FieldType::OLAP_FIELD_TYPE_AGG_STATE:
421
11.7k
        return "AGG_STATE";
422
0
    default:
423
0
        return "UNKNOWN";
424
30.9M
    }
425
30.9M
}
426
427
198k
std::string TabletColumn::get_string_by_aggregation_type(FieldAggregationMethod type) {
428
198k
    switch (type) {
429
129k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_NONE:
430
129k
        return "NONE";
431
432
13.5k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_SUM:
433
13.5k
        return "SUM";
434
435
1.86k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_MIN:
436
1.86k
        return "MIN";
437
438
5.49k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_MAX:
439
5.49k
        return "MAX";
440
441
29.8k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE:
442
29.8k
        return "REPLACE";
443
444
14.3k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE_IF_NOT_NULL:
445
14.3k
        return "REPLACE_IF_NOT_NULL";
446
447
1.29k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_HLL_UNION:
448
1.29k
        return "HLL_UNION";
449
450
1.76k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_BITMAP_UNION:
451
1.76k
        return "BITMAP_UNION";
452
453
972
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_QUANTILE_UNION:
454
972
        return "QUANTILE_UNION";
455
456
2
    default:
457
2
        return "UNKNOWN";
458
198k
    }
459
198k
}
460
461
7.72M
uint32_t TabletColumn::get_field_length_by_type(TPrimitiveType::type type, uint32_t string_length) {
462
7.72M
    switch (type) {
463
564k
    case TPrimitiveType::TINYINT:
464
673k
    case TPrimitiveType::BOOLEAN:
465
673k
        return 1;
466
134k
    case TPrimitiveType::SMALLINT:
467
134k
        return 2;
468
809k
    case TPrimitiveType::INT:
469
809k
        return 4;
470
947k
    case TPrimitiveType::BIGINT:
471
947k
        return 8;
472
142k
    case TPrimitiveType::LARGEINT:
473
142k
        return 16;
474
13.6k
    case TPrimitiveType::IPV4:
475
13.6k
        return 4;
476
13.7k
    case TPrimitiveType::IPV6:
477
13.7k
        return 16;
478
1.79k
    case TPrimitiveType::DATE:
479
1.79k
        return 3;
480
420k
    case TPrimitiveType::DATEV2:
481
420k
        return 4;
482
2.75k
    case TPrimitiveType::DATETIME:
483
2.75k
        return 8;
484
402k
    case TPrimitiveType::DATETIMEV2:
485
426k
    case TPrimitiveType::TIMESTAMPTZ:
486
426k
        return 8;
487
116k
    case TPrimitiveType::FLOAT:
488
116k
        return 4;
489
197k
    case TPrimitiveType::DOUBLE:
490
197k
        return 8;
491
3.38k
    case TPrimitiveType::QUANTILE_STATE:
492
11.9k
    case TPrimitiveType::BITMAP:
493
11.9k
        return 16;
494
168k
    case TPrimitiveType::CHAR:
495
168k
        return string_length;
496
1.76M
    case TPrimitiveType::VARCHAR:
497
1.76M
    case TPrimitiveType::HLL:
498
1.76M
    case TPrimitiveType::AGG_STATE:
499
1.76M
        return string_length + sizeof(OLAP_VARCHAR_MAX_LENGTH);
500
351k
    case TPrimitiveType::STRING:
501
371k
    case TPrimitiveType::VARIANT:
502
371k
        return string_length + sizeof(OLAP_STRING_MAX_LENGTH);
503
38.1k
    case TPrimitiveType::JSONB:
504
38.1k
        return string_length + sizeof(OLAP_JSONB_MAX_LENGTH);
505
24.4k
    case TPrimitiveType::STRUCT:
506
        // Note that(xy): this is the length of struct type itself,
507
        // the length of its subtypes are not included.
508
24.4k
        return OLAP_STRUCT_MAX_LENGTH;
509
530k
    case TPrimitiveType::ARRAY:
510
530k
        return OLAP_ARRAY_MAX_LENGTH;
511
348k
    case TPrimitiveType::MAP:
512
348k
        return OLAP_MAP_MAX_LENGTH;
513
74.0k
    case TPrimitiveType::DECIMAL32:
514
74.0k
        return 4;
515
316k
    case TPrimitiveType::DECIMAL64:
516
316k
        return 8;
517
155k
    case TPrimitiveType::DECIMAL128I:
518
155k
        return 16;
519
22.6k
    case TPrimitiveType::DECIMAL256:
520
22.6k
        return 32;
521
13.2k
    case TPrimitiveType::DECIMALV2:
522
13.2k
        return 12; // use 12 bytes in olap engine.
523
0
    default:
524
0
        LOG(WARNING) << "unknown field type. [type=" << type << "]";
525
0
        return 0;
526
7.72M
    }
527
7.72M
}
528
529
17
bool TabletColumn::has_char_type() const {
530
17
    switch (_type) {
531
4
    case FieldType::OLAP_FIELD_TYPE_CHAR: {
532
4
        return true;
533
0
    }
534
4
    case FieldType::OLAP_FIELD_TYPE_ARRAY:
535
4
    case FieldType::OLAP_FIELD_TYPE_MAP:
536
4
    case FieldType::OLAP_FIELD_TYPE_STRUCT: {
537
4
        return std::any_of(_sub_columns.begin(), _sub_columns.end(),
538
4
                           [&](const auto& sub) -> bool { return sub->has_char_type(); });
539
4
    }
540
9
    default:
541
9
        return false;
542
17
    }
543
17
}
544
545
16.7M
TabletColumn::TabletColumn() : _aggregation(FieldAggregationMethod::OLAP_FIELD_AGGREGATION_NONE) {}
546
547
37
TabletColumn::TabletColumn(FieldAggregationMethod agg, FieldType type) {
548
37
    _aggregation = agg;
549
37
    _type = type;
550
37
}
551
552
17
TabletColumn::TabletColumn(FieldAggregationMethod agg, FieldType filed_type, bool is_nullable) {
553
17
    _aggregation = agg;
554
17
    _type = filed_type;
555
17
    _length = cast_set<int32_t>(get_scalar_type_info(filed_type)->size());
556
17
    _is_nullable = is_nullable;
557
17
}
558
559
TabletColumn::TabletColumn(FieldAggregationMethod agg, FieldType filed_type, bool is_nullable,
560
111k
                           int32_t unique_id, size_t length) {
561
111k
    _aggregation = agg;
562
111k
    _type = filed_type;
563
111k
    _is_nullable = is_nullable;
564
111k
    _unique_id = unique_id;
565
111k
    _length = cast_set<int32_t>(length);
566
111k
}
567
568
5.13k
TabletColumn::TabletColumn(const ColumnPB& column) {
569
5.13k
    init_from_pb(column);
570
5.13k
}
571
572
5.67M
TabletColumn::TabletColumn(const TColumn& column) {
573
5.67M
    init_from_thrift(column);
574
5.67M
}
575
576
6.32M
void TabletColumn::init_from_thrift(const TColumn& tcolumn) {
577
6.32M
    ColumnPB column_pb;
578
6.32M
    TabletMeta::init_column_from_tcolumn(tcolumn.col_unique_id, tcolumn, &column_pb);
579
6.32M
    init_from_pb(column_pb);
580
6.32M
}
581
582
21.8M
void TabletColumn::init_from_pb(const ColumnPB& column) {
583
21.8M
    _unique_id = column.unique_id();
584
21.8M
    _col_name = column.name();
585
21.8M
    _col_name_lower_case = to_lower(_col_name);
586
21.8M
    _type = TabletColumn::get_field_type_by_string(column.type());
587
21.8M
    _is_key = column.is_key();
588
21.8M
    _is_nullable = column.is_nullable();
589
21.8M
    _is_auto_increment = column.is_auto_increment();
590
21.8M
    if (column.has_is_on_update_current_timestamp()) {
591
18.4M
        _is_on_update_current_timestamp = column.is_on_update_current_timestamp();
592
18.4M
    }
593
594
21.8M
    _has_default_value = column.has_default_value();
595
21.8M
    if (_has_default_value) {
596
3.39M
        _default_value = column.default_value();
597
3.39M
    }
598
599
21.8M
    if (column.has_precision()) {
600
21.8M
        _is_decimal = true;
601
21.8M
        _precision = column.precision();
602
18.4E
    } else {
603
18.4E
        _is_decimal = false;
604
18.4E
    }
605
21.8M
    if (column.has_frac()) {
606
21.8M
        _frac = column.frac();
607
21.8M
    }
608
21.8M
    _length = column.length();
609
21.8M
    _index_length = column.index_length();
610
21.8M
    if (column.has_is_bf_column()) {
611
3.59M
        _is_bf_column = column.is_bf_column();
612
18.2M
    } else {
613
18.2M
        _is_bf_column = false;
614
18.2M
    }
615
21.8M
    if (column.has_aggregation()) {
616
21.8M
        _aggregation = get_aggregation_type_by_string(column.aggregation());
617
21.8M
        _aggregation_name = column.aggregation();
618
21.8M
    }
619
620
21.8M
    if (_type == FieldType::OLAP_FIELD_TYPE_AGG_STATE) {
621
12.4k
        _result_is_nullable = column.result_is_nullable();
622
12.4k
        _be_exec_version = column.be_exec_version();
623
12.4k
    }
624
625
21.9M
    if (column.has_visible()) {
626
21.9M
        _visible = column.visible();
627
21.9M
    }
628
21.8M
    if (_type == FieldType::OLAP_FIELD_TYPE_ARRAY) {
629
18.4E
        CHECK(column.children_columns_size() == 1)
630
18.4E
                << "ARRAY type should has 1 children types, but got "
631
18.4E
                << column.children_columns_size();
632
1.52M
    }
633
21.8M
    if (_type == FieldType::OLAP_FIELD_TYPE_MAP) {
634
18.4E
        DCHECK(column.children_columns_size() == 2)
635
18.4E
                << "MAP type should has 2 children types, but got "
636
18.4E
                << column.children_columns_size();
637
708k
        if (UNLIKELY(column.children_columns_size() != 2)) {
638
0
            LOG(WARNING) << "MAP type should has 2 children types, but got "
639
0
                         << column.children_columns_size();
640
0
        }
641
708k
    }
642
25.3M
    for (int i = 0; i < column.children_columns_size(); i++) {
643
3.50M
        TabletColumn child_column;
644
3.50M
        child_column.init_from_pb(column.children_columns(i));
645
3.50M
        add_sub_column(child_column);
646
3.50M
    }
647
21.8M
    if (column.has_column_path_info()) {
648
54.0k
        _column_path = std::make_shared<PathInData>();
649
54.0k
        _column_path->from_protobuf(column.column_path_info());
650
54.0k
        _parent_col_unique_id = column.column_path_info().parrent_column_unique_id();
651
54.0k
    }
652
21.8M
    if (is_variant_type() && !column.has_column_path_info()) {
653
        // set path info for variant root column, to prevent from missing
654
33.4k
        _column_path = std::make_shared<PathInData>(_col_name_lower_case);
655
        // _parent_col_unique_id = _unique_id;
656
33.4k
    }
657
21.8M
    if (column.has_variant_max_subcolumns_count()) {
658
18.4M
        _variant.max_subcolumns_count = column.variant_max_subcolumns_count();
659
18.4M
    }
660
21.8M
    if (column.has_variant_enable_typed_paths_to_sparse()) {
661
18.4M
        _variant.enable_typed_paths_to_sparse = column.variant_enable_typed_paths_to_sparse();
662
18.4M
    }
663
21.8M
    if (column.has_variant_max_sparse_column_statistics_size()) {
664
18.4M
        _variant.max_sparse_column_statistics_size =
665
18.4M
                column.variant_max_sparse_column_statistics_size();
666
18.4M
    }
667
21.8M
    if (column.has_variant_sparse_hash_shard_count()) {
668
17.0M
        _variant.sparse_hash_shard_count = column.variant_sparse_hash_shard_count();
669
17.0M
    }
670
21.8M
    if (column.has_variant_enable_doc_mode()) {
671
18.4M
        _variant.enable_doc_mode = column.variant_enable_doc_mode();
672
18.4M
    }
673
21.8M
    if (column.has_variant_doc_materialization_min_rows()) {
674
17.0M
        _variant.doc_materialization_min_rows = column.variant_doc_materialization_min_rows();
675
17.0M
    }
676
21.8M
    if (column.has_variant_doc_hash_shard_count()) {
677
17.1M
        _variant.doc_hash_shard_count = column.variant_doc_hash_shard_count();
678
17.1M
    }
679
21.8M
    if (column.has_variant_enable_nested_group()) {
680
17.1M
        _variant.enable_nested_group = column.variant_enable_nested_group();
681
17.1M
    }
682
21.8M
    if (column.has_pattern_type()) {
683
10.7M
        _pattern_type = column.pattern_type();
684
10.7M
    }
685
21.8M
}
686
687
TabletColumn TabletColumn::create_materialized_variant_column(const std::string& root,
688
                                                              const std::vector<std::string>& paths,
689
                                                              int32_t parent_unique_id,
690
                                                              int32_t max_subcolumns_count,
691
8.67k
                                                              bool enable_doc_mode) {
692
8.67k
    TabletColumn subcol;
693
8.67k
    subcol.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
694
8.67k
    subcol.set_is_nullable(true);
695
8.67k
    subcol.set_unique_id(-1);
696
8.67k
    subcol.set_parent_unique_id(parent_unique_id);
697
8.67k
    PathInData path(root, paths);
698
8.67k
    subcol.set_path_info(path);
699
8.67k
    subcol.set_name(path.get_path());
700
8.67k
    subcol.set_variant_max_subcolumns_count(max_subcolumns_count);
701
8.67k
    subcol.set_variant_enable_doc_mode(enable_doc_mode);
702
8.67k
    return subcol;
703
8.67k
}
704
705
30.3M
void TabletColumn::to_schema_pb(ColumnPB* column) const {
706
30.3M
    column->set_unique_id(_unique_id);
707
30.3M
    column->set_name(_col_name);
708
30.3M
    column->set_type(get_string_by_field_type(_type));
709
30.3M
    column->set_is_key(_is_key);
710
30.3M
    column->set_is_nullable(_is_nullable);
711
30.3M
    column->set_is_auto_increment(_is_auto_increment);
712
30.3M
    column->set_is_on_update_current_timestamp(_is_on_update_current_timestamp);
713
30.3M
    if (_has_default_value) {
714
6.46M
        column->set_default_value(_default_value);
715
6.46M
    }
716
30.5M
    if (_is_decimal) {
717
30.5M
        column->set_precision(_precision);
718
30.5M
        column->set_frac(_frac);
719
30.5M
    }
720
30.3M
    column->set_length(_length);
721
30.3M
    column->set_index_length(_index_length);
722
30.3M
    if (_is_bf_column) {
723
287k
        column->set_is_bf_column(_is_bf_column);
724
287k
    }
725
30.7M
    if (!_aggregation_name.empty()) {
726
30.7M
        column->set_aggregation(_aggregation_name);
727
30.7M
    }
728
30.3M
    column->set_result_is_nullable(_result_is_nullable);
729
30.3M
    column->set_be_exec_version(_be_exec_version);
730
30.3M
    column->set_visible(_visible);
731
732
30.3M
    if (_type == FieldType::OLAP_FIELD_TYPE_ARRAY) {
733
1.91M
        CHECK(_sub_columns.size() == 1)
734
1.26k
                << "ARRAY type should has 1 children types, but got " << _sub_columns.size();
735
1.91M
    }
736
30.3M
    if (_type == FieldType::OLAP_FIELD_TYPE_MAP) {
737
18.4E
        DCHECK(_sub_columns.size() == 2)
738
18.4E
                << "MAP type should has 2 children types, but got " << _sub_columns.size();
739
526k
        if (UNLIKELY(_sub_columns.size() != 2)) {
740
0
            LOG(WARNING) << "MAP type should has 2 children types, but got " << _sub_columns.size();
741
0
        }
742
526k
    }
743
744
34.0M
    for (size_t i = 0; i < _sub_columns.size(); i++) {
745
3.64M
        ColumnPB* child = column->add_children_columns();
746
3.64M
        _sub_columns[i]->to_schema_pb(child);
747
3.64M
    }
748
749
    // set parts info
750
30.3M
    if (has_path_info()) {
751
        // CHECK_GT(_parent_col_unique_id, 0);
752
126k
        _column_path->to_protobuf(column->mutable_column_path_info(), _parent_col_unique_id);
753
        // Update unstable information for variant columns. Some of the fields in the tablet schema
754
        // are irrelevant for variant sub-columns, but retaining them may lead to an excessive growth
755
        // in the number of tablet schema cache entries.
756
126k
        if (_type == FieldType::OLAP_FIELD_TYPE_STRING) {
757
2.02k
            column->set_length(INT_MAX);
758
2.02k
        }
759
126k
        column->set_index_length(0);
760
126k
    }
761
30.3M
    column->set_variant_max_subcolumns_count(_variant.max_subcolumns_count);
762
30.3M
    column->set_pattern_type(_pattern_type);
763
30.3M
    column->set_variant_enable_typed_paths_to_sparse(_variant.enable_typed_paths_to_sparse);
764
30.3M
    column->set_variant_max_sparse_column_statistics_size(
765
30.3M
            _variant.max_sparse_column_statistics_size);
766
30.3M
    column->set_variant_sparse_hash_shard_count(_variant.sparse_hash_shard_count);
767
30.3M
    column->set_variant_enable_doc_mode(_variant.enable_doc_mode);
768
30.3M
    column->set_variant_doc_materialization_min_rows(_variant.doc_materialization_min_rows);
769
30.3M
    column->set_variant_doc_hash_shard_count(_variant.doc_hash_shard_count);
770
30.3M
    column->set_variant_enable_nested_group(_variant.enable_nested_group);
771
30.3M
}
772
773
3.55M
void TabletColumn::add_sub_column(TabletColumn& sub_column) {
774
3.55M
    _sub_columns.push_back(std::make_shared<TabletColumn>(sub_column));
775
3.55M
    sub_column._parent_col_unique_id = this->_unique_id;
776
3.55M
    _sub_column_count += 1;
777
3.55M
}
778
779
38.1M
bool TabletColumn::is_row_store_column() const {
780
38.1M
    return _col_name == BeConsts::ROW_STORE_COL;
781
38.1M
}
782
783
AggregateFunctionPtr TabletColumn::get_aggregate_function_union(DataTypePtr type,
784
1.37k
                                                                int current_be_exec_version) const {
785
1.37k
    const auto* state_type = assert_cast<const DataTypeAggState*>(type.get());
786
1.37k
    BeExecVersionManager::check_function_compatibility(
787
1.37k
            current_be_exec_version, _be_exec_version,
788
1.37k
            state_type->get_nested_function()->get_name());
789
1.37k
    return AggregateStateUnion::create(state_type->get_nested_function(), {type}, type);
790
1.37k
}
791
792
AggregateFunctionPtr TabletColumn::get_aggregate_function(std::string suffix,
793
69.4k
                                                          int current_be_exec_version) const {
794
69.4k
    AggregateFunctionPtr function = nullptr;
795
796
69.4k
    auto type = DataTypeFactory::instance().create_data_type(*this);
797
69.4k
    if (type && type->get_primitive_type() == PrimitiveType::TYPE_AGG_STATE) {
798
1.37k
        function = get_aggregate_function_union(type, current_be_exec_version);
799
68.0k
    } else {
800
68.0k
        std::string origin_name = TabletColumn::get_string_by_aggregation_type(_aggregation);
801
68.0k
        std::string agg_name = origin_name + suffix;
802
68.0k
        std::transform(agg_name.begin(), agg_name.end(), agg_name.begin(),
803
951k
                       [](unsigned char c) { return std::tolower(c); });
804
68.0k
        function = AggregateFunctionSimpleFactory::instance().get(
805
68.0k
                agg_name, {type}, type, type->is_nullable(),
806
68.0k
                BeExecVersionManager::get_newest_version());
807
68.0k
        if (!function) {
808
0
            LOG(WARNING) << "get column aggregate function failed, aggregation_name=" << origin_name
809
0
                         << ", column_type=" << type->get_name();
810
0
        }
811
68.0k
    }
812
69.4k
    if (function) {
813
69.4k
        function->set_version(_be_exec_version);
814
69.4k
        return function;
815
69.4k
    }
816
18.4E
    return nullptr;
817
69.4k
}
818
819
130k
void TabletColumn::set_path_info(const PathInData& path) {
820
130k
    _column_path = std::make_shared<PathInData>(path);
821
130k
}
822
823
16.0k
DataTypePtr TabletColumn::get_vec_type() const {
824
16.0k
    return DataTypeFactory::instance().create_data_type(*this);
825
16.0k
}
826
827
// escape '.' and '_'
828
55.4M
std::string escape_for_path_name(const std::string& s) {
829
55.4M
    std::string res;
830
55.4M
    const char* pos = s.data();
831
55.4M
    const char* end = pos + s.size();
832
56.0M
    while (pos != end) {
833
584k
        unsigned char c = *pos;
834
584k
        if (c == '.' || c == '_') {
835
75.6k
            res += '%';
836
75.6k
            res += hex_digit_uppercase(c / 16);
837
75.6k
            res += hex_digit_uppercase(c % 16);
838
508k
        } else {
839
508k
            res += c;
840
508k
        }
841
584k
        ++pos;
842
584k
    }
843
55.4M
    return res;
844
55.4M
}
845
846
9.10k
void TabletIndex::set_escaped_escaped_index_suffix_path(const std::string& path_name) {
847
9.10k
    std::string escaped_path = escape_for_path_name(path_name);
848
9.10k
    _escaped_index_suffix_path = escaped_path;
849
9.10k
}
850
851
void TabletIndex::init_from_thrift(const TOlapTableIndex& index,
852
172k
                                   const TabletSchema& tablet_schema) {
853
172k
    _index_id = index.index_id;
854
172k
    _index_name = index.index_name;
855
    // init col_unique_id in index at be side, since col_unique_id may be -1 at fe side
856
    // get column unique id by name
857
172k
    std::vector<int32_t> col_unique_ids(index.columns.size());
858
344k
    for (size_t i = 0; i < index.columns.size(); i++) {
859
172k
        auto column_idx = tablet_schema.field_index(index.columns[i]);
860
172k
        if (column_idx >= 0) {
861
172k
            col_unique_ids[i] = tablet_schema.column(column_idx).unique_id();
862
172k
        } else {
863
            // if column unique id not found by column name, find by column unique id
864
            // column unique id can not found means this column is a new column added by light schema change
865
122
            if (index.__isset.column_unique_ids && !index.column_unique_ids.empty() &&
866
122
                tablet_schema.has_column_unique_id(index.column_unique_ids[i])) {
867
90
                col_unique_ids[i] = index.column_unique_ids[i];
868
90
            } else {
869
32
                col_unique_ids[i] = -1;
870
32
            }
871
122
        }
872
172k
    }
873
172k
    _col_unique_ids = std::move(col_unique_ids);
874
875
172k
    switch (index.index_type) {
876
0
    case TIndexType::BITMAP:
877
0
        _index_type = IndexType::BITMAP;
878
0
        break;
879
156k
    case TIndexType::INVERTED:
880
156k
        _index_type = IndexType::INVERTED;
881
156k
        break;
882
426
    case TIndexType::ANN:
883
426
        _index_type = IndexType::ANN;
884
426
        break;
885
0
    case TIndexType::BLOOMFILTER:
886
0
        _index_type = IndexType::BLOOMFILTER;
887
0
        break;
888
15.4k
    case TIndexType::NGRAM_BF:
889
15.4k
        _index_type = IndexType::NGRAM_BF;
890
15.4k
        break;
891
172k
    }
892
172k
    if (index.__isset.properties) {
893
172k
        for (auto kv : index.properties) {
894
110k
            _properties[kv.first] = kv.second;
895
110k
        }
896
172k
    }
897
172k
}
898
899
void TabletIndex::init_from_thrift(const TOlapTableIndex& index,
900
10.5k
                                   const std::vector<int32_t>& column_uids) {
901
10.5k
    _index_id = index.index_id;
902
10.5k
    _index_name = index.index_name;
903
10.5k
    _col_unique_ids = column_uids;
904
905
10.5k
    switch (index.index_type) {
906
0
    case TIndexType::BITMAP:
907
0
        _index_type = IndexType::BITMAP;
908
0
        break;
909
10.0k
    case TIndexType::INVERTED:
910
10.0k
        _index_type = IndexType::INVERTED;
911
10.0k
        break;
912
95
    case TIndexType::ANN:
913
95
        _index_type = IndexType::ANN;
914
95
        break;
915
0
    case TIndexType::BLOOMFILTER:
916
0
        _index_type = IndexType::BLOOMFILTER;
917
0
        break;
918
467
    case TIndexType::NGRAM_BF:
919
467
        _index_type = IndexType::NGRAM_BF;
920
467
        break;
921
10.5k
    }
922
10.5k
    if (index.__isset.properties) {
923
11.9k
        for (auto kv : index.properties) {
924
11.9k
            _properties[kv.first] = kv.second;
925
11.9k
        }
926
10.5k
    }
927
10.5k
}
928
929
981k
void TabletIndex::init_from_pb(const TabletIndexPB& index) {
930
981k
    _index_id = index.index_id();
931
981k
    _index_name = index.index_name();
932
981k
    _col_unique_ids.clear();
933
982k
    for (auto col_unique_id : index.col_unique_id()) {
934
982k
        _col_unique_ids.push_back(col_unique_id);
935
982k
    }
936
981k
    _index_type = index.index_type();
937
981k
    for (const auto& kv : index.properties()) {
938
679k
        _properties[kv.first] = kv.second;
939
679k
    }
940
981k
    _escaped_index_suffix_path = index.index_suffix_name();
941
981k
}
942
943
2.01M
void TabletIndex::to_schema_pb(TabletIndexPB* index) const {
944
2.01M
    index->set_index_id(_index_id);
945
2.01M
    index->set_index_name(_index_name);
946
2.01M
    index->clear_col_unique_id();
947
2.02M
    for (auto col_unique_id : _col_unique_ids) {
948
2.02M
        index->add_col_unique_id(col_unique_id);
949
2.02M
    }
950
2.01M
    index->set_index_type(_index_type);
951
2.01M
    for (const auto& kv : _properties) {
952
1.69M
        DBUG_EXECUTE_IF("tablet_schema.to_schema_pb", {
953
1.69M
            if (kv.first == INVERTED_INDEX_PARSER_LOWERCASE_KEY) {
954
1.69M
                continue;
955
1.69M
            }
956
1.69M
        })
957
1.69M
        (*index->mutable_properties())[kv.first] = kv.second;
958
1.69M
    }
959
2.01M
    index->set_index_suffix_name(_escaped_index_suffix_path);
960
961
2.01M
    DBUG_EXECUTE_IF("tablet_schema.to_schema_pb", { return; })
962
963
    // Only add lower_case=true default for built-in analyzers/parsers, NOT for custom analyzers
964
    // Custom analyzer: lower_case is determined by analyzer's internal token filter
965
2.01M
    if (!_properties.empty() && !_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) {
966
41.1k
        bool has_parser = _properties.contains(INVERTED_INDEX_PARSER_KEY) ||
967
41.1k
                          _properties.contains(INVERTED_INDEX_PARSER_KEY_ALIAS);
968
41.1k
        std::string analyzer_name = get_analyzer_name_from_properties(_properties);
969
41.1k
        bool is_builtin = analyzer_name.empty() ||
970
41.1k
                          segment_v2::inverted_index::InvertedIndexAnalyzer::is_builtin_analyzer(
971
2.12k
                                  analyzer_name);
972
41.1k
        if (has_parser || is_builtin) {
973
39.0k
            (*index->mutable_properties())[INVERTED_INDEX_PARSER_LOWERCASE_KEY] =
974
39.0k
                    INVERTED_INDEX_PARSER_TRUE;
975
39.0k
        }
976
41.1k
    }
977
2.01M
}
978
979
2.16M
TabletSchema::TabletSchema() = default;
980
981
2.13M
TabletSchema::~TabletSchema() {}
982
983
1.14M
int64_t TabletSchema::get_metadata_size() const {
984
1.14M
    return sizeof(TabletSchema);
985
1.14M
}
986
987
5.81M
void TabletSchema::append_column(TabletColumn column, ColumnType col_type) {
988
5.81M
    if (column.is_key()) {
989
1.43M
        _num_key_columns++;
990
1.43M
    }
991
5.81M
    if (column.is_nullable()) {
992
3.39M
        _num_null_columns++;
993
3.39M
    }
994
5.81M
    if (column.is_variant_type()) {
995
28.0k
        ++_num_variant_columns;
996
28.0k
        if (!column.has_path_info()) {
997
23
            const std::string& col_name = column.name_lower_case();
998
23
            PathInData path(col_name);
999
23
            column.set_path_info(path);
1000
23
        }
1001
28.0k
    }
1002
5.81M
    if (UNLIKELY(column.name() == DELETE_SIGN)) {
1003
148k
        _delete_sign_idx = _num_columns;
1004
5.66M
    } else if (UNLIKELY(column.name() == SEQUENCE_COL)) {
1005
4.44k
        _sequence_col_idx = _num_columns;
1006
5.65M
    } else if (UNLIKELY(column.name() == VERSION_COL)) {
1007
148k
        _version_col_idx = _num_columns;
1008
5.51M
    } else if (UNLIKELY(column.name() == SKIP_BITMAP_COL)) {
1009
771
        _skip_bitmap_col_idx = _num_columns;
1010
5.51M
    } else if (UNLIKELY(column.name().starts_with(BeConsts::VIRTUAL_COLUMN_PREFIX))) {
1011
380
        _vir_col_idx_to_unique_id[_num_columns] = column.unique_id();
1012
380
    }
1013
5.81M
    _field_uniqueid_to_index[column.unique_id()] = _num_columns;
1014
5.81M
    _cols.push_back(std::make_shared<TabletColumn>(std::move(column)));
1015
    // The dropped column may have same name with exsiting column, so that
1016
    // not add to name to index map, only for uid to index map
1017
5.82M
    if (col_type == ColumnType::VARIANT || _cols.back()->is_variant_type() ||
1018
5.81M
        _cols.back()->is_extracted_column()) {
1019
33.5k
        _field_name_to_index.emplace(StringRef(_cols.back()->name()), _num_columns);
1020
33.5k
        _field_path_to_index[_cols.back()->path_info_ptr().get()] = _num_columns;
1021
5.79M
    } else if (col_type == ColumnType::NORMAL) {
1022
5.79M
        _field_name_to_index.emplace(StringRef(_cols.back()->name()), _num_columns);
1023
5.79M
    }
1024
5.81M
    _num_columns++;
1025
5.81M
    _num_virtual_columns = _vir_col_idx_to_unique_id.size();
1026
    // generate column index mapping for seq map
1027
5.81M
    if (_seq_col_uid_to_value_cols_uid.contains(column.unique_id())) {
1028
19
        const auto seq_idx = _field_uniqueid_to_index[column.unique_id()];
1029
19
        if (!_seq_col_idx_to_value_cols_idx.contains(seq_idx)) {
1030
15
            _seq_col_idx_to_value_cols_idx[seq_idx] = {};
1031
15
        }
1032
19
    }
1033
5.81M
    if (_value_col_uid_to_seq_col_uid.contains(column.unique_id())) {
1034
46
        const auto seq_uid = _value_col_uid_to_seq_col_uid[column.unique_id()];
1035
46
        if (_field_uniqueid_to_index.contains(seq_uid)) {
1036
22
            bool all_uid_index_found = true;
1037
22
            std::vector<int32_t> value_cols_index;
1038
31
            for (const auto value_col_uid : _seq_col_uid_to_value_cols_uid[seq_uid]) {
1039
31
                if (!_field_uniqueid_to_index.contains(value_col_uid)) {
1040
3
                    all_uid_index_found = false;
1041
3
                    break;
1042
3
                }
1043
28
                value_cols_index.push_back(_field_uniqueid_to_index[value_col_uid]);
1044
28
            }
1045
22
            if (all_uid_index_found) {
1046
19
                const auto seq_idx = _field_uniqueid_to_index[seq_uid];
1047
27
                for (const auto col_idx : value_cols_index) {
1048
27
                    _seq_col_idx_to_value_cols_idx[seq_idx].push_back(col_idx);
1049
27
                    _value_col_idx_to_seq_col_idx[col_idx] = seq_idx;
1050
27
                }
1051
19
                _value_col_idx_to_seq_col_idx[seq_idx] = seq_idx;
1052
19
            }
1053
22
        }
1054
46
    }
1055
5.81M
}
1056
1057
1.63k
void TabletSchema::append_index(TabletIndex&& index) {
1058
1.63k
    size_t index_pos = _indexes.size();
1059
1.63k
    _indexes.push_back(std::make_shared<TabletIndex>(index));
1060
1.63k
    for (int32_t id : _indexes.back()->col_unique_ids()) {
1061
1.63k
        if (auto field_pattern = _indexes.back()->field_pattern(); !field_pattern.empty()) {
1062
6
            auto& pattern_to_index_map = _index_by_unique_id_with_pattern[id];
1063
6
            pattern_to_index_map[field_pattern].emplace_back(_indexes.back());
1064
1.63k
        } else {
1065
1.63k
            IndexKey key = std::make_tuple(_indexes.back()->index_type(), id,
1066
1.63k
                                           _indexes.back()->get_index_suffix());
1067
1.63k
            _col_id_suffix_to_index[key].push_back(index_pos);
1068
1.63k
        }
1069
1.63k
    }
1070
1.63k
}
1071
1072
0
void TabletSchema::replace_column(size_t pos, TabletColumn new_col) {
1073
0
    CHECK_LT(pos, num_columns()) << " outof range";
1074
0
    _cols[pos] = std::make_shared<TabletColumn>(std::move(new_col));
1075
0
}
1076
1077
734
void TabletSchema::clear_index() {
1078
734
    _indexes.clear();
1079
734
    _col_id_suffix_to_index.clear();
1080
734
    _index_by_unique_id_with_pattern.clear();
1081
734
}
1082
1083
7
void TabletSchema::remove_index(int64_t index_id) {
1084
7
    std::vector<TabletIndexPtr> new_indexes;
1085
11
    for (auto& index : _indexes) {
1086
11
        if (index->index_id() != index_id) {
1087
4
            new_indexes.emplace_back(std::move(index));
1088
4
        }
1089
11
    }
1090
7
    _indexes = std::move(new_indexes);
1091
7
    _col_id_suffix_to_index.clear();
1092
7
    _index_by_unique_id_with_pattern.clear();
1093
11
    for (size_t new_pos = 0; new_pos < _indexes.size(); ++new_pos) {
1094
4
        const auto& index = _indexes[new_pos];
1095
4
        for (int32_t col_uid : index->col_unique_ids()) {
1096
4
            if (auto field_pattern = index->field_pattern(); !field_pattern.empty()) {
1097
0
                auto& pattern_to_index_map = _index_by_unique_id_with_pattern[col_uid];
1098
0
                pattern_to_index_map[field_pattern].emplace_back(index);
1099
4
            } else {
1100
4
                IndexKey key = std::make_tuple(_indexes.back()->index_type(), col_uid,
1101
4
                                               _indexes.back()->get_index_suffix());
1102
4
                _col_id_suffix_to_index[key].push_back(new_pos);
1103
4
            }
1104
4
        }
1105
4
    }
1106
7
}
1107
1108
394k
void TabletSchema::clear_columns() {
1109
394k
    _field_path_to_index.clear();
1110
394k
    _field_name_to_index.clear();
1111
394k
    _field_uniqueid_to_index.clear();
1112
394k
    _num_columns = 0;
1113
394k
    _num_variant_columns = 0;
1114
394k
    _num_null_columns = 0;
1115
394k
    _num_key_columns = 0;
1116
394k
    _seq_col_idx_to_value_cols_idx.clear();
1117
394k
    _value_col_idx_to_seq_col_idx.clear();
1118
394k
    _cols.clear();
1119
394k
}
1120
1121
void TabletSchema::init_from_pb(const TabletSchemaPB& schema, bool ignore_extracted_columns,
1122
959k
                                bool reuse_cache_column) {
1123
959k
    _keys_type = schema.keys_type();
1124
959k
    _num_columns = 0;
1125
959k
    _num_variant_columns = 0;
1126
959k
    _num_key_columns = 0;
1127
959k
    _num_null_columns = 0;
1128
959k
    _cols.clear();
1129
959k
    _indexes.clear();
1130
959k
    _index_by_unique_id_with_pattern.clear();
1131
959k
    _col_id_suffix_to_index.clear();
1132
959k
    _field_name_to_index.clear();
1133
959k
    _field_uniqueid_to_index.clear();
1134
959k
    _cluster_key_uids.clear();
1135
959k
    for (const auto& i : schema.cluster_key_uids()) {
1136
39.8k
        _cluster_key_uids.push_back(i);
1137
39.8k
    }
1138
12.1M
    for (auto& column_pb : schema.column()) {
1139
12.1M
        TabletColumnPtr column;
1140
12.1M
        if (reuse_cache_column) {
1141
502k
            auto pair = TabletColumnObjectPool::instance()->insert(
1142
502k
                    deterministic_string_serialize(column_pb));
1143
502k
            column = pair.second;
1144
            // Release the handle quickly, because we use shared ptr to manage column.
1145
            // It often core during tablet schema copy to another schema because handle's
1146
            // reference count should be managed mannually.
1147
502k
            TabletColumnObjectPool::instance()->release(pair.first);
1148
11.6M
        } else {
1149
11.6M
            column = std::make_shared<TabletColumn>();
1150
11.6M
            column->init_from_pb(column_pb);
1151
11.6M
        }
1152
12.1M
        if (ignore_extracted_columns && column->is_extracted_column()) {
1153
0
            continue;
1154
0
        }
1155
12.1M
        if (column->is_key()) {
1156
2.06M
            _num_key_columns++;
1157
2.06M
        }
1158
12.1M
        if (column->is_nullable()) {
1159
6.92M
            _num_null_columns++;
1160
6.92M
        }
1161
12.1M
        if (column->is_variant_type()) {
1162
39.9k
            ++_num_variant_columns;
1163
39.9k
        }
1164
1165
12.1M
        _cols.emplace_back(std::move(column));
1166
12.1M
        if (!_cols.back()->is_extracted_column()) {
1167
12.1M
            _field_name_to_index.emplace(StringRef(_cols.back()->name()), _num_columns);
1168
12.1M
            _field_uniqueid_to_index[_cols.back()->unique_id()] = _num_columns;
1169
12.1M
        }
1170
12.1M
        _num_columns++;
1171
12.1M
    }
1172
1.00M
    for (const auto& index_pb : schema.index()) {
1173
1.00M
        TabletIndexPtr index;
1174
1.00M
        if (reuse_cache_column) {
1175
43.4k
            auto pair = TabletColumnObjectPool::instance()->insert_index(
1176
43.4k
                    deterministic_string_serialize(index_pb));
1177
43.4k
            index = pair.second;
1178
            //  Only need the value to be cached by the pool, release it quickly because the handle need
1179
            // record reference count mannually, or it will core during tablet schema copy method.
1180
43.4k
            TabletColumnObjectPool::instance()->release(pair.first);
1181
963k
        } else {
1182
963k
            index = std::make_shared<TabletIndex>();
1183
963k
            index->init_from_pb(index_pb);
1184
963k
        }
1185
1.00M
        size_t index_pos = _indexes.size();
1186
1.00M
        _indexes.emplace_back(std::move(index));
1187
1.00M
        for (int32_t col_uid : _indexes.back()->col_unique_ids()) {
1188
1.00M
            if (auto field_pattern = _indexes.back()->field_pattern(); !field_pattern.empty()) {
1189
24.0k
                auto& pattern_to_index_map = _index_by_unique_id_with_pattern[col_uid];
1190
24.0k
                pattern_to_index_map[field_pattern].emplace_back(_indexes.back());
1191
982k
            } else {
1192
982k
                IndexKey key = std::make_tuple(_indexes.back()->index_type(), col_uid,
1193
982k
                                               _indexes.back()->get_index_suffix());
1194
982k
                _col_id_suffix_to_index[key].push_back(index_pos);
1195
982k
            }
1196
1.00M
        }
1197
1.00M
    }
1198
959k
    _num_short_key_columns = schema.num_short_key_columns();
1199
959k
    _num_rows_per_row_block = schema.num_rows_per_row_block();
1200
959k
    _compress_kind = schema.compress_kind();
1201
959k
    _next_column_unique_id = schema.next_column_unique_id();
1202
959k
    if (schema.has_bf_fpp()) {
1203
520k
        _has_bf_fpp = true;
1204
520k
        _bf_fpp = schema.bf_fpp();
1205
520k
    } else {
1206
439k
        _has_bf_fpp = false;
1207
439k
        _bf_fpp = BLOOM_FILTER_DEFAULT_FPP;
1208
439k
    }
1209
959k
    _is_in_memory = schema.is_in_memory();
1210
959k
    _disable_auto_compaction = schema.disable_auto_compaction();
1211
959k
    _enable_single_replica_compaction = schema.enable_single_replica_compaction();
1212
959k
    _store_row_column = schema.store_row_column();
1213
959k
    _skip_write_index_on_load = schema.skip_write_index_on_load();
1214
959k
    _delete_sign_idx = schema.delete_sign_idx();
1215
959k
    _sequence_col_idx = schema.sequence_col_idx();
1216
959k
    _version_col_idx = schema.version_col_idx();
1217
959k
    _skip_bitmap_col_idx = schema.skip_bitmap_col_idx();
1218
959k
    _sort_type = schema.sort_type();
1219
959k
    _sort_col_num = schema.sort_col_num();
1220
959k
    _compression_type = schema.compression_type();
1221
959k
    _row_store_page_size = schema.row_store_page_size();
1222
959k
    _storage_page_size = schema.storage_page_size();
1223
959k
    _storage_dict_page_size = schema.storage_dict_page_size();
1224
959k
    _schema_version = schema.schema_version();
1225
959k
    if (schema.has_seq_map()) {
1226
958k
        auto column_groups_pb = schema.seq_map();
1227
958k
        _seq_col_uid_to_value_cols_uid.clear();
1228
958k
        _value_col_uid_to_seq_col_uid.clear();
1229
958k
        _seq_col_idx_to_value_cols_idx.clear();
1230
958k
        _value_col_idx_to_seq_col_idx.clear();
1231
        /*
1232
         * ColumnGroupsPB is a list of cg_pb, and
1233
         * ColumnGroupsPB do not have begin() or end() method.
1234
         * we must use for(i=0;i<xx;i++) loop
1235
         */
1236
959k
        for (int i = 0; i < column_groups_pb.cg_size(); i++) {
1237
278
            ColumnGroupPB cg_pb = column_groups_pb.cg(i);
1238
278
            uint32_t key_uid = cg_pb.sequence_column();
1239
278
            auto found = _field_uniqueid_to_index.find(key_uid);
1240
278
            DCHECK(found != _field_uniqueid_to_index.end())
1241
0
                    << "could not find sequence col with unique id = " << key_uid
1242
0
                    << " table_id=" << _table_id;
1243
278
            int32_t seq_index = found->second;
1244
278
            _seq_col_uid_to_value_cols_uid[key_uid] = {};
1245
278
            _seq_col_idx_to_value_cols_idx[seq_index] = {};
1246
415
            for (auto val_uid : cg_pb.columns_in_group()) {
1247
415
                _seq_col_uid_to_value_cols_uid[key_uid].push_back(val_uid);
1248
415
                found = _field_uniqueid_to_index.find(val_uid);
1249
415
                DCHECK(found != _field_uniqueid_to_index.end())
1250
0
                        << "could not find value col with unique id = " << key_uid
1251
0
                        << " table_id=" << _table_id;
1252
415
                int32_t val_index = found->second;
1253
415
                _seq_col_idx_to_value_cols_idx[seq_index].push_back(val_index);
1254
415
            }
1255
278
        }
1256
1257
958k
        if (!_seq_col_uid_to_value_cols_uid.empty()) {
1258
            /*
1259
                |** KEY **|        ** VALUE **     |
1260
                ------------------------------------
1261
                |** KEY **|  CDE is value| sequence|
1262
                |----|----|----|----|----|----|----|
1263
                A    B    C    D    E   S1   S2
1264
                0    1    2    3    4    5    6
1265
                for example: _seq_map is {5:{2,3}, 6:{4}}
1266
                then, _value_to_seq = {2:5,3:5,5:5,4:6,6:6}
1267
            */
1268
278
            for (auto& [seq_uid, cols_uid] : _seq_col_uid_to_value_cols_uid) {
1269
415
                for (auto col_uid : cols_uid) {
1270
415
                    _value_col_uid_to_seq_col_uid[col_uid] = seq_uid;
1271
415
                }
1272
278
                _value_col_uid_to_seq_col_uid[seq_uid] = seq_uid;
1273
278
            }
1274
1275
278
            for (auto& [seq_idx, value_cols_idx] : _seq_col_idx_to_value_cols_idx) {
1276
415
                for (auto col_idx : value_cols_idx) {
1277
415
                    _value_col_idx_to_seq_col_idx[col_idx] = seq_idx;
1278
415
                }
1279
278
                _value_col_idx_to_seq_col_idx[seq_idx] = seq_idx;
1280
278
            }
1281
195
        }
1282
958k
    }
1283
    // Default to V1 inverted index storage format for backward compatibility if not specified in schema.
1284
959k
    if (!schema.has_inverted_index_storage_format()) {
1285
295
        _inverted_index_storage_format = InvertedIndexStorageFormatPB::V1;
1286
959k
    } else {
1287
959k
        _inverted_index_storage_format = schema.inverted_index_storage_format();
1288
959k
    }
1289
1290
959k
    _row_store_column_unique_ids.assign(schema.row_store_column_unique_ids().begin(),
1291
959k
                                        schema.row_store_column_unique_ids().end());
1292
959k
    _deprecated_enable_variant_flatten_nested = schema.enable_variant_flatten_nested();
1293
959k
    if (schema.has_is_external_segment_column_meta_used()) {
1294
951k
        _is_external_segment_column_meta_used = schema.is_external_segment_column_meta_used();
1295
951k
    } else {
1296
7.43k
        _is_external_segment_column_meta_used = false;
1297
7.43k
    }
1298
959k
    if (schema.has_integer_type_default_use_plain_encoding()) {
1299
783k
        _integer_type_default_use_plain_encoding = schema.integer_type_default_use_plain_encoding();
1300
783k
    }
1301
959k
    if (schema.has_binary_plain_encoding_default_impl()) {
1302
783k
        _binary_plain_encoding_default_impl = schema.binary_plain_encoding_default_impl();
1303
783k
    }
1304
959k
    update_metadata_size();
1305
959k
}
1306
1307
212k
void TabletSchema::copy_from(const TabletSchema& tablet_schema) {
1308
212k
    TabletSchemaPB tablet_schema_pb;
1309
212k
    tablet_schema.to_schema_pb(&tablet_schema_pb);
1310
212k
    init_from_pb(tablet_schema_pb);
1311
212k
    _table_id = tablet_schema.table_id();
1312
212k
    _path_set_info_map = tablet_schema._path_set_info_map;
1313
212k
}
1314
1315
203k
void TabletSchema::shawdow_copy_without_columns(const TabletSchema& tablet_schema) {
1316
203k
    *this = tablet_schema;
1317
203k
    _field_path_to_index.clear();
1318
203k
    _field_name_to_index.clear();
1319
203k
    _field_uniqueid_to_index.clear();
1320
203k
    _num_columns = 0;
1321
203k
    _num_variant_columns = 0;
1322
203k
    _num_null_columns = 0;
1323
203k
    _num_key_columns = 0;
1324
203k
    _cols.clear();
1325
203k
    _delete_sign_idx = -1;
1326
203k
    _sequence_col_idx = -1;
1327
203k
    _version_col_idx = -1;
1328
203k
}
1329
1330
0
void TabletSchema::update_index_info_from(const TabletSchema& tablet_schema) {
1331
0
    for (auto& col : _cols) {
1332
0
        if (col->unique_id() < 0) {
1333
0
            continue;
1334
0
        }
1335
0
        const auto iter = tablet_schema._field_uniqueid_to_index.find(col->unique_id());
1336
0
        if (iter == tablet_schema._field_uniqueid_to_index.end()) {
1337
0
            continue;
1338
0
        }
1339
0
        auto col_idx = iter->second;
1340
0
        if (col_idx < 0 || col_idx >= tablet_schema._cols.size()) {
1341
0
            continue;
1342
0
        }
1343
0
        col->set_is_bf_column(tablet_schema._cols[col_idx]->is_bf_column());
1344
0
    }
1345
0
}
1346
1347
1.13M
std::string TabletSchema::to_key() const {
1348
1.13M
    TabletSchemaPB pb;
1349
1.13M
    to_schema_pb(&pb);
1350
1.13M
    return TabletSchema::deterministic_string_serialize(pb);
1351
1.13M
}
1352
1353
void TabletSchema::build_current_tablet_schema(int64_t index_id, int32_t version,
1354
                                               const OlapTableIndexSchema* index,
1355
187k
                                               const TabletSchema& ori_tablet_schema) {
1356
    // copy from ori_tablet_schema
1357
187k
    _keys_type = ori_tablet_schema.keys_type();
1358
187k
    _num_short_key_columns = ori_tablet_schema.num_short_key_columns();
1359
187k
    _num_rows_per_row_block = ori_tablet_schema.num_rows_per_row_block();
1360
187k
    _compress_kind = ori_tablet_schema.compress_kind();
1361
1362
    // todo(yixiu): unique_id
1363
187k
    _next_column_unique_id = ori_tablet_schema.next_column_unique_id();
1364
187k
    _is_in_memory = ori_tablet_schema.is_in_memory();
1365
187k
    _disable_auto_compaction = ori_tablet_schema.disable_auto_compaction();
1366
187k
    _enable_single_replica_compaction = ori_tablet_schema.enable_single_replica_compaction();
1367
187k
    _skip_write_index_on_load = ori_tablet_schema.skip_write_index_on_load();
1368
187k
    _sort_type = ori_tablet_schema.sort_type();
1369
187k
    _sort_col_num = ori_tablet_schema.sort_col_num();
1370
187k
    _row_store_page_size = ori_tablet_schema.row_store_page_size();
1371
187k
    _storage_page_size = ori_tablet_schema.storage_page_size();
1372
187k
    _storage_dict_page_size = ori_tablet_schema.storage_dict_page_size();
1373
187k
    _deprecated_enable_variant_flatten_nested =
1374
187k
            ori_tablet_schema.deprecated_variant_flatten_nested();
1375
1376
    // copy from table_schema_param
1377
187k
    _schema_version = version;
1378
187k
    _num_columns = 0;
1379
187k
    _num_variant_columns = 0;
1380
187k
    _num_key_columns = 0;
1381
187k
    _num_null_columns = 0;
1382
187k
    bool has_bf_columns = false;
1383
187k
    _cols.clear();
1384
187k
    _indexes.clear();
1385
187k
    _col_id_suffix_to_index.clear();
1386
187k
    _index_by_unique_id_with_pattern.clear();
1387
187k
    _field_name_to_index.clear();
1388
187k
    _field_uniqueid_to_index.clear();
1389
187k
    _delete_sign_idx = -1;
1390
187k
    _sequence_col_idx = -1;
1391
187k
    _version_col_idx = -1;
1392
187k
    _skip_bitmap_col_idx = -1;
1393
187k
    _cluster_key_uids.clear();
1394
187k
    for (const auto& i : ori_tablet_schema._cluster_key_uids) {
1395
6.22k
        _cluster_key_uids.push_back(i);
1396
6.22k
    }
1397
2.08M
    for (auto& column : index->columns) {
1398
2.08M
        if (column->is_key()) {
1399
513k
            _num_key_columns++;
1400
513k
        }
1401
2.08M
        if (column->is_nullable()) {
1402
1.15M
            _num_null_columns++;
1403
1.15M
        }
1404
2.08M
        if (column->is_bf_column()) {
1405
8.49k
            has_bf_columns = true;
1406
8.49k
        }
1407
2.08M
        if (column->is_variant_type()) {
1408
11.2k
            ++_num_variant_columns;
1409
11.2k
        }
1410
2.08M
        if (UNLIKELY(column->name() == DELETE_SIGN)) {
1411
60.5k
            _delete_sign_idx = _num_columns;
1412
2.02M
        } else if (UNLIKELY(column->name() == SEQUENCE_COL)) {
1413
3.82k
            _sequence_col_idx = _num_columns;
1414
2.02M
        } else if (UNLIKELY(column->name() == VERSION_COL)) {
1415
60.2k
            _version_col_idx = _num_columns;
1416
1.96M
        } else if (UNLIKELY(column->name() == SKIP_BITMAP_COL)) {
1417
339
            _skip_bitmap_col_idx = _num_columns;
1418
339
        }
1419
        // Reuse TabletColumn object from pool to reduce memory consumption
1420
2.08M
        TabletColumnPtr new_column;
1421
2.08M
        ColumnPB column_pb;
1422
2.08M
        column->to_schema_pb(&column_pb);
1423
2.08M
        auto pair = TabletColumnObjectPool::instance()->insert(
1424
2.08M
                deterministic_string_serialize(column_pb));
1425
2.08M
        new_column = pair.second;
1426
        // Release the handle quickly, because we use shared ptr to manage column
1427
2.08M
        TabletColumnObjectPool::instance()->release(pair.first);
1428
2.08M
        _cols.emplace_back(std::move(new_column));
1429
2.08M
        _field_name_to_index.emplace(StringRef(_cols.back()->name()), _num_columns);
1430
2.08M
        _field_uniqueid_to_index[_cols.back()->unique_id()] = _num_columns;
1431
2.08M
        _num_columns++;
1432
2.08M
    }
1433
1434
187k
    for (const auto& i : index->indexes) {
1435
126k
        size_t index_pos = _indexes.size();
1436
        // Reuse TabletIndex object from pool to reduce memory consumption
1437
126k
        TabletIndexPtr new_index;
1438
126k
        TabletIndexPB index_pb;
1439
126k
        i->to_schema_pb(&index_pb);
1440
126k
        auto pair = TabletColumnObjectPool::instance()->insert_index(
1441
126k
                deterministic_string_serialize(index_pb));
1442
126k
        new_index = pair.second;
1443
        // Release the handle quickly, because we use shared ptr to manage index
1444
126k
        TabletColumnObjectPool::instance()->release(pair.first);
1445
126k
        _indexes.emplace_back(std::move(new_index));
1446
127k
        for (int32_t col_uid : _indexes.back()->col_unique_ids()) {
1447
127k
            if (auto field_pattern = _indexes.back()->field_pattern(); !field_pattern.empty()) {
1448
1.79k
                auto& pattern_to_index_map = _index_by_unique_id_with_pattern[col_uid];
1449
1.79k
                pattern_to_index_map[field_pattern].emplace_back(_indexes.back());
1450
125k
            } else {
1451
125k
                IndexKey key = std::make_tuple(_indexes.back()->index_type(), col_uid,
1452
125k
                                               _indexes.back()->get_index_suffix());
1453
125k
                _col_id_suffix_to_index[key].push_back(index_pos);
1454
125k
            }
1455
127k
        }
1456
126k
    }
1457
1458
187k
    if (has_bf_columns) {
1459
7.92k
        _has_bf_fpp = true;
1460
7.92k
        _bf_fpp = ori_tablet_schema.bloom_filter_fpp();
1461
179k
    } else {
1462
179k
        _has_bf_fpp = false;
1463
179k
        _bf_fpp = BLOOM_FILTER_DEFAULT_FPP;
1464
179k
    }
1465
187k
}
1466
1467
7.88k
void TabletSchema::merge_dropped_columns(const TabletSchema& src_schema) {
1468
    // If they are the same tablet schema object, then just return
1469
7.88k
    if (this == &src_schema) {
1470
0
        return;
1471
0
    }
1472
119k
    for (const auto& src_col : src_schema.columns()) {
1473
119k
        if (_field_uniqueid_to_index.find(src_col->unique_id()) == _field_uniqueid_to_index.end()) {
1474
51
            CHECK(!src_col->is_key())
1475
0
                    << src_col->name() << " is key column, should not be dropped.";
1476
51
            ColumnPB src_col_pb;
1477
            // There are some pointer in tablet column, not sure the reference relation, so
1478
            // that deep copy it.
1479
51
            src_col->to_schema_pb(&src_col_pb);
1480
51
            TabletColumn new_col(src_col_pb);
1481
51
            append_column(new_col, TabletSchema::ColumnType::DROPPED);
1482
51
        }
1483
119k
    }
1484
7.88k
}
1485
1486
8.51k
TabletSchemaSPtr TabletSchema::copy_without_variant_extracted_columns() {
1487
8.51k
    TabletSchemaSPtr copy = std::make_shared<TabletSchema>();
1488
8.51k
    copy->shawdow_copy_without_columns(*this);
1489
59.8k
    for (auto& col : this->columns()) {
1490
59.8k
        if (col->is_extracted_column()) {
1491
4.36k
            continue;
1492
4.36k
        }
1493
55.4k
        copy->append_column(*col);
1494
55.4k
    }
1495
8.51k
    return copy;
1496
8.51k
}
1497
1498
// Dropped column is in _field_uniqueid_to_index but not in _field_name_to_index
1499
// Could refer to append_column method
1500
533k
bool TabletSchema::is_dropped_column(const TabletColumn& col) const {
1501
18.4E
    CHECK(_field_uniqueid_to_index.find(col.unique_id()) != _field_uniqueid_to_index.end())
1502
18.4E
            << "could not find col with unique id = " << col.unique_id()
1503
18.4E
            << " and name = " << col.name() << " table_id=" << _table_id;
1504
533k
    auto it = _field_name_to_index.find(StringRef {col.name()});
1505
535k
    return it == _field_name_to_index.end() || _cols[it->second]->unique_id() != col.unique_id();
1506
533k
}
1507
1508
108
void TabletSchema::copy_extracted_columns(const TabletSchema& src_schema) {
1509
108
    std::unordered_set<int32_t> variant_columns;
1510
364
    for (const auto& col : columns()) {
1511
364
        if (col->is_variant_type()) {
1512
225
            variant_columns.insert(col->unique_id());
1513
225
        }
1514
364
    }
1515
303
    for (const TabletColumnPtr& col : src_schema.columns()) {
1516
303
        if (col->is_extracted_column() && variant_columns.contains(col->parent_unique_id())) {
1517
0
            ColumnPB col_pb;
1518
0
            col->to_schema_pb(&col_pb);
1519
0
            TabletColumn new_col(col_pb);
1520
0
            append_column(new_col, ColumnType::VARIANT);
1521
0
        }
1522
303
    }
1523
108
}
1524
1525
107
void TabletSchema::reserve_extracted_columns() {
1526
650
    for (auto it = _cols.begin(); it != _cols.end();) {
1527
543
        if (!(*it)->is_extracted_column()) {
1528
219
            it = _cols.erase(it);
1529
324
        } else {
1530
324
            ++it;
1531
324
        }
1532
543
    }
1533
107
}
1534
1535
1.93M
void TabletSchema::to_schema_pb(TabletSchemaPB* tablet_schema_pb) const {
1536
1.93M
    for (const auto& i : _cluster_key_uids) {
1537
65.0k
        tablet_schema_pb->add_cluster_key_uids(i);
1538
65.0k
    }
1539
1.93M
    tablet_schema_pb->set_keys_type(_keys_type);
1540
24.5M
    for (const auto& col : _cols) {
1541
24.5M
        ColumnPB* column = tablet_schema_pb->add_column();
1542
24.5M
        col->to_schema_pb(column);
1543
24.5M
    }
1544
1.93M
    for (const auto& index : _indexes) {
1545
1.88M
        auto* index_pb = tablet_schema_pb->add_index();
1546
1.88M
        index->to_schema_pb(index_pb);
1547
1.88M
    }
1548
1.93M
    tablet_schema_pb->set_num_short_key_columns(cast_set<int32_t>(_num_short_key_columns));
1549
1.93M
    tablet_schema_pb->set_num_rows_per_row_block(cast_set<int32_t>(_num_rows_per_row_block));
1550
1.93M
    tablet_schema_pb->set_compress_kind(_compress_kind);
1551
1.93M
    if (_has_bf_fpp) {
1552
606k
        tablet_schema_pb->set_bf_fpp(_bf_fpp);
1553
606k
    }
1554
1.93M
    tablet_schema_pb->set_next_column_unique_id(cast_set<uint32_t>(_next_column_unique_id));
1555
1.93M
    tablet_schema_pb->set_is_in_memory(_is_in_memory);
1556
1.93M
    tablet_schema_pb->set_disable_auto_compaction(_disable_auto_compaction);
1557
1.93M
    tablet_schema_pb->set_enable_single_replica_compaction(_enable_single_replica_compaction);
1558
1.93M
    tablet_schema_pb->set_store_row_column(_store_row_column);
1559
1.93M
    tablet_schema_pb->set_skip_write_index_on_load(_skip_write_index_on_load);
1560
1.93M
    tablet_schema_pb->set_delete_sign_idx(_delete_sign_idx);
1561
1.93M
    tablet_schema_pb->set_sequence_col_idx(_sequence_col_idx);
1562
1.93M
    tablet_schema_pb->set_sort_type(_sort_type);
1563
1.93M
    tablet_schema_pb->set_sort_col_num(cast_set<int32_t>(_sort_col_num));
1564
1.93M
    tablet_schema_pb->set_schema_version(_schema_version);
1565
1.93M
    tablet_schema_pb->set_compression_type(_compression_type);
1566
1.93M
    tablet_schema_pb->set_row_store_page_size(_row_store_page_size);
1567
1.93M
    tablet_schema_pb->set_storage_page_size(_storage_page_size);
1568
1.93M
    tablet_schema_pb->set_storage_dict_page_size(_storage_dict_page_size);
1569
1.93M
    tablet_schema_pb->set_version_col_idx(_version_col_idx);
1570
1.93M
    tablet_schema_pb->set_skip_bitmap_col_idx(_skip_bitmap_col_idx);
1571
1.93M
    tablet_schema_pb->set_inverted_index_storage_format(_inverted_index_storage_format);
1572
1.93M
    tablet_schema_pb->mutable_row_store_column_unique_ids()->Assign(
1573
1.93M
            _row_store_column_unique_ids.begin(), _row_store_column_unique_ids.end());
1574
1.93M
    tablet_schema_pb->set_enable_variant_flatten_nested(_deprecated_enable_variant_flatten_nested);
1575
1.93M
    tablet_schema_pb->set_is_external_segment_column_meta_used(
1576
1.93M
            _is_external_segment_column_meta_used);
1577
1.93M
    tablet_schema_pb->set_integer_type_default_use_plain_encoding(
1578
1.93M
            _integer_type_default_use_plain_encoding);
1579
1.93M
    tablet_schema_pb->set_binary_plain_encoding_default_impl(_binary_plain_encoding_default_impl);
1580
1.93M
    auto column_groups_pb = tablet_schema_pb->mutable_seq_map();
1581
1.93M
    for (const auto& it : _seq_col_uid_to_value_cols_uid) {
1582
434
        uint32_t key = it.first;
1583
434
        ColumnGroupPB* cg_pb = column_groups_pb->add_cg(); // ColumnGroupPB {key: {v1, v2, v3}}
1584
434
        cg_pb->set_sequence_column(key);
1585
675
        for (auto v : it.second) {
1586
675
            cg_pb->add_columns_in_group(v);
1587
675
        }
1588
434
    }
1589
1.93M
}
1590
1591
12.3k
size_t TabletSchema::row_size() const {
1592
12.3k
    size_t size = 0;
1593
160k
    for (const auto& column : _cols) {
1594
160k
        size += column->length();
1595
160k
    }
1596
12.3k
    size += (_num_columns + 7) / 8;
1597
1598
12.3k
    return size;
1599
12.3k
}
1600
1601
11.2M
int32_t TabletSchema::field_index(const std::string& field_name) const {
1602
11.2M
    const auto& found = _field_name_to_index.find(StringRef(field_name));
1603
11.2M
    return (found == _field_name_to_index.end()) ? -1 : found->second;
1604
11.2M
}
1605
1606
15.6k
int32_t TabletSchema::field_index(const PathInData& path) const {
1607
15.6k
    const auto& found = _field_path_to_index.find(PathInDataRef(&path));
1608
15.6k
    return (found == _field_path_to_index.end()) ? -1 : found->second;
1609
15.6k
}
1610
1611
37.3M
int32_t TabletSchema::field_index(int32_t col_unique_id) const {
1612
37.3M
    const auto& found = _field_uniqueid_to_index.find(col_unique_id);
1613
37.3M
    return (found == _field_uniqueid_to_index.end()) ? -1 : found->second;
1614
37.3M
}
1615
1616
45.6M
const std::vector<TabletColumnPtr>& TabletSchema::columns() const {
1617
45.6M
    return _cols;
1618
45.6M
}
1619
1620
128M
const TabletColumn& TabletSchema::column(size_t ordinal) const {
1621
18.4E
    DCHECK(ordinal < _num_columns) << "ordinal:" << ordinal << ", _num_columns:" << _num_columns;
1622
128M
    return *_cols[ordinal];
1623
128M
}
1624
1625
1.94M
const TabletColumn& TabletSchema::column_by_uid(int32_t col_unique_id) const {
1626
1.94M
    return *_cols.at(_field_uniqueid_to_index.at(col_unique_id));
1627
1.94M
}
1628
1629
1
TabletColumn& TabletSchema::mutable_column_by_uid(int32_t col_unique_id) {
1630
1
    return *_cols.at(_field_uniqueid_to_index.at(col_unique_id));
1631
1
}
1632
1633
90.8k
TabletColumn& TabletSchema::mutable_column(size_t ordinal) {
1634
90.8k
    return *_cols.at(ordinal);
1635
90.8k
}
1636
1637
383k
void TabletSchema::update_indexes_from_thrift(const std::vector<doris::TOlapTableIndex>& tindexes) {
1638
383k
    std::vector<TabletIndexPtr> indexes;
1639
383k
    for (const auto& tindex : tindexes) {
1640
171k
        TabletIndex index;
1641
171k
        index.init_from_thrift(tindex, *this);
1642
171k
        indexes.emplace_back(std::make_shared<TabletIndex>(std::move(index)));
1643
171k
    }
1644
383k
    _indexes = std::move(indexes);
1645
383k
    _col_id_suffix_to_index.clear();
1646
383k
    _index_by_unique_id_with_pattern.clear();
1647
383k
    size_t index_pos = 0;
1648
383k
    for (auto& index : _indexes) {
1649
171k
        for (int32_t col_uid : index->col_unique_ids()) {
1650
171k
            if (auto field_pattern = index->field_pattern(); !field_pattern.empty()) {
1651
4.32k
                auto& pattern_to_index_map = _index_by_unique_id_with_pattern[col_uid];
1652
4.32k
                pattern_to_index_map[field_pattern].emplace_back(index);
1653
166k
            } else {
1654
166k
                IndexKey key =
1655
166k
                        std::make_tuple(index->index_type(), col_uid, index->get_index_suffix());
1656
166k
                _col_id_suffix_to_index[key].push_back(index_pos);
1657
166k
            }
1658
171k
        }
1659
171k
        index_pos++;
1660
171k
    }
1661
383k
}
1662
1663
5.98k
bool TabletSchema::exist_column(const std::string& field_name) const {
1664
5.98k
    return _field_name_to_index.contains(StringRef {field_name});
1665
5.98k
}
1666
1667
28.1M
bool TabletSchema::has_column_unique_id(int32_t col_unique_id) const {
1668
28.1M
    return _field_uniqueid_to_index.contains(col_unique_id);
1669
28.1M
}
1670
1671
4.20k
Status TabletSchema::have_column(const std::string& field_name) const {
1672
4.20k
    if (!_field_name_to_index.contains(StringRef(field_name))) {
1673
4.04k
        return Status::Error<ErrorCode::INTERNAL_ERROR>(
1674
4.04k
                "Not found field_name, field_name:{}, schema:{}", field_name,
1675
4.04k
                get_all_field_names());
1676
4.04k
    }
1677
154
    return Status::OK();
1678
4.20k
}
1679
1680
5.71k
Result<const TabletColumn*> TabletSchema::column(const std::string& field_name) const {
1681
5.71k
    auto it = _field_name_to_index.find(StringRef {field_name});
1682
5.71k
    if (it == _field_name_to_index.end()) {
1683
0
        DCHECK(false) << "field_name=" << field_name << ", table_id=" << _table_id
1684
0
                      << ", field_name_to_index=" << get_all_field_names();
1685
0
        return ResultError(
1686
0
                Status::InternalError("column not found, name={}, table_id={}, schema_version={}",
1687
0
                                      field_name, _table_id, _schema_version));
1688
0
    }
1689
5.71k
    return _cols[it->second].get();
1690
5.71k
}
1691
1692
void TabletSchema::update_tablet_columns(const TabletSchema& tablet_schema,
1693
13.8k
                                         const std::vector<TColumn>& t_columns) {
1694
13.8k
    copy_from(tablet_schema);
1695
13.8k
    if (!t_columns.empty() && t_columns[0].col_unique_id >= 0) {
1696
13.8k
        clear_columns();
1697
140k
        for (const auto& column : t_columns) {
1698
140k
            append_column(TabletColumn(column));
1699
140k
        }
1700
13.8k
    }
1701
13.8k
}
1702
1703
67
bool TabletSchema::has_inverted_index_with_index_id(int64_t index_id) const {
1704
86
    for (size_t i = 0; i < _indexes.size(); i++) {
1705
48
        if ((_indexes[i]->index_type() == IndexType::INVERTED ||
1706
48
             _indexes[i]->index_type() == IndexType::ANN) &&
1707
48
            _indexes[i]->index_id() == index_id) {
1708
29
            return true;
1709
29
        }
1710
48
    }
1711
38
    return false;
1712
67
}
1713
1714
std::vector<const TabletIndex*> TabletSchema::inverted_indexs(
1715
27.8M
        int32_t col_unique_id, const std::string& suffix_path) const {
1716
27.8M
    std::vector<const TabletIndex*> result;
1717
27.8M
    const std::string escaped_suffix = escape_for_path_name(suffix_path);
1718
27.8M
    auto it = _col_id_suffix_to_index.find(
1719
27.8M
            std::make_tuple(IndexType::INVERTED, col_unique_id, escaped_suffix));
1720
27.8M
    if (it != _col_id_suffix_to_index.end()) {
1721
157k
        for (size_t pos : it->second) {
1722
157k
            if (pos < _indexes.size()) {
1723
157k
                result.push_back(_indexes[pos].get());
1724
157k
            }
1725
157k
        }
1726
156k
    }
1727
27.8M
    return result;
1728
27.8M
}
1729
1730
std::vector<TabletIndexPtr> TabletSchema::inverted_index_by_field_pattern(
1731
9.01k
        int32_t col_unique_id, const std::string& field_pattern) const {
1732
9.01k
    auto id_to_pattern_map = _index_by_unique_id_with_pattern.find(col_unique_id);
1733
9.01k
    if (id_to_pattern_map == _index_by_unique_id_with_pattern.end()) {
1734
5.43k
        return {};
1735
5.43k
    }
1736
3.57k
    auto pattern_to_index_map = id_to_pattern_map->second.find(field_pattern);
1737
3.57k
    if (pattern_to_index_map == id_to_pattern_map->second.end()) {
1738
608
        return {};
1739
608
    }
1740
2.96k
    return pattern_to_index_map->second;
1741
3.57k
}
1742
1743
27.6M
std::vector<const TabletIndex*> TabletSchema::inverted_indexs(const TabletColumn& col) const {
1744
    // Some columns(Float, Double, JSONB ...) from the variant do not support inverted index
1745
27.6M
    if (!segment_v2::IndexColumnWriter::check_support_inverted_index(col)) {
1746
59.4k
        return {};
1747
59.4k
    }
1748
1749
    // TODO use more efficient impl
1750
    // Use parent id if unique not assigned, this could happend when accessing subcolumns of variants
1751
27.5M
    int32_t col_unique_id = col.is_extracted_column() ? col.parent_unique_id() : col.unique_id();
1752
27.5M
    std::vector<const TabletIndex*> result;
1753
27.5M
    if (result = inverted_indexs(col_unique_id, escape_for_path_name(col.suffix_path()));
1754
27.5M
        !result.empty()) {
1755
123k
        return result;
1756
123k
    }
1757
    // variant's typed column has it's own index
1758
27.4M
    else if (col.is_extracted_column() && col.path_info_ptr()->get_is_typed()) {
1759
387
        std::string relative_path = col.path_info_ptr()->copy_pop_front().get_path();
1760
387
        if (_path_set_info_map.find(col_unique_id) == _path_set_info_map.end()) {
1761
0
            return result;
1762
0
        }
1763
387
        const auto& path_set_info = _path_set_info_map.at(col_unique_id);
1764
387
        if (path_set_info.typed_path_set.find(relative_path) ==
1765
387
            path_set_info.typed_path_set.end()) {
1766
0
            return result;
1767
0
        }
1768
387
        for (const auto& index : path_set_info.typed_path_set.at(relative_path).indexes) {
1769
45
            result.push_back(index.get());
1770
45
        }
1771
387
        return result;
1772
387
    }
1773
    // variant's subcolumns has it's own index
1774
27.4M
    else if (col.is_extracted_column()) {
1775
2.75k
        std::string relative_path = col.path_info_ptr()->copy_pop_front().get_path();
1776
2.75k
        if (_path_set_info_map.find(col_unique_id) == _path_set_info_map.end()) {
1777
1
            return result;
1778
1
        }
1779
2.75k
        const auto& path_set_info = _path_set_info_map.at(col_unique_id);
1780
2.75k
        if (path_set_info.subcolumn_indexes.find(relative_path) ==
1781
2.75k
            path_set_info.subcolumn_indexes.end()) {
1782
915
            return result;
1783
915
        }
1784
1.83k
        for (const auto& index : path_set_info.subcolumn_indexes.at(relative_path)) {
1785
42
            result.push_back(index.get());
1786
42
        }
1787
1.83k
    }
1788
27.4M
    return result;
1789
27.5M
}
1790
1791
const TabletIndex* TabletSchema::ann_index(int32_t col_unique_id,
1792
99.8k
                                           const std::string& suffix_path) const {
1793
206k
    for (size_t i = 0; i < _indexes.size(); i++) {
1794
106k
        if (_indexes[i]->index_type() == IndexType::ANN) {
1795
224
            for (int32_t id : _indexes[i]->col_unique_ids()) {
1796
224
                if (id == col_unique_id &&
1797
224
                    _indexes[i]->get_index_suffix() == escape_for_path_name(suffix_path)) {
1798
223
                    return _indexes[i].get();
1799
223
                }
1800
224
            }
1801
224
        }
1802
106k
    }
1803
99.5k
    return nullptr;
1804
99.8k
}
1805
1806
26.8M
const TabletIndex* TabletSchema::ann_index(const TabletColumn& col) const {
1807
26.8M
    if (!segment_v2::IndexColumnWriter::check_support_ann_index(col)) {
1808
26.7M
        return nullptr;
1809
26.7M
    }
1810
    // TODO use more efficient impl
1811
    // Use parent id if unique not assigned, this could happend when accessing subcolumns of variants
1812
99.4k
    int32_t col_unique_id = col.is_extracted_column() ? col.parent_unique_id() : col.unique_id();
1813
99.4k
    return ann_index(col_unique_id, escape_for_path_name(col.suffix_path()));
1814
26.8M
}
1815
1816
0
bool TabletSchema::has_ngram_bf_index(int32_t col_unique_id) const {
1817
0
    IndexKey index_key(IndexType::NGRAM_BF, col_unique_id, "");
1818
0
    auto it = _col_id_suffix_to_index.find(index_key);
1819
0
    return it != _col_id_suffix_to_index.end();
1820
0
}
1821
1822
725k
const TabletIndex* TabletSchema::get_ngram_bf_index(int32_t col_unique_id) const {
1823
    // Get the ngram bf index for the given column unique id
1824
725k
    IndexKey index_key(IndexType::NGRAM_BF, col_unique_id, "");
1825
725k
    auto it = _col_id_suffix_to_index.find(index_key);
1826
725k
    if (it != _col_id_suffix_to_index.end()) {
1827
3.41k
        if (!it->second.empty() && it->second[0] < _indexes.size()) {
1828
3.41k
            return _indexes[it->second[0]].get();
1829
3.41k
        }
1830
3.41k
    }
1831
721k
    return nullptr;
1832
725k
}
1833
1834
const TabletIndex* TabletSchema::get_index(int32_t col_unique_id, IndexType index_type,
1835
14
                                           const std::string& suffix_path) const {
1836
14
    IndexKey index_key(index_type, col_unique_id, suffix_path);
1837
14
    auto it = _col_id_suffix_to_index.find(index_key);
1838
14
    if (it != _col_id_suffix_to_index.end()) {
1839
12
        if (!it->second.empty() && it->second[0] < _indexes.size()) {
1840
12
            return _indexes[it->second[0]].get();
1841
12
        }
1842
12
    }
1843
2
    return nullptr;
1844
14
}
1845
1846
Block TabletSchema::create_block(
1847
        const std::vector<uint32_t>& return_columns,
1848
3.53M
        const std::unordered_set<uint32_t>* tablet_columns_need_convert_null) const {
1849
3.53M
    Block block;
1850
47.3M
    for (int i = 0; i < return_columns.size(); ++i) {
1851
43.8M
        const ColumnId cid = return_columns[i];
1852
43.8M
        const auto& col = *_cols[cid];
1853
43.8M
        bool is_nullable = (tablet_columns_need_convert_null != nullptr &&
1854
43.8M
                            tablet_columns_need_convert_null->find(cid) !=
1855
43.7M
                                    tablet_columns_need_convert_null->end());
1856
43.8M
        auto data_type = DataTypeFactory::instance().create_data_type(col, is_nullable);
1857
43.8M
        if (col.type() == FieldType::OLAP_FIELD_TYPE_STRUCT ||
1858
43.8M
            col.type() == FieldType::OLAP_FIELD_TYPE_MAP ||
1859
43.8M
            col.type() == FieldType::OLAP_FIELD_TYPE_ARRAY) {
1860
153k
            if (_pruned_columns_data_type.contains(col.unique_id())) {
1861
148k
                data_type = _pruned_columns_data_type.at(col.unique_id());
1862
148k
            }
1863
153k
        }
1864
1865
43.8M
        if (_vir_col_idx_to_unique_id.contains(cid)) {
1866
297
            block.insert({ColumnNothing::create(0), data_type, col.name()});
1867
297
            VLOG_DEBUG << fmt::format(
1868
0
                    "Create block from tablet schema, column cid {} is virtual column, col_name: "
1869
0
                    "{}, col_unique_id: {}, type {}",
1870
0
                    cid, col.name(), col.unique_id(), data_type->get_name());
1871
43.8M
        } else {
1872
43.8M
            block.insert({data_type->create_column(), data_type, col.name()});
1873
43.8M
        }
1874
43.8M
    }
1875
3.53M
    return block;
1876
3.53M
}
1877
1878
43.8k
Block TabletSchema::create_block() const {
1879
43.8k
    Block block;
1880
536k
    for (const auto& col : _cols) {
1881
536k
        if (is_dropped_column(*col)) {
1882
13
            continue;
1883
13
        }
1884
1885
536k
        auto data_type = DataTypeFactory::instance().create_data_type(*col);
1886
536k
        if (col->type() == FieldType::OLAP_FIELD_TYPE_STRUCT) {
1887
1.05k
            if (_pruned_columns_data_type.contains(col->unique_id())) {
1888
0
                data_type = _pruned_columns_data_type.at(col->unique_id());
1889
0
            }
1890
1.05k
        }
1891
536k
        block.insert({data_type->create_column(), data_type, col->name()});
1892
536k
    }
1893
43.8k
    return block;
1894
43.8k
}
1895
1896
2.46k
Block TabletSchema::create_block_by_cids(const std::vector<uint32_t>& cids) const {
1897
2.46k
    Block block;
1898
16.9k
    for (const auto& cid : cids) {
1899
16.9k
        const auto& col = *_cols[cid];
1900
16.9k
        auto data_type = DataTypeFactory::instance().create_data_type(col);
1901
16.9k
        if (col.type() == FieldType::OLAP_FIELD_TYPE_STRUCT) {
1902
17
            if (_pruned_columns_data_type.contains(col.unique_id())) {
1903
0
                data_type = _pruned_columns_data_type.at(col.unique_id());
1904
0
            }
1905
17
        }
1906
16.9k
        block.insert({data_type->create_column(), data_type, col.name()});
1907
16.9k
    }
1908
2.46k
    return block;
1909
2.46k
}
1910
1911
830
bool operator==(const TabletColumn& a, const TabletColumn& b) {
1912
830
    if (a._unique_id != b._unique_id) return false;
1913
830
    if (a._col_name != b._col_name) return false;
1914
830
    if (a._type != b._type) return false;
1915
830
    if (a._is_key != b._is_key) return false;
1916
830
    if (a._aggregation != b._aggregation) return false;
1917
830
    if (a._is_nullable != b._is_nullable) return false;
1918
830
    if (a._has_default_value != b._has_default_value) return false;
1919
830
    if (a._has_default_value) {
1920
412
        if (a._default_value != b._default_value) return false;
1921
412
    }
1922
830
    if (a._is_decimal != b._is_decimal) return false;
1923
831
    if (a._is_decimal) {
1924
831
        if (a._precision != b._precision) return false;
1925
831
        if (a._frac != b._frac) return false;
1926
831
    }
1927
830
    if (a._length != b._length) return false;
1928
830
    if (a._index_length != b._index_length) return false;
1929
830
    if (a._is_bf_column != b._is_bf_column) return false;
1930
830
    if (a._column_path == nullptr && a._column_path != nullptr) return false;
1931
830
    if (b._column_path == nullptr && a._column_path != nullptr) return false;
1932
830
    if (b._column_path != nullptr && a._column_path != nullptr &&
1933
830
        *a._column_path != *b._column_path)
1934
0
        return false;
1935
830
    return true;
1936
830
}
1937
1938
833
bool operator!=(const TabletColumn& a, const TabletColumn& b) {
1939
833
    return !(a == b);
1940
833
}
1941
1942
108
bool operator==(const TabletSchema& a, const TabletSchema& b) {
1943
108
    if (a._keys_type != b._keys_type) return false;
1944
108
    if (a._cols.size() != b._cols.size()) return false;
1945
938
    for (int i = 0; i < a._cols.size(); ++i) {
1946
830
        if (*a._cols[i] != *b._cols[i]) return false;
1947
830
    }
1948
108
    if (a._num_columns != b._num_columns) return false;
1949
108
    if (a._num_key_columns != b._num_key_columns) return false;
1950
108
    if (a._num_null_columns != b._num_null_columns) return false;
1951
108
    if (a._num_short_key_columns != b._num_short_key_columns) return false;
1952
108
    if (a._num_rows_per_row_block != b._num_rows_per_row_block) return false;
1953
108
    if (a._compress_kind != b._compress_kind) return false;
1954
108
    if (a._next_column_unique_id != b._next_column_unique_id) return false;
1955
108
    if (a._has_bf_fpp != b._has_bf_fpp) return false;
1956
108
    if (a._has_bf_fpp) {
1957
9
        if (std::abs(a._bf_fpp - b._bf_fpp) > 1e-6) return false;
1958
9
    }
1959
108
    if (a._is_in_memory != b._is_in_memory) return false;
1960
108
    if (a._delete_sign_idx != b._delete_sign_idx) return false;
1961
108
    if (a._disable_auto_compaction != b._disable_auto_compaction) return false;
1962
108
    if (a._enable_single_replica_compaction != b._enable_single_replica_compaction) return false;
1963
108
    if (a._store_row_column != b._store_row_column) return false;
1964
108
    if (a._row_store_page_size != b._row_store_page_size) return false;
1965
108
    if (a._storage_page_size != b._storage_page_size) return false;
1966
108
    if (a._storage_dict_page_size != b._storage_dict_page_size) return false;
1967
108
    if (a._skip_write_index_on_load != b._skip_write_index_on_load) return false;
1968
108
    if (a._deprecated_enable_variant_flatten_nested !=
1969
108
        b._deprecated_enable_variant_flatten_nested) {
1970
0
        return false;
1971
0
    }
1972
108
    if (a._is_external_segment_column_meta_used != b._is_external_segment_column_meta_used)
1973
0
        return false;
1974
108
    if (a._integer_type_default_use_plain_encoding != b._integer_type_default_use_plain_encoding)
1975
0
        return false;
1976
108
    if (a._binary_plain_encoding_default_impl != b._binary_plain_encoding_default_impl)
1977
0
        return false;
1978
108
    return true;
1979
108
}
1980
1981
108
bool operator!=(const TabletSchema& a, const TabletSchema& b) {
1982
108
    return !(a == b);
1983
108
}
1984
#include "common/compile_check_end.h"
1985
} // namespace doris