Coverage Report

Created: 2026-06-24 16:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/tablet/tablet_schema.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "storage/tablet/tablet_schema.h"
19
20
#include <gen_cpp/Descriptors_types.h>
21
#include <gen_cpp/olap_file.pb.h>
22
#include <glog/logging.h>
23
#include <google/protobuf/io/coded_stream.h>
24
#include <google/protobuf/io/zero_copy_stream.h>
25
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
26
27
#include <algorithm>
28
#include <cctype>
29
// IWYU pragma: no_include <bits/std_abs.h>
30
#include <cmath> // IWYU pragma: keep
31
#include <memory>
32
#include <ostream>
33
#include <vector>
34
35
#include "common/compiler_util.h" // IWYU pragma: keep
36
#include "common/consts.h"
37
#include "common/status.h"
38
#include "core/block/block.h"
39
#include "core/column/column_nothing.h"
40
#include "core/data_type/data_type.h"
41
#include "core/data_type/data_type_factory.hpp"
42
#include "core/string_ref.h"
43
#include "exec/common/hex.h"
44
#include "exprs/aggregate/aggregate_function_simple_factory.h"
45
#include "exprs/aggregate/aggregate_function_state_union.h"
46
#include "storage/index/inverted/analyzer/analyzer.h"
47
#include "storage/index/inverted/inverted_index_parser.h"
48
#include "storage/olap_common.h"
49
#include "storage/olap_define.h"
50
#include "storage/tablet/tablet_column_object_pool.h"
51
#include "storage/tablet/tablet_meta.h"
52
#include "storage/tablet_info.h"
53
#include "storage/types.h"
54
#include "storage/utils.h"
55
#include "util/json/path_in_data.h"
56
57
namespace doris {
58
33.6M
FieldType TabletColumn::get_field_type_by_type(PrimitiveType primitiveType) {
59
33.6M
    switch (primitiveType) {
60
0
    case PrimitiveType::INVALID_TYPE:
61
0
        return FieldType::OLAP_FIELD_TYPE_UNKNOWN;
62
0
    case PrimitiveType::TYPE_NULL:
63
0
        return FieldType::OLAP_FIELD_TYPE_NONE;
64
269k
    case PrimitiveType::TYPE_BOOLEAN:
65
269k
        return FieldType::OLAP_FIELD_TYPE_BOOL;
66
3.59M
    case PrimitiveType::TYPE_TINYINT:
67
3.59M
        return FieldType::OLAP_FIELD_TYPE_TINYINT;
68
11.7k
    case PrimitiveType::TYPE_SMALLINT:
69
11.7k
        return FieldType::OLAP_FIELD_TYPE_SMALLINT;
70
297k
    case PrimitiveType::TYPE_INT:
71
297k
        return FieldType::OLAP_FIELD_TYPE_INT;
72
10.8M
    case PrimitiveType::TYPE_BIGINT:
73
10.8M
        return FieldType::OLAP_FIELD_TYPE_BIGINT;
74
75.4k
    case PrimitiveType::TYPE_LARGEINT:
75
75.4k
        return FieldType::OLAP_FIELD_TYPE_LARGEINT;
76
3.46k
    case PrimitiveType::TYPE_FLOAT:
77
3.46k
        return FieldType::OLAP_FIELD_TYPE_FLOAT;
78
2.75M
    case PrimitiveType::TYPE_DOUBLE:
79
2.75M
        return FieldType::OLAP_FIELD_TYPE_DOUBLE;
80
12.3M
    case PrimitiveType::TYPE_VARCHAR:
81
12.3M
        return FieldType::OLAP_FIELD_TYPE_VARCHAR;
82
2.36k
    case PrimitiveType::TYPE_DATE:
83
2.36k
        return FieldType::OLAP_FIELD_TYPE_DATE;
84
2.21k
    case PrimitiveType::TYPE_DATETIME:
85
2.21k
        return FieldType::OLAP_FIELD_TYPE_DATETIME;
86
0
    case PrimitiveType::TYPE_BINARY:
87
0
        return FieldType::OLAP_FIELD_TYPE_UNKNOWN; // Not implemented
88
15.4k
    case PrimitiveType::TYPE_CHAR:
89
15.4k
        return FieldType::OLAP_FIELD_TYPE_CHAR;
90
0
    case PrimitiveType::TYPE_STRUCT:
91
0
        return FieldType::OLAP_FIELD_TYPE_STRUCT;
92
2.47k
    case PrimitiveType::TYPE_ARRAY:
93
2.47k
        return FieldType::OLAP_FIELD_TYPE_ARRAY;
94
1
    case PrimitiveType::TYPE_MAP:
95
1
        return FieldType::OLAP_FIELD_TYPE_MAP;
96
4
    case PrimitiveType::TYPE_HLL:
97
4
        return FieldType::OLAP_FIELD_TYPE_HLL;
98
0
    case PrimitiveType::TYPE_DECIMALV2:
99
0
        return FieldType::OLAP_FIELD_TYPE_UNKNOWN; // Not implemented
100
4
    case PrimitiveType::TYPE_BITMAP:
101
4
        return FieldType::OLAP_FIELD_TYPE_BITMAP;
102
1.68M
    case PrimitiveType::TYPE_STRING:
103
1.68M
        return FieldType::OLAP_FIELD_TYPE_STRING;
104
4
    case PrimitiveType::TYPE_QUANTILE_STATE:
105
4
        return FieldType::OLAP_FIELD_TYPE_QUANTILE_STATE;
106
29.0k
    case PrimitiveType::TYPE_DATEV2:
107
29.0k
        return FieldType::OLAP_FIELD_TYPE_DATEV2;
108
1.61M
    case PrimitiveType::TYPE_DATETIMEV2:
109
1.61M
        return FieldType::OLAP_FIELD_TYPE_DATETIMEV2;
110
4.86k
    case PrimitiveType::TYPE_TIMESTAMPTZ:
111
4.86k
        return FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ;
112
0
    case PrimitiveType::TYPE_TIMEV2:
113
0
        return FieldType::OLAP_FIELD_TYPE_TIMEV2;
114
626
    case PrimitiveType::TYPE_DECIMAL32:
115
626
        return FieldType::OLAP_FIELD_TYPE_DECIMAL32;
116
628
    case PrimitiveType::TYPE_DECIMAL64:
117
628
        return FieldType::OLAP_FIELD_TYPE_DECIMAL64;
118
240k
    case PrimitiveType::TYPE_DECIMAL128I:
119
240k
        return FieldType::OLAP_FIELD_TYPE_DECIMAL128I;
120
611
    case PrimitiveType::TYPE_DECIMAL256:
121
611
        return FieldType::OLAP_FIELD_TYPE_DECIMAL256;
122
1.87k
    case PrimitiveType::TYPE_JSONB:
123
1.87k
        return FieldType::OLAP_FIELD_TYPE_JSONB;
124
1
    case PrimitiveType::TYPE_VARIANT:
125
1
        return FieldType::OLAP_FIELD_TYPE_VARIANT;
126
391
    case PrimitiveType::TYPE_IPV4:
127
391
        return FieldType::OLAP_FIELD_TYPE_IPV4;
128
394
    case PrimitiveType::TYPE_IPV6:
129
394
        return FieldType::OLAP_FIELD_TYPE_IPV6;
130
3
    case PrimitiveType::TYPE_AGG_STATE:
131
3
        return FieldType::OLAP_FIELD_TYPE_AGG_STATE;
132
0
    default:
133
0
        return FieldType::OLAP_FIELD_TYPE_UNKNOWN;
134
33.6M
    }
135
33.6M
}
136
137
25.1M
PrimitiveType TabletColumn::get_primitive_type_by_field_type(FieldType type) {
138
25.1M
    static const PrimitiveType mapping[] = {
139
25.1M
            /*  0 */ PrimitiveType::INVALID_TYPE,
140
25.1M
            /*  1 OLAP_FIELD_TYPE_TINYINT           */ PrimitiveType::TYPE_TINYINT,
141
25.1M
            /*  2 OLAP_FIELD_TYPE_UNSIGNED_TINYINT  */ PrimitiveType::INVALID_TYPE,
142
25.1M
            /*  3 OLAP_FIELD_TYPE_SMALLINT          */ PrimitiveType::TYPE_SMALLINT,
143
25.1M
            /*  4 OLAP_FIELD_TYPE_UNSIGNED_SMALLINT */ PrimitiveType::INVALID_TYPE,
144
25.1M
            /*  5 OLAP_FIELD_TYPE_INT               */ PrimitiveType::TYPE_INT,
145
25.1M
            /*  6 OLAP_FIELD_TYPE_UNSIGNED_INT      */ PrimitiveType::INVALID_TYPE,
146
25.1M
            /*  7 OLAP_FIELD_TYPE_BIGINT            */ PrimitiveType::TYPE_BIGINT,
147
25.1M
            /*  8 OLAP_FIELD_TYPE_UNSIGNED_BIGINT   */ PrimitiveType::INVALID_TYPE,
148
25.1M
            /*  9 OLAP_FIELD_TYPE_LARGEINT          */ PrimitiveType::TYPE_LARGEINT,
149
25.1M
            /* 10 OLAP_FIELD_TYPE_FLOAT             */ PrimitiveType::TYPE_FLOAT,
150
25.1M
            /* 11 OLAP_FIELD_TYPE_DOUBLE            */ PrimitiveType::TYPE_DOUBLE,
151
25.1M
            /* 12 OLAP_FIELD_TYPE_DISCRETE_DOUBLE   */ PrimitiveType::INVALID_TYPE,
152
25.1M
            /* 13 OLAP_FIELD_TYPE_CHAR              */ PrimitiveType::TYPE_CHAR,
153
25.1M
            /* 14 OLAP_FIELD_TYPE_DATE              */ PrimitiveType::TYPE_DATE,
154
25.1M
            /* 15 OLAP_FIELD_TYPE_DATETIME          */ PrimitiveType::TYPE_DATETIME,
155
25.1M
            /* 16 OLAP_FIELD_TYPE_DECIMAL           */ PrimitiveType::INVALID_TYPE,
156
25.1M
            /* 17 OLAP_FIELD_TYPE_VARCHAR           */ PrimitiveType::TYPE_VARCHAR,
157
25.1M
            /* 18 OLAP_FIELD_TYPE_STRUCT            */ PrimitiveType::TYPE_STRUCT,
158
25.1M
            /* 19 OLAP_FIELD_TYPE_ARRAY             */ PrimitiveType::TYPE_ARRAY,
159
25.1M
            /* 20 OLAP_FIELD_TYPE_MAP               */ PrimitiveType::TYPE_MAP,
160
25.1M
            /* 21 OLAP_FIELD_TYPE_UNKNOWN           */ PrimitiveType::INVALID_TYPE,
161
25.1M
            /* 22 OLAP_FIELD_TYPE_NONE              */ PrimitiveType::TYPE_NULL,
162
25.1M
            /* 23 OLAP_FIELD_TYPE_HLL               */ PrimitiveType::TYPE_HLL,
163
25.1M
            /* 24 OLAP_FIELD_TYPE_BOOL              */ PrimitiveType::TYPE_BOOLEAN,
164
25.1M
            /* 25 OLAP_FIELD_TYPE_BITMAP            */ PrimitiveType::TYPE_BITMAP,
165
25.1M
            /* 26 OLAP_FIELD_TYPE_STRING            */ PrimitiveType::TYPE_STRING,
166
25.1M
            /* 27 OLAP_FIELD_TYPE_QUANTILE_STATE    */ PrimitiveType::TYPE_QUANTILE_STATE,
167
25.1M
            /* 28 OLAP_FIELD_TYPE_DATEV2            */ PrimitiveType::TYPE_DATEV2,
168
25.1M
            /* 29 OLAP_FIELD_TYPE_DATETIMEV2        */ PrimitiveType::TYPE_DATETIMEV2,
169
25.1M
            /* 30 OLAP_FIELD_TYPE_TIMEV2            */ PrimitiveType::TYPE_TIMEV2,
170
25.1M
            /* 31 OLAP_FIELD_TYPE_DECIMAL32         */ PrimitiveType::TYPE_DECIMAL32,
171
25.1M
            /* 32 OLAP_FIELD_TYPE_DECIMAL64         */ PrimitiveType::TYPE_DECIMAL64,
172
25.1M
            /* 33 OLAP_FIELD_TYPE_DECIMAL128I       */ PrimitiveType::TYPE_DECIMAL128I,
173
25.1M
            /* 34 OLAP_FIELD_TYPE_JSONB             */ PrimitiveType::TYPE_JSONB,
174
25.1M
            /* 35 OLAP_FIELD_TYPE_VARIANT           */ PrimitiveType::TYPE_VARIANT,
175
25.1M
            /* 36 OLAP_FIELD_TYPE_AGG_STATE         */ PrimitiveType::TYPE_AGG_STATE,
176
25.1M
            /* 37 OLAP_FIELD_TYPE_DECIMAL256        */ PrimitiveType::TYPE_DECIMAL256,
177
25.1M
            /* 38 OLAP_FIELD_TYPE_IPV4              */ PrimitiveType::TYPE_IPV4,
178
25.1M
            /* 39 OLAP_FIELD_TYPE_IPV6              */ PrimitiveType::TYPE_IPV6,
179
25.1M
            /* 40 OLAP_FIELD_TYPE_TIMESTAMPTZ       */ PrimitiveType::TYPE_TIMESTAMPTZ,
180
25.1M
    };
181
182
25.1M
    int idx = static_cast<int>(type);
183
25.1M
    return mapping[idx];
184
25.1M
}
185
186
56.7M
FieldType TabletColumn::get_field_type_by_string(const std::string& type_str) {
187
56.7M
    std::string upper_type_str = type_str;
188
56.7M
    std::transform(type_str.begin(), type_str.end(), upper_type_str.begin(),
189
368M
                   [](auto c) { return std::toupper(c); });
190
56.7M
    FieldType type;
191
192
56.7M
    if (0 == upper_type_str.compare("TINYINT")) {
193
3.91M
        type = FieldType::OLAP_FIELD_TYPE_TINYINT;
194
52.8M
    } else if (0 == upper_type_str.compare("SMALLINT")) {
195
715k
        type = FieldType::OLAP_FIELD_TYPE_SMALLINT;
196
52.1M
    } else if (0 == upper_type_str.compare("INT")) {
197
4.52M
        type = FieldType::OLAP_FIELD_TYPE_INT;
198
47.6M
    } else if (0 == upper_type_str.compare("BIGINT")) {
199
10.3M
        type = FieldType::OLAP_FIELD_TYPE_BIGINT;
200
37.2M
    } else if (0 == upper_type_str.compare("LARGEINT")) {
201
736k
        type = FieldType::OLAP_FIELD_TYPE_LARGEINT;
202
36.4M
    } else if (0 == upper_type_str.compare("UNSIGNED_TINYINT")) {
203
0
        type = FieldType::OLAP_FIELD_TYPE_UNSIGNED_TINYINT;
204
36.4M
    } else if (0 == upper_type_str.compare("UNSIGNED_SMALLINT")) {
205
0
        type = FieldType::OLAP_FIELD_TYPE_UNSIGNED_SMALLINT;
206
36.4M
    } else if (0 == upper_type_str.compare("UNSIGNED_INT")) {
207
0
        type = FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT;
208
36.4M
    } else if (0 == upper_type_str.compare("UNSIGNED_BIGINT")) {
209
0
        type = FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT;
210
36.4M
    } else if (0 == upper_type_str.compare("IPV4")) {
211
37.8k
        type = FieldType::OLAP_FIELD_TYPE_IPV4;
212
36.4M
    } else if (0 == upper_type_str.compare("IPV6")) {
213
37.9k
        type = FieldType::OLAP_FIELD_TYPE_IPV6;
214
36.4M
    } else if (0 == upper_type_str.compare("FLOAT")) {
215
600k
        type = FieldType::OLAP_FIELD_TYPE_FLOAT;
216
35.8M
    } else if (0 == upper_type_str.compare("DISCRETE_DOUBLE")) {
217
0
        type = FieldType::OLAP_FIELD_TYPE_DISCRETE_DOUBLE;
218
35.8M
    } else if (0 == upper_type_str.compare("DOUBLE")) {
219
978k
        type = FieldType::OLAP_FIELD_TYPE_DOUBLE;
220
34.8M
    } else if (0 == upper_type_str.compare("CHAR")) {
221
897k
        type = FieldType::OLAP_FIELD_TYPE_CHAR;
222
33.9M
    } else if (0 == upper_type_str.compare("DATE")) {
223
21.8k
        type = FieldType::OLAP_FIELD_TYPE_DATE;
224
33.9M
    } else if (0 == upper_type_str.compare("DATEV2")) {
225
2.11M
        type = FieldType::OLAP_FIELD_TYPE_DATEV2;
226
31.8M
    } else if (0 == upper_type_str.compare("DATETIMEV2")) {
227
3.05M
        type = FieldType::OLAP_FIELD_TYPE_DATETIMEV2;
228
28.7M
    } else if (0 == upper_type_str.compare("DATETIME")) {
229
24.3k
        type = FieldType::OLAP_FIELD_TYPE_DATETIME;
230
28.7M
    } else if (0 == upper_type_str.compare("TIMESTAMPTZ")) {
231
102k
        type = FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ;
232
28.6M
    } else if (0 == upper_type_str.compare("DECIMAL32")) {
233
484k
        type = FieldType::OLAP_FIELD_TYPE_DECIMAL32;
234
28.1M
    } else if (0 == upper_type_str.compare("DECIMAL64")) {
235
1.42M
        type = FieldType::OLAP_FIELD_TYPE_DECIMAL64;
236
26.7M
    } else if (0 == upper_type_str.compare("DECIMAL128I")) {
237
753k
        type = FieldType::OLAP_FIELD_TYPE_DECIMAL128I;
238
25.9M
    } else if (0 == upper_type_str.compare("DECIMAL256")) {
239
84.1k
        type = FieldType::OLAP_FIELD_TYPE_DECIMAL256;
240
25.8M
    } else if (0 == upper_type_str.compare(0, 7, "DECIMAL")) {
241
48.0k
        type = FieldType::OLAP_FIELD_TYPE_DECIMAL;
242
25.8M
    } else if (0 == upper_type_str.compare(0, 7, "VARCHAR")) {
243
18.7M
        type = FieldType::OLAP_FIELD_TYPE_VARCHAR;
244
18.7M
    } else if (0 == upper_type_str.compare("STRING")) {
245
3.00M
        type = FieldType::OLAP_FIELD_TYPE_STRING;
246
4.06M
    } else if (0 == upper_type_str.compare("JSONB")) {
247
217k
        type = FieldType::OLAP_FIELD_TYPE_JSONB;
248
3.84M
    } else if (0 == upper_type_str.compare("VARIANT")) {
249
78.9k
        type = FieldType::OLAP_FIELD_TYPE_VARIANT;
250
3.76M
    } else if (0 == upper_type_str.compare("BOOLEAN")) {
251
549k
        type = FieldType::OLAP_FIELD_TYPE_BOOL;
252
3.21M
    } else if (0 == upper_type_str.compare(0, 3, "HLL")) {
253
26.0k
        type = FieldType::OLAP_FIELD_TYPE_HLL;
254
3.19M
    } else if (0 == upper_type_str.compare("STRUCT")) {
255
112k
        type = FieldType::OLAP_FIELD_TYPE_STRUCT;
256
3.07M
    } else if (0 == upper_type_str.compare("LIST")) {
257
0
        type = FieldType::OLAP_FIELD_TYPE_ARRAY;
258
3.07M
    } else if (0 == upper_type_str.compare("MAP")) {
259
1.51M
        type = FieldType::OLAP_FIELD_TYPE_MAP;
260
1.56M
    } else if (0 == upper_type_str.compare("OBJECT")) {
261
14.2k
        type = FieldType::OLAP_FIELD_TYPE_BITMAP;
262
1.55M
    } else if (0 == upper_type_str.compare("BITMAP")) {
263
30.7k
        type = FieldType::OLAP_FIELD_TYPE_BITMAP;
264
1.72M
    } else if (0 == upper_type_str.compare("ARRAY")) {
265
1.72M
        type = FieldType::OLAP_FIELD_TYPE_ARRAY;
266
18.4E
    } else if (0 == upper_type_str.compare("QUANTILE_STATE")) {
267
18.5k
        type = FieldType::OLAP_FIELD_TYPE_QUANTILE_STATE;
268
18.4E
    } else if (0 == upper_type_str.compare("AGG_STATE")) {
269
28.0k
        type = FieldType::OLAP_FIELD_TYPE_AGG_STATE;
270
18.4E
    } else {
271
18.4E
        LOG(WARNING) << "invalid type string. [type='" << type_str << "']";
272
18.4E
        type = FieldType::OLAP_FIELD_TYPE_UNKNOWN;
273
18.4E
    }
274
275
56.7M
    return type;
276
56.7M
}
277
278
56.1M
FieldAggregationMethod TabletColumn::get_aggregation_type_by_string(const std::string& str) {
279
56.1M
    std::string upper_str = str;
280
56.1M
    std::transform(str.begin(), str.end(), upper_str.begin(),
281
229M
                   [](auto c) { return std::toupper(c); });
282
56.1M
    FieldAggregationMethod aggregation_type;
283
284
56.1M
    if (0 == upper_str.compare("NONE")) {
285
55.0M
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_NONE;
286
55.0M
    } else if (0 == upper_str.compare("SUM")) {
287
360k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_SUM;
288
771k
    } else if (0 == upper_str.compare("MIN")) {
289
18.9k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_MIN;
290
752k
    } else if (0 == upper_str.compare("MAX")) {
291
90.0k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_MAX;
292
662k
    } else if (0 == upper_str.compare("REPLACE")) {
293
652k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE;
294
652k
    } else if (0 == upper_str.compare("REPLACE_IF_NOT_NULL")) {
295
156k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE_IF_NOT_NULL;
296
18.4E
    } else if (0 == upper_str.compare("HLL_UNION")) {
297
23.7k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_HLL_UNION;
298
18.4E
    } else if (0 == upper_str.compare("BITMAP_UNION")) {
299
32.7k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_BITMAP_UNION;
300
18.4E
    } else if (0 == upper_str.compare("QUANTILE_UNION")) {
301
17.9k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_QUANTILE_UNION;
302
18.4E
    } else if (!upper_str.empty()) {
303
13.8k
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_GENERIC;
304
18.4E
    } else {
305
18.4E
        aggregation_type = FieldAggregationMethod::OLAP_FIELD_AGGREGATION_UNKNOWN;
306
18.4E
    }
307
308
56.1M
    return aggregation_type;
309
56.1M
}
310
311
51.9M
std::string TabletColumn::get_string_by_field_type(FieldType type) {
312
51.9M
    switch (type) {
313
3.46M
    case FieldType::OLAP_FIELD_TYPE_TINYINT:
314
3.46M
        return "TINYINT";
315
316
0
    case FieldType::OLAP_FIELD_TYPE_UNSIGNED_TINYINT:
317
0
        return "UNSIGNED_TINYINT";
318
319
833k
    case FieldType::OLAP_FIELD_TYPE_SMALLINT:
320
833k
        return "SMALLINT";
321
322
0
    case FieldType::OLAP_FIELD_TYPE_UNSIGNED_SMALLINT:
323
0
        return "UNSIGNED_SMALLINT";
324
325
5.62M
    case FieldType::OLAP_FIELD_TYPE_INT:
326
5.62M
        return "INT";
327
328
0
    case FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT:
329
0
        return "UNSIGNED_INT";
330
331
8.60M
    case FieldType::OLAP_FIELD_TYPE_BIGINT:
332
8.60M
        return "BIGINT";
333
334
957k
    case FieldType::OLAP_FIELD_TYPE_LARGEINT:
335
957k
        return "LARGEINT";
336
337
0
    case FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT:
338
0
        return "UNSIGNED_BIGINT";
339
340
33.3k
    case FieldType::OLAP_FIELD_TYPE_IPV4:
341
33.3k
        return "IPV4";
342
343
33.8k
    case FieldType::OLAP_FIELD_TYPE_IPV6:
344
33.8k
        return "IPV6";
345
346
672k
    case FieldType::OLAP_FIELD_TYPE_FLOAT:
347
672k
        return "FLOAT";
348
349
903k
    case FieldType::OLAP_FIELD_TYPE_DOUBLE:
350
903k
        return "DOUBLE";
351
352
0
    case FieldType::OLAP_FIELD_TYPE_DISCRETE_DOUBLE:
353
0
        return "DISCRETE_DOUBLE";
354
355
877k
    case FieldType::OLAP_FIELD_TYPE_CHAR:
356
877k
        return "CHAR";
357
358
15.4k
    case FieldType::OLAP_FIELD_TYPE_DATE:
359
15.4k
        return "DATE";
360
361
2.32M
    case FieldType::OLAP_FIELD_TYPE_DATEV2:
362
2.32M
        return "DATEV2";
363
364
19.5k
    case FieldType::OLAP_FIELD_TYPE_DATETIME:
365
19.5k
        return "DATETIME";
366
367
2.79M
    case FieldType::OLAP_FIELD_TYPE_DATETIMEV2:
368
2.79M
        return "DATETIMEV2";
369
370
177k
    case FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ:
371
177k
        return "TIMESTAMPTZ";
372
373
30.6k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL:
374
30.6k
        return "DECIMAL";
375
376
571k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL32:
377
571k
        return "DECIMAL32";
378
379
1.28M
    case FieldType::OLAP_FIELD_TYPE_DECIMAL64:
380
1.28M
        return "DECIMAL64";
381
382
891k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL128I:
383
891k
        return "DECIMAL128I";
384
385
84.3k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL256:
386
84.3k
        return "DECIMAL256";
387
388
14.7M
    case FieldType::OLAP_FIELD_TYPE_VARCHAR:
389
14.7M
        return "VARCHAR";
390
391
347k
    case FieldType::OLAP_FIELD_TYPE_JSONB:
392
347k
        return "JSONB";
393
394
134k
    case FieldType::OLAP_FIELD_TYPE_VARIANT:
395
134k
        return "VARIANT";
396
397
3.01M
    case FieldType::OLAP_FIELD_TYPE_STRING:
398
3.01M
        return "STRING";
399
400
606k
    case FieldType::OLAP_FIELD_TYPE_BOOL:
401
606k
        return "BOOLEAN";
402
403
63.2k
    case FieldType::OLAP_FIELD_TYPE_HLL:
404
63.2k
        return "HLL";
405
406
133k
    case FieldType::OLAP_FIELD_TYPE_STRUCT:
407
133k
        return "STRUCT";
408
409
1.70M
    case FieldType::OLAP_FIELD_TYPE_ARRAY:
410
1.70M
        return "ARRAY";
411
412
1.00M
    case FieldType::OLAP_FIELD_TYPE_MAP:
413
1.00M
        return "MAP";
414
415
84.1k
    case FieldType::OLAP_FIELD_TYPE_BITMAP:
416
84.1k
        return "OBJECT";
417
46.0k
    case FieldType::OLAP_FIELD_TYPE_QUANTILE_STATE:
418
46.0k
        return "QUANTILE_STATE";
419
16.8k
    case FieldType::OLAP_FIELD_TYPE_AGG_STATE:
420
16.8k
        return "AGG_STATE";
421
0
    default:
422
0
        return "UNKNOWN";
423
51.9M
    }
424
51.9M
}
425
426
198k
std::string TabletColumn::get_string_by_aggregation_type(FieldAggregationMethod type) {
427
198k
    switch (type) {
428
129k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_NONE:
429
129k
        return "NONE";
430
431
13.4k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_SUM:
432
13.4k
        return "SUM";
433
434
1.84k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_MIN:
435
1.84k
        return "MIN";
436
437
5.49k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_MAX:
438
5.49k
        return "MAX";
439
440
29.5k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE:
441
29.5k
        return "REPLACE";
442
443
14.3k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE_IF_NOT_NULL:
444
14.3k
        return "REPLACE_IF_NOT_NULL";
445
446
1.29k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_HLL_UNION:
447
1.29k
        return "HLL_UNION";
448
449
1.79k
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_BITMAP_UNION:
450
1.79k
        return "BITMAP_UNION";
451
452
972
    case FieldAggregationMethod::OLAP_FIELD_AGGREGATION_QUANTILE_UNION:
453
972
        return "QUANTILE_UNION";
454
455
2
    default:
456
2
        return "UNKNOWN";
457
198k
    }
458
198k
}
459
460
25.8M
uint32_t TabletColumn::get_field_length_by_type(TPrimitiveType::type type, uint32_t string_length) {
461
25.8M
    switch (type) {
462
1.88M
    case TPrimitiveType::TINYINT:
463
2.11M
    case TPrimitiveType::BOOLEAN:
464
2.11M
        return 1;
465
282k
    case TPrimitiveType::SMALLINT:
466
282k
        return 2;
467
1.61M
    case TPrimitiveType::INT:
468
1.61M
        return 4;
469
4.96M
    case TPrimitiveType::BIGINT:
470
4.96M
        return 8;
471
284k
    case TPrimitiveType::LARGEINT:
472
284k
        return 16;
473
16.7k
    case TPrimitiveType::IPV4:
474
16.7k
        return 4;
475
16.9k
    case TPrimitiveType::IPV6:
476
16.9k
        return 16;
477
10.3k
    case TPrimitiveType::DATE:
478
10.3k
        return 3;
479
777k
    case TPrimitiveType::DATEV2:
480
777k
        return 4;
481
10.1k
    case TPrimitiveType::DATETIME:
482
10.1k
        return 8;
483
1.40M
    case TPrimitiveType::DATETIMEV2:
484
1.44M
    case TPrimitiveType::TIMESTAMPTZ:
485
1.44M
        return 8;
486
248k
    case TPrimitiveType::FLOAT:
487
248k
        return 4;
488
438k
    case TPrimitiveType::DOUBLE:
489
438k
        return 8;
490
4.72k
    case TPrimitiveType::QUANTILE_STATE:
491
17.8k
    case TPrimitiveType::BITMAP:
492
17.8k
        return 16;
493
382k
    case TPrimitiveType::CHAR:
494
382k
        return string_length;
495
9.06M
    case TPrimitiveType::VARCHAR:
496
9.06M
    case TPrimitiveType::HLL:
497
9.07M
    case TPrimitiveType::AGG_STATE:
498
9.07M
        return string_length + sizeof(OLAP_VARCHAR_MAX_LENGTH);
499
1.43M
    case TPrimitiveType::STRING:
500
1.45M
    case TPrimitiveType::VARIANT:
501
1.45M
        return string_length + sizeof(OLAP_STRING_MAX_LENGTH);
502
84.3k
    case TPrimitiveType::JSONB:
503
84.3k
        return string_length + sizeof(OLAP_JSONB_MAX_LENGTH);
504
42.2k
    case TPrimitiveType::STRUCT:
505
        // Note that(xy): this is the length of struct type itself,
506
        // the length of its subtypes are not included.
507
42.2k
        return OLAP_STRUCT_MAX_LENGTH;
508
781k
    case TPrimitiveType::ARRAY:
509
781k
        return OLAP_ARRAY_MAX_LENGTH;
510
732k
    case TPrimitiveType::MAP:
511
732k
        return OLAP_MAP_MAX_LENGTH;
512
158k
    case TPrimitiveType::DECIMAL32:
513
158k
        return 4;
514
586k
    case TPrimitiveType::DECIMAL64:
515
586k
        return 8;
516
291k
    case TPrimitiveType::DECIMAL128I:
517
291k
        return 16;
518
26.7k
    case TPrimitiveType::DECIMAL256:
519
26.7k
        return 32;
520
25.4k
    case TPrimitiveType::DECIMALV2:
521
25.4k
        return 12; // use 12 bytes in olap engine.
522
0
    default:
523
0
        LOG(WARNING) << "unknown field type. [type=" << type << "]";
524
0
        return 0;
525
25.8M
    }
526
25.8M
}
527
528
9
bool TabletColumn::has_char_type() const {
529
9
    switch (_type) {
530
3
    case FieldType::OLAP_FIELD_TYPE_CHAR: {
531
3
        return true;
532
0
    }
533
4
    case FieldType::OLAP_FIELD_TYPE_ARRAY:
534
4
    case FieldType::OLAP_FIELD_TYPE_MAP:
535
4
    case FieldType::OLAP_FIELD_TYPE_STRUCT: {
536
4
        return std::any_of(_sub_columns.begin(), _sub_columns.end(),
537
4
                           [&](const auto& sub) -> bool { return sub->has_char_type(); });
538
4
    }
539
2
    default:
540
2
        return false;
541
9
    }
542
9
}
543
544
34.3M
TabletColumn::TabletColumn() : _aggregation(FieldAggregationMethod::OLAP_FIELD_AGGREGATION_NONE) {}
545
546
20
TabletColumn::TabletColumn(FieldAggregationMethod agg, FieldType type) {
547
20
    _aggregation = agg;
548
20
    _type = type;
549
20
}
550
551
20
TabletColumn::TabletColumn(FieldAggregationMethod agg, FieldType filed_type, bool is_nullable) {
552
20
    _aggregation = agg;
553
20
    _type = filed_type;
554
20
    _length = cast_set<int32_t>(field_type_size(filed_type));
555
20
    _is_nullable = is_nullable;
556
20
}
557
558
TabletColumn::TabletColumn(FieldAggregationMethod agg, FieldType filed_type, bool is_nullable,
559
113k
                           int32_t unique_id, size_t length) {
560
113k
    _aggregation = agg;
561
113k
    _type = filed_type;
562
113k
    _is_nullable = is_nullable;
563
113k
    _unique_id = unique_id;
564
113k
    _length = cast_set<int32_t>(length);
565
113k
}
566
567
5.68k
TabletColumn::TabletColumn(const ColumnPB& column) {
568
5.68k
    init_from_pb(column);
569
5.68k
}
570
571
22.4M
TabletColumn::TabletColumn(const TColumn& column) {
572
22.4M
    init_from_thrift(column);
573
22.4M
}
574
575
23.3M
void TabletColumn::init_from_thrift(const TColumn& tcolumn) {
576
23.3M
    ColumnPB column_pb;
577
23.3M
    TabletMeta::init_column_from_tcolumn(tcolumn.col_unique_id, tcolumn, &column_pb);
578
23.3M
    init_from_pb(column_pb);
579
23.3M
}
580
581
56.1M
void TabletColumn::init_from_pb(const ColumnPB& column) {
582
56.1M
    _unique_id = column.unique_id();
583
56.1M
    _col_name = column.name();
584
56.1M
    _col_name_lower_case = to_lower(_col_name);
585
56.1M
    _type = TabletColumn::get_field_type_by_string(column.type());
586
56.1M
    _is_key = column.is_key();
587
56.1M
    _is_nullable = column.is_nullable();
588
56.1M
    _is_auto_increment = column.is_auto_increment();
589
56.1M
    if (column.has_is_on_update_current_timestamp()) {
590
51.0M
        _is_on_update_current_timestamp = column.is_on_update_current_timestamp();
591
51.0M
    }
592
593
56.1M
    _has_default_value = column.has_default_value();
594
56.1M
    if (_has_default_value) {
595
5.77M
        _default_value = column.default_value();
596
5.77M
    }
597
598
56.3M
    if (column.has_precision()) {
599
56.3M
        _is_decimal = true;
600
56.3M
        _precision = column.precision();
601
18.4E
    } else {
602
18.4E
        _is_decimal = false;
603
18.4E
    }
604
56.3M
    if (column.has_frac()) {
605
56.3M
        _frac = column.frac();
606
56.3M
    }
607
56.1M
    _length = column.length();
608
56.1M
    _index_length = column.index_length();
609
56.1M
    if (column.has_is_bf_column()) {
610
5.38M
        _is_bf_column = column.is_bf_column();
611
50.7M
    } else {
612
50.7M
        _is_bf_column = false;
613
50.7M
    }
614
56.3M
    if (column.has_aggregation()) {
615
56.3M
        _aggregation = get_aggregation_type_by_string(column.aggregation());
616
56.3M
        _aggregation_name = column.aggregation();
617
56.3M
    }
618
619
56.1M
    if (_type == FieldType::OLAP_FIELD_TYPE_AGG_STATE) {
620
15.3k
        _result_is_nullable = column.result_is_nullable();
621
15.3k
        _be_exec_version = column.be_exec_version();
622
15.3k
    }
623
624
56.3M
    if (column.has_visible()) {
625
56.3M
        _visible = column.visible();
626
56.3M
    }
627
56.1M
    if (_type == FieldType::OLAP_FIELD_TYPE_ARRAY) {
628
18.4E
        CHECK(column.children_columns_size() == 1)
629
18.4E
                << "ARRAY type should has 1 children types, but got "
630
18.4E
                << column.children_columns_size();
631
1.69M
    }
632
56.1M
    if (_type == FieldType::OLAP_FIELD_TYPE_MAP) {
633
18.4E
        DCHECK(column.children_columns_size() == 2)
634
18.4E
                << "MAP type should has 2 children types, but got "
635
18.4E
                << column.children_columns_size();
636
1.49M
        if (UNLIKELY(column.children_columns_size() != 2)) {
637
0
            LOG(WARNING) << "MAP type should has 2 children types, but got "
638
0
                         << column.children_columns_size();
639
0
        }
640
1.49M
    }
641
61.4M
    for (int i = 0; i < column.children_columns_size(); i++) {
642
5.38M
        TabletColumn child_column;
643
5.38M
        child_column.init_from_pb(column.children_columns(i));
644
5.38M
        add_sub_column(child_column);
645
5.38M
    }
646
56.1M
    if (column.has_column_path_info()) {
647
48.1k
        _column_path = std::make_shared<PathInData>();
648
48.1k
        _column_path->from_protobuf(column.column_path_info());
649
48.1k
        _parent_col_unique_id = column.column_path_info().parrent_column_unique_id();
650
48.1k
    }
651
56.1M
    if (is_variant_type() && !column.has_column_path_info()) {
652
        // set path info for variant root column, to prevent from missing
653
43.5k
        _column_path = std::make_shared<PathInData>(_col_name_lower_case);
654
        // _parent_col_unique_id = _unique_id;
655
43.5k
    }
656
56.1M
    if (column.has_variant_max_subcolumns_count()) {
657
50.9M
        _variant.max_subcolumns_count = column.variant_max_subcolumns_count();
658
50.9M
    }
659
56.1M
    if (column.has_variant_enable_typed_paths_to_sparse()) {
660
50.9M
        _variant.enable_typed_paths_to_sparse = column.variant_enable_typed_paths_to_sparse();
661
50.9M
    }
662
56.1M
    if (column.has_variant_max_sparse_column_statistics_size()) {
663
50.9M
        _variant.max_sparse_column_statistics_size =
664
50.9M
                column.variant_max_sparse_column_statistics_size();
665
50.9M
    }
666
56.1M
    if (column.has_variant_sparse_hash_shard_count()) {
667
48.5M
        _variant.sparse_hash_shard_count = column.variant_sparse_hash_shard_count();
668
48.5M
    }
669
56.1M
    if (column.has_variant_enable_doc_mode()) {
670
51.0M
        _variant.enable_doc_mode = column.variant_enable_doc_mode();
671
51.0M
    }
672
56.1M
    if (column.has_variant_doc_materialization_min_rows()) {
673
48.6M
        _variant.doc_materialization_min_rows = column.variant_doc_materialization_min_rows();
674
48.6M
    }
675
56.1M
    if (column.has_variant_doc_hash_shard_count()) {
676
48.6M
        _variant.doc_hash_shard_count = column.variant_doc_hash_shard_count();
677
48.6M
    }
678
56.1M
    if (column.has_variant_enable_nested_group()) {
679
48.6M
        _variant.enable_nested_group = column.variant_enable_nested_group();
680
48.6M
    }
681
56.1M
    if (column.has_pattern_type()) {
682
25.2M
        _pattern_type = column.pattern_type();
683
25.2M
    }
684
56.1M
}
685
686
TabletColumn TabletColumn::create_materialized_variant_column(const std::string& root,
687
                                                              const std::vector<std::string>& paths,
688
                                                              int32_t parent_unique_id,
689
                                                              int32_t max_subcolumns_count,
690
7.85k
                                                              bool enable_doc_mode) {
691
7.85k
    TabletColumn subcol;
692
7.85k
    subcol.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
693
7.85k
    subcol.set_is_nullable(true);
694
7.85k
    subcol.set_unique_id(-1);
695
7.85k
    subcol.set_parent_unique_id(parent_unique_id);
696
7.85k
    PathInData path(root, paths);
697
7.85k
    subcol.set_path_info(path);
698
7.85k
    subcol.set_name(path.get_path());
699
7.85k
    subcol.set_variant_max_subcolumns_count(max_subcolumns_count);
700
7.85k
    subcol.set_variant_enable_doc_mode(enable_doc_mode);
701
7.85k
    return subcol;
702
7.85k
}
703
704
51.3M
void TabletColumn::to_schema_pb(ColumnPB* column) const {
705
51.3M
    column->set_unique_id(_unique_id);
706
51.3M
    column->set_name(_col_name);
707
51.3M
    column->set_type(get_string_by_field_type(_type));
708
51.3M
    column->set_is_key(_is_key);
709
51.3M
    column->set_is_nullable(_is_nullable);
710
51.3M
    column->set_is_auto_increment(_is_auto_increment);
711
51.3M
    column->set_is_on_update_current_timestamp(_is_on_update_current_timestamp);
712
51.3M
    if (_has_default_value) {
713
7.48M
        column->set_default_value(_default_value);
714
7.48M
    }
715
51.5M
    if (_is_decimal) {
716
51.5M
        column->set_precision(_precision);
717
51.5M
        column->set_frac(_frac);
718
51.5M
    }
719
51.3M
    column->set_length(_length);
720
51.3M
    column->set_index_length(_index_length);
721
51.3M
    if (_is_bf_column) {
722
94.3k
        column->set_is_bf_column(_is_bf_column);
723
94.3k
    }
724
51.7M
    if (!_aggregation_name.empty()) {
725
51.7M
        column->set_aggregation(_aggregation_name);
726
51.7M
    }
727
51.3M
    column->set_result_is_nullable(_result_is_nullable);
728
51.3M
    column->set_be_exec_version(_be_exec_version);
729
51.3M
    column->set_visible(_visible);
730
731
51.3M
    if (_type == FieldType::OLAP_FIELD_TYPE_ARRAY) {
732
18.4E
        CHECK(_sub_columns.size() == 1)
733
18.4E
                << "ARRAY type should has 1 children types, but got " << _sub_columns.size();
734
1.70M
    }
735
51.3M
    if (_type == FieldType::OLAP_FIELD_TYPE_MAP) {
736
991k
        DCHECK(_sub_columns.size() == 2)
737
1.25k
                << "MAP type should has 2 children types, but got " << _sub_columns.size();
738
991k
        if (UNLIKELY(_sub_columns.size() != 2)) {
739
0
            LOG(WARNING) << "MAP type should has 2 children types, but got " << _sub_columns.size();
740
0
        }
741
991k
    }
742
743
55.8M
    for (size_t i = 0; i < _sub_columns.size(); i++) {
744
4.55M
        ColumnPB* child = column->add_children_columns();
745
4.55M
        _sub_columns[i]->to_schema_pb(child);
746
4.55M
    }
747
748
    // set parts info
749
51.3M
    if (has_path_info()) {
750
        // CHECK_GT(_parent_col_unique_id, 0);
751
156k
        _column_path->to_protobuf(column->mutable_column_path_info(), _parent_col_unique_id);
752
        // Update unstable information for variant columns. Some of the fields in the tablet schema
753
        // are irrelevant for variant sub-columns, but retaining them may lead to an excessive growth
754
        // in the number of tablet schema cache entries.
755
156k
        if (_type == FieldType::OLAP_FIELD_TYPE_STRING) {
756
2.15k
            column->set_length(INT_MAX);
757
2.15k
        }
758
156k
        column->set_index_length(0);
759
156k
    }
760
51.3M
    column->set_variant_max_subcolumns_count(_variant.max_subcolumns_count);
761
51.3M
    column->set_pattern_type(_pattern_type);
762
51.3M
    column->set_variant_enable_typed_paths_to_sparse(_variant.enable_typed_paths_to_sparse);
763
51.3M
    column->set_variant_max_sparse_column_statistics_size(
764
51.3M
            _variant.max_sparse_column_statistics_size);
765
51.3M
    column->set_variant_sparse_hash_shard_count(_variant.sparse_hash_shard_count);
766
51.3M
    column->set_variant_enable_doc_mode(_variant.enable_doc_mode);
767
51.3M
    column->set_variant_doc_materialization_min_rows(_variant.doc_materialization_min_rows);
768
51.3M
    column->set_variant_doc_hash_shard_count(_variant.doc_hash_shard_count);
769
51.3M
    column->set_variant_enable_nested_group(_variant.enable_nested_group);
770
51.3M
}
771
772
5.43M
void TabletColumn::add_sub_column(TabletColumn& sub_column) {
773
5.43M
    _sub_columns.push_back(std::make_shared<TabletColumn>(sub_column));
774
5.43M
    sub_column._parent_col_unique_id = this->_unique_id;
775
5.43M
    _sub_column_count += 1;
776
5.43M
}
777
778
40.3M
bool TabletColumn::is_row_store_column() const {
779
40.3M
    return _col_name == BeConsts::ROW_STORE_COL;
780
40.3M
}
781
782
AggregateFunctionPtr TabletColumn::get_aggregate_function_union(DataTypePtr type,
783
1.42k
                                                                int current_be_exec_version) const {
784
1.42k
    const auto* state_type = assert_cast<const DataTypeAggState*>(type.get());
785
1.42k
    BeExecVersionManager::check_function_compatibility(
786
1.42k
            current_be_exec_version, _be_exec_version,
787
1.42k
            state_type->get_nested_function()->get_name());
788
1.42k
    return AggregateStateUnion::create(state_type->get_nested_function(), {type}, type);
789
1.42k
}
790
791
AggregateFunctionPtr TabletColumn::get_aggregate_function(std::string suffix,
792
68.3k
                                                          int current_be_exec_version) const {
793
68.3k
    AggregateFunctionPtr function = nullptr;
794
795
68.3k
    auto type = DataTypeFactory::instance().create_data_type(*this);
796
68.3k
    if (type && type->get_primitive_type() == PrimitiveType::TYPE_AGG_STATE) {
797
1.42k
        function = get_aggregate_function_union(type, current_be_exec_version);
798
66.9k
    } else {
799
66.9k
        std::string origin_name = TabletColumn::get_string_by_aggregation_type(_aggregation);
800
66.9k
        std::string agg_name = origin_name + suffix;
801
66.9k
        std::transform(agg_name.begin(), agg_name.end(), agg_name.begin(),
802
938k
                       [](unsigned char c) { return std::tolower(c); });
803
66.9k
        function = AggregateFunctionSimpleFactory::instance().get(
804
66.9k
                agg_name, {type}, type, type->is_nullable(),
805
66.9k
                BeExecVersionManager::get_newest_version());
806
66.9k
        if (!function) {
807
0
            LOG(WARNING) << "get column aggregate function failed, aggregation_name=" << origin_name
808
0
                         << ", column_type=" << type->get_name();
809
0
        }
810
66.9k
    }
811
68.3k
    if (function) {
812
68.3k
        function->set_version(_be_exec_version);
813
68.3k
        return function;
814
68.3k
    }
815
18.4E
    return nullptr;
816
68.3k
}
817
818
130k
void TabletColumn::set_path_info(const PathInData& path) {
819
130k
    _column_path = std::make_shared<PathInData>(path);
820
130k
}
821
822
14.6k
DataTypePtr TabletColumn::get_vec_type() const {
823
14.6k
    return DataTypeFactory::instance().create_data_type(*this);
824
14.6k
}
825
826
// escape '.' and '_'
827
58.4M
std::string escape_for_path_name(const std::string& s) {
828
58.4M
    std::string res;
829
58.4M
    const char* pos = s.data();
830
58.4M
    const char* end = pos + s.size();
831
59.1M
    while (pos != end) {
832
679k
        unsigned char c = *pos;
833
679k
        if (c == '.' || c == '_') {
834
94.1k
            res += '%';
835
94.1k
            res += hex_digit_uppercase(c / 16);
836
94.1k
            res += hex_digit_uppercase(c % 16);
837
585k
        } else {
838
585k
            res += c;
839
585k
        }
840
679k
        ++pos;
841
679k
    }
842
58.4M
    return res;
843
58.4M
}
844
845
14.3k
void TabletIndex::set_escaped_escaped_index_suffix_path(const std::string& path_name) {
846
14.3k
    std::string escaped_path = escape_for_path_name(path_name);
847
14.3k
    _escaped_index_suffix_path = escaped_path;
848
14.3k
}
849
850
void TabletIndex::init_from_thrift(const TOlapTableIndex& index,
851
300k
                                   const TabletSchema& tablet_schema) {
852
300k
    _index_id = index.index_id;
853
300k
    _index_name = index.index_name;
854
    // init col_unique_id in index at be side, since col_unique_id may be -1 at fe side
855
    // get column unique id by name
856
300k
    std::vector<int32_t> col_unique_ids(index.columns.size());
857
601k
    for (size_t i = 0; i < index.columns.size(); i++) {
858
300k
        auto column_idx = tablet_schema.field_index(index.columns[i]);
859
300k
        if (column_idx >= 0) {
860
300k
            col_unique_ids[i] = tablet_schema.column(column_idx).unique_id();
861
300k
        } else {
862
            // if column unique id not found by column name, find by column unique id
863
            // column unique id can not found means this column is a new column added by light schema change
864
214
            if (index.__isset.column_unique_ids && !index.column_unique_ids.empty() &&
865
214
                tablet_schema.has_column_unique_id(index.column_unique_ids[i])) {
866
136
                col_unique_ids[i] = index.column_unique_ids[i];
867
136
            } else {
868
78
                col_unique_ids[i] = -1;
869
78
            }
870
214
        }
871
300k
    }
872
300k
    _col_unique_ids = std::move(col_unique_ids);
873
874
300k
    switch (index.index_type) {
875
0
    case TIndexType::BITMAP:
876
0
        _index_type = IndexType::BITMAP;
877
0
        break;
878
273k
    case TIndexType::INVERTED:
879
273k
        _index_type = IndexType::INVERTED;
880
273k
        break;
881
762
    case TIndexType::ANN:
882
762
        _index_type = IndexType::ANN;
883
762
        break;
884
0
    case TIndexType::BLOOMFILTER:
885
0
        _index_type = IndexType::BLOOMFILTER;
886
0
        break;
887
26.5k
    case TIndexType::NGRAM_BF:
888
26.5k
        _index_type = IndexType::NGRAM_BF;
889
26.5k
        break;
890
300k
    }
891
300k
    if (index.__isset.properties) {
892
300k
        for (auto kv : index.properties) {
893
256k
            _properties[kv.first] = kv.second;
894
256k
        }
895
300k
    }
896
300k
}
897
898
void TabletIndex::init_from_thrift(const TOlapTableIndex& index,
899
15.3k
                                   const std::vector<int32_t>& column_uids) {
900
15.3k
    _index_id = index.index_id;
901
15.3k
    _index_name = index.index_name;
902
15.3k
    _col_unique_ids = column_uids;
903
904
15.3k
    switch (index.index_type) {
905
0
    case TIndexType::BITMAP:
906
0
        _index_type = IndexType::BITMAP;
907
0
        break;
908
14.6k
    case TIndexType::INVERTED:
909
14.6k
        _index_type = IndexType::INVERTED;
910
14.6k
        break;
911
135
    case TIndexType::ANN:
912
135
        _index_type = IndexType::ANN;
913
135
        break;
914
0
    case TIndexType::BLOOMFILTER:
915
0
        _index_type = IndexType::BLOOMFILTER;
916
0
        break;
917
584
    case TIndexType::NGRAM_BF:
918
584
        _index_type = IndexType::NGRAM_BF;
919
584
        break;
920
15.3k
    }
921
15.3k
    if (index.__isset.properties) {
922
17.1k
        for (auto kv : index.properties) {
923
17.1k
            _properties[kv.first] = kv.second;
924
17.1k
        }
925
15.3k
    }
926
15.3k
}
927
928
918k
void TabletIndex::init_from_pb(const TabletIndexPB& index) {
929
918k
    _index_id = index.index_id();
930
918k
    _index_name = index.index_name();
931
918k
    _col_unique_ids.clear();
932
919k
    for (auto col_unique_id : index.col_unique_id()) {
933
919k
        _col_unique_ids.push_back(col_unique_id);
934
919k
    }
935
918k
    _index_type = index.index_type();
936
918k
    for (const auto& kv : index.properties()) {
937
573k
        _properties[kv.first] = kv.second;
938
573k
    }
939
918k
    _escaped_index_suffix_path = index.index_suffix_name();
940
918k
}
941
942
1.80M
void TabletIndex::to_schema_pb(TabletIndexPB* index) const {
943
1.80M
    index->set_index_id(_index_id);
944
1.80M
    index->set_index_name(_index_name);
945
1.80M
    index->clear_col_unique_id();
946
1.80M
    for (auto col_unique_id : _col_unique_ids) {
947
1.80M
        index->add_col_unique_id(col_unique_id);
948
1.80M
    }
949
1.80M
    index->set_index_type(_index_type);
950
1.80M
    for (const auto& kv : _properties) {
951
1.38M
        DBUG_EXECUTE_IF("tablet_schema.to_schema_pb", {
952
1.38M
            if (kv.first == INVERTED_INDEX_PARSER_LOWERCASE_KEY) {
953
1.38M
                continue;
954
1.38M
            }
955
1.38M
        })
956
1.38M
        (*index->mutable_properties())[kv.first] = kv.second;
957
1.38M
    }
958
1.80M
    index->set_index_suffix_name(_escaped_index_suffix_path);
959
960
1.80M
    DBUG_EXECUTE_IF("tablet_schema.to_schema_pb", { return; })
961
962
    // Only add lower_case=true default for built-in analyzers/parsers, NOT for custom analyzers
963
    // Custom analyzer: lower_case is determined by analyzer's internal token filter
964
1.80M
    if (!_properties.empty() && !_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) {
965
56.9k
        bool has_parser = _properties.contains(INVERTED_INDEX_PARSER_KEY) ||
966
56.9k
                          _properties.contains(INVERTED_INDEX_PARSER_KEY_ALIAS);
967
56.9k
        std::string analyzer_name = get_analyzer_name_from_properties(_properties);
968
56.9k
        bool is_builtin = analyzer_name.empty() ||
969
56.9k
                          segment_v2::inverted_index::InvertedIndexAnalyzer::is_builtin_analyzer(
970
3.25k
                                  analyzer_name);
971
56.9k
        if (has_parser || is_builtin) {
972
53.6k
            (*index->mutable_properties())[INVERTED_INDEX_PARSER_LOWERCASE_KEY] =
973
53.6k
                    INVERTED_INDEX_PARSER_TRUE;
974
53.6k
        }
975
56.9k
    }
976
1.80M
}
977
978
3.41M
TabletSchema::TabletSchema() = default;
979
980
3.38M
TabletSchema::~TabletSchema() {}
981
982
2.18M
int64_t TabletSchema::get_metadata_size() const {
983
2.18M
    return sizeof(TabletSchema);
984
2.18M
}
985
986
22.6M
void TabletSchema::append_column(TabletColumn column, ColumnType col_type) {
987
22.6M
    if (column.is_key()) {
988
7.36M
        _num_key_columns++;
989
7.36M
    }
990
22.6M
    if (column.is_nullable()) {
991
11.9M
        _num_null_columns++;
992
11.9M
    }
993
22.6M
    if (column.is_variant_type()) {
994
27.4k
        ++_num_variant_columns;
995
27.4k
        if (!column.has_path_info()) {
996
28
            const std::string& col_name = column.name_lower_case();
997
28
            PathInData path(col_name);
998
28
            column.set_path_info(path);
999
28
        }
1000
27.4k
    }
1001
22.6M
    if (UNLIKELY(column.name() == DELETE_SIGN)) {
1002
947k
        _delete_sign_idx = _num_columns;
1003
21.6M
    } else if (UNLIKELY(column.name() == SEQUENCE_COL)) {
1004
7.51k
        _sequence_col_idx = _num_columns;
1005
21.6M
    } else if (UNLIKELY(column.name() == VERSION_COL)) {
1006
947k
        _version_col_idx = _num_columns;
1007
20.7M
    } else if (UNLIKELY(column.name() == SKIP_BITMAP_COL)) {
1008
1.00k
        _skip_bitmap_col_idx = _num_columns;
1009
20.7M
    } else if (UNLIKELY(column.name() == BINLOG_LSN_COL)) {
1010
0
        _binlog_lsn_col_idx = _num_columns;
1011
20.7M
    } else if (UNLIKELY(column.name() == BINLOG_OP_COL)) {
1012
0
        _binlog_op_col_idx = _num_columns;
1013
20.7M
    } else if (UNLIKELY(column.name() == BINLOG_TSO_COL)) {
1014
0
        _binlog_tso_col_idx = _num_columns;
1015
20.7M
    } else if (UNLIKELY(column.name().starts_with(BeConsts::VIRTUAL_COLUMN_PREFIX))) {
1016
381
        _vir_col_idx_to_unique_id[_num_columns] = column.unique_id();
1017
381
    }
1018
22.6M
    _field_uniqueid_to_index[column.unique_id()] = _num_columns;
1019
22.6M
    _cols.push_back(std::make_shared<TabletColumn>(std::move(column)));
1020
    // The dropped column may have same name with exsiting column, so that
1021
    // not add to name to index map, only for uid to index map
1022
22.6M
    if (col_type == ColumnType::VARIANT || _cols.back()->is_variant_type() ||
1023
22.6M
        _cols.back()->is_extracted_column()) {
1024
32.1k
        _field_name_to_index.emplace(StringRef(_cols.back()->name()), _num_columns);
1025
32.1k
        _field_path_to_index[_cols.back()->path_info_ptr().get()] = _num_columns;
1026
22.6M
    } else if (col_type == ColumnType::NORMAL) {
1027
22.6M
        _field_name_to_index.emplace(StringRef(_cols.back()->name()), _num_columns);
1028
22.6M
    }
1029
22.6M
    _num_columns++;
1030
22.6M
    _num_virtual_columns = _vir_col_idx_to_unique_id.size();
1031
    // generate column index mapping for seq map
1032
22.6M
    if (_seq_col_uid_to_value_cols_uid.contains(column.unique_id())) {
1033
42
        const auto seq_idx = _field_uniqueid_to_index[column.unique_id()];
1034
42
        if (!_seq_col_idx_to_value_cols_idx.contains(seq_idx)) {
1035
38
            _seq_col_idx_to_value_cols_idx[seq_idx] = {};
1036
38
        }
1037
42
    }
1038
22.6M
    if (_value_col_uid_to_seq_col_uid.contains(column.unique_id())) {
1039
102
        const auto seq_uid = _value_col_uid_to_seq_col_uid[column.unique_id()];
1040
102
        if (_field_uniqueid_to_index.contains(seq_uid)) {
1041
48
            bool all_uid_index_found = true;
1042
48
            std::vector<int32_t> value_cols_index;
1043
68
            for (const auto value_col_uid : _seq_col_uid_to_value_cols_uid[seq_uid]) {
1044
68
                if (!_field_uniqueid_to_index.contains(value_col_uid)) {
1045
6
                    all_uid_index_found = false;
1046
6
                    break;
1047
6
                }
1048
62
                value_cols_index.push_back(_field_uniqueid_to_index[value_col_uid]);
1049
62
            }
1050
48
            if (all_uid_index_found) {
1051
42
                const auto seq_idx = _field_uniqueid_to_index[seq_uid];
1052
60
                for (const auto col_idx : value_cols_index) {
1053
60
                    _seq_col_idx_to_value_cols_idx[seq_idx].push_back(col_idx);
1054
60
                    _value_col_idx_to_seq_col_idx[col_idx] = seq_idx;
1055
60
                }
1056
42
                _value_col_idx_to_seq_col_idx[seq_idx] = seq_idx;
1057
42
            }
1058
48
        }
1059
102
    }
1060
22.6M
}
1061
1062
1.53k
void TabletSchema::append_index(TabletIndex&& index) {
1063
1.53k
    size_t index_pos = _indexes.size();
1064
1.53k
    _indexes.push_back(std::make_shared<TabletIndex>(index));
1065
1.54k
    for (int32_t id : _indexes.back()->col_unique_ids()) {
1066
1.54k
        if (auto field_pattern = _indexes.back()->field_pattern(); !field_pattern.empty()) {
1067
9
            auto& pattern_to_index_map = _index_by_unique_id_with_pattern[id];
1068
9
            pattern_to_index_map[field_pattern].emplace_back(_indexes.back());
1069
1.53k
        } else {
1070
1.53k
            IndexKey key = std::make_tuple(_indexes.back()->index_type(), id,
1071
1.53k
                                           _indexes.back()->get_index_suffix());
1072
1.53k
            _col_id_suffix_to_index[key].push_back(index_pos);
1073
1.53k
        }
1074
1.54k
    }
1075
1.53k
}
1076
1077
0
void TabletSchema::replace_column(size_t pos, TabletColumn new_col) {
1078
0
    CHECK_LT(pos, num_columns()) << " outof range";
1079
0
    _cols[pos] = std::make_shared<TabletColumn>(std::move(new_col));
1080
0
}
1081
1082
827
void TabletSchema::clear_index() {
1083
827
    _indexes.clear();
1084
827
    _col_id_suffix_to_index.clear();
1085
827
    _index_by_unique_id_with_pattern.clear();
1086
827
}
1087
1088
7
void TabletSchema::remove_index(int64_t index_id) {
1089
7
    std::vector<TabletIndexPtr> new_indexes;
1090
11
    for (auto& index : _indexes) {
1091
11
        if (index->index_id() != index_id) {
1092
4
            new_indexes.emplace_back(std::move(index));
1093
4
        }
1094
11
    }
1095
7
    _indexes = std::move(new_indexes);
1096
7
    _col_id_suffix_to_index.clear();
1097
7
    _index_by_unique_id_with_pattern.clear();
1098
11
    for (size_t new_pos = 0; new_pos < _indexes.size(); ++new_pos) {
1099
4
        const auto& index = _indexes[new_pos];
1100
4
        for (int32_t col_uid : index->col_unique_ids()) {
1101
4
            if (auto field_pattern = index->field_pattern(); !field_pattern.empty()) {
1102
0
                auto& pattern_to_index_map = _index_by_unique_id_with_pattern[col_uid];
1103
0
                pattern_to_index_map[field_pattern].emplace_back(index);
1104
4
            } else {
1105
4
                IndexKey key = std::make_tuple(_indexes.back()->index_type(), col_uid,
1106
4
                                               _indexes.back()->get_index_suffix());
1107
4
                _col_id_suffix_to_index[key].push_back(new_pos);
1108
4
            }
1109
4
        }
1110
4
    }
1111
7
}
1112
1113
1.45M
void TabletSchema::clear_columns() {
1114
1.45M
    _field_path_to_index.clear();
1115
1.45M
    _field_name_to_index.clear();
1116
1.45M
    _field_uniqueid_to_index.clear();
1117
1.45M
    _num_columns = 0;
1118
1.45M
    _num_variant_columns = 0;
1119
1.45M
    _num_null_columns = 0;
1120
1.45M
    _num_key_columns = 0;
1121
1.45M
    _seq_col_idx_to_value_cols_idx.clear();
1122
1.45M
    _value_col_idx_to_seq_col_idx.clear();
1123
1.45M
    _cols.clear();
1124
1.45M
}
1125
1126
void TabletSchema::init_from_pb(const TabletSchemaPB& schema, bool ignore_extracted_columns,
1127
2.08M
                                bool reuse_cache_column) {
1128
2.08M
    _keys_type = schema.keys_type();
1129
2.08M
    _num_columns = 0;
1130
2.08M
    _num_variant_columns = 0;
1131
2.08M
    _num_key_columns = 0;
1132
2.08M
    _num_null_columns = 0;
1133
2.08M
    _cols.clear();
1134
2.08M
    _indexes.clear();
1135
2.08M
    _index_by_unique_id_with_pattern.clear();
1136
2.08M
    _col_id_suffix_to_index.clear();
1137
2.08M
    _field_name_to_index.clear();
1138
2.08M
    _field_uniqueid_to_index.clear();
1139
2.08M
    _cluster_key_uids.clear();
1140
2.08M
    for (const auto& i : schema.cluster_key_uids()) {
1141
62.5k
        _cluster_key_uids.push_back(i);
1142
62.5k
    }
1143
27.6M
    for (auto& column_pb : schema.column()) {
1144
27.6M
        TabletColumnPtr column;
1145
27.6M
        if (reuse_cache_column) {
1146
506k
            auto pair = TabletColumnObjectPool::instance()->insert(
1147
506k
                    deterministic_string_serialize(column_pb));
1148
506k
            column = pair.second;
1149
            // Release the handle quickly, because we use shared ptr to manage column.
1150
            // It often core during tablet schema copy to another schema because handle's
1151
            // reference count should be managed mannually.
1152
506k
            TabletColumnObjectPool::instance()->release(pair.first);
1153
27.1M
        } else {
1154
27.1M
            column = std::make_shared<TabletColumn>();
1155
27.1M
            column->init_from_pb(column_pb);
1156
27.1M
        }
1157
27.6M
        if (ignore_extracted_columns && column->is_extracted_column()) {
1158
0
            continue;
1159
0
        }
1160
27.6M
        if (column->is_key()) {
1161
8.24M
            _num_key_columns++;
1162
8.24M
        }
1163
27.6M
        if (column->is_nullable()) {
1164
14.8M
            _num_null_columns++;
1165
14.8M
        }
1166
27.6M
        if (column->is_variant_type()) {
1167
52.1k
            ++_num_variant_columns;
1168
52.1k
        }
1169
1170
27.6M
        _cols.emplace_back(std::move(column));
1171
27.6M
        if (!_cols.back()->is_extracted_column()) {
1172
27.6M
            _field_name_to_index.emplace(StringRef(_cols.back()->name()), _num_columns);
1173
27.6M
            _field_uniqueid_to_index[_cols.back()->unique_id()] = _num_columns;
1174
27.6M
        }
1175
27.6M
        _num_columns++;
1176
27.6M
    }
1177
2.08M
    for (const auto& index_pb : schema.index()) {
1178
945k
        TabletIndexPtr index;
1179
945k
        if (reuse_cache_column) {
1180
43.3k
            auto pair = TabletColumnObjectPool::instance()->insert_index(
1181
43.3k
                    deterministic_string_serialize(index_pb));
1182
43.3k
            index = pair.second;
1183
            //  Only need the value to be cached by the pool, release it quickly because the handle need
1184
            // record reference count mannually, or it will core during tablet schema copy method.
1185
43.3k
            TabletColumnObjectPool::instance()->release(pair.first);
1186
901k
        } else {
1187
901k
            index = std::make_shared<TabletIndex>();
1188
901k
            index->init_from_pb(index_pb);
1189
901k
        }
1190
945k
        size_t index_pos = _indexes.size();
1191
945k
        _indexes.emplace_back(std::move(index));
1192
945k
        for (int32_t col_uid : _indexes.back()->col_unique_ids()) {
1193
945k
            if (auto field_pattern = _indexes.back()->field_pattern(); !field_pattern.empty()) {
1194
34.1k
                auto& pattern_to_index_map = _index_by_unique_id_with_pattern[col_uid];
1195
34.1k
                pattern_to_index_map[field_pattern].emplace_back(_indexes.back());
1196
910k
            } else {
1197
910k
                IndexKey key = std::make_tuple(_indexes.back()->index_type(), col_uid,
1198
910k
                                               _indexes.back()->get_index_suffix());
1199
910k
                _col_id_suffix_to_index[key].push_back(index_pos);
1200
910k
            }
1201
945k
        }
1202
945k
    }
1203
2.08M
    _num_short_key_columns = schema.num_short_key_columns();
1204
2.08M
    _num_rows_per_row_block = schema.num_rows_per_row_block();
1205
2.08M
    _compress_kind = schema.compress_kind();
1206
2.08M
    _next_column_unique_id = schema.next_column_unique_id();
1207
2.08M
    if (schema.has_bf_fpp()) {
1208
1.35M
        _has_bf_fpp = true;
1209
1.35M
        _bf_fpp = schema.bf_fpp();
1210
1.35M
    } else {
1211
729k
        _has_bf_fpp = false;
1212
729k
        _bf_fpp = BLOOM_FILTER_DEFAULT_FPP;
1213
729k
    }
1214
2.08M
    _is_in_memory = schema.is_in_memory();
1215
2.08M
    _disable_auto_compaction = schema.disable_auto_compaction();
1216
2.08M
    _store_row_column = schema.store_row_column();
1217
2.08M
    _skip_write_index_on_load = schema.skip_write_index_on_load();
1218
2.08M
    _delete_sign_idx = schema.delete_sign_idx();
1219
2.08M
    _sequence_col_idx = schema.sequence_col_idx();
1220
2.08M
    _version_col_idx = schema.version_col_idx();
1221
2.08M
    _skip_bitmap_col_idx = schema.skip_bitmap_col_idx();
1222
2.08M
    _binlog_lsn_col_idx = schema.binlog_lsn_col_idx();
1223
2.08M
    _binlog_op_col_idx = schema.binlog_op_col_idx();
1224
2.08M
    _binlog_tso_col_idx = schema.binlog_tso_col_idx();
1225
2.08M
    _sort_type = schema.sort_type();
1226
2.08M
    _sort_col_num = schema.sort_col_num();
1227
2.08M
    _compression_type = schema.compression_type();
1228
2.08M
    _row_store_page_size = schema.row_store_page_size();
1229
2.08M
    _storage_page_size = schema.storage_page_size();
1230
2.08M
    _storage_dict_page_size = schema.storage_dict_page_size();
1231
2.08M
    _schema_version = schema.schema_version();
1232
2.08M
    if (schema.has_seq_map()) {
1233
2.08M
        auto column_groups_pb = schema.seq_map();
1234
2.08M
        _seq_col_uid_to_value_cols_uid.clear();
1235
2.08M
        _value_col_uid_to_seq_col_uid.clear();
1236
2.08M
        _seq_col_idx_to_value_cols_idx.clear();
1237
2.08M
        _value_col_idx_to_seq_col_idx.clear();
1238
        /*
1239
         * ColumnGroupsPB is a list of cg_pb, and
1240
         * ColumnGroupsPB do not have begin() or end() method.
1241
         * we must use for(i=0;i<xx;i++) loop
1242
         */
1243
2.08M
        for (int i = 0; i < column_groups_pb.cg_size(); i++) {
1244
425
            ColumnGroupPB cg_pb = column_groups_pb.cg(i);
1245
425
            uint32_t key_uid = cg_pb.sequence_column();
1246
425
            auto found = _field_uniqueid_to_index.find(key_uid);
1247
425
            DCHECK(found != _field_uniqueid_to_index.end())
1248
0
                    << "could not find sequence col with unique id = " << key_uid
1249
0
                    << " table_id=" << _table_id;
1250
425
            int32_t seq_index = found->second;
1251
425
            _seq_col_uid_to_value_cols_uid[key_uid] = {};
1252
425
            _seq_col_idx_to_value_cols_idx[seq_index] = {};
1253
636
            for (auto val_uid : cg_pb.columns_in_group()) {
1254
636
                _seq_col_uid_to_value_cols_uid[key_uid].push_back(val_uid);
1255
636
                found = _field_uniqueid_to_index.find(val_uid);
1256
636
                DCHECK(found != _field_uniqueid_to_index.end())
1257
0
                        << "could not find value col with unique id = " << key_uid
1258
0
                        << " table_id=" << _table_id;
1259
636
                int32_t val_index = found->second;
1260
636
                _seq_col_idx_to_value_cols_idx[seq_index].push_back(val_index);
1261
636
            }
1262
425
        }
1263
1264
2.08M
        if (!_seq_col_uid_to_value_cols_uid.empty()) {
1265
            /*
1266
                |** KEY **|        ** VALUE **     |
1267
                ------------------------------------
1268
                |** KEY **|  CDE is value| sequence|
1269
                |----|----|----|----|----|----|----|
1270
                A    B    C    D    E   S1   S2
1271
                0    1    2    3    4    5    6
1272
                for example: _seq_map is {5:{2,3}, 6:{4}}
1273
                then, _value_to_seq = {2:5,3:5,5:5,4:6,6:6}
1274
            */
1275
425
            for (auto& [seq_uid, cols_uid] : _seq_col_uid_to_value_cols_uid) {
1276
636
                for (auto col_uid : cols_uid) {
1277
636
                    _value_col_uid_to_seq_col_uid[col_uid] = seq_uid;
1278
636
                }
1279
425
                _value_col_uid_to_seq_col_uid[seq_uid] = seq_uid;
1280
425
            }
1281
1282
425
            for (auto& [seq_idx, value_cols_idx] : _seq_col_idx_to_value_cols_idx) {
1283
636
                for (auto col_idx : value_cols_idx) {
1284
636
                    _value_col_idx_to_seq_col_idx[col_idx] = seq_idx;
1285
636
                }
1286
425
                _value_col_idx_to_seq_col_idx[seq_idx] = seq_idx;
1287
425
            }
1288
296
        }
1289
2.08M
    }
1290
    // Default to V1 inverted index storage format for backward compatibility if not specified in schema.
1291
2.08M
    if (!schema.has_inverted_index_storage_format()) {
1292
528
        _inverted_index_storage_format = InvertedIndexStorageFormatPB::V1;
1293
2.08M
    } else {
1294
2.08M
        _inverted_index_storage_format = schema.inverted_index_storage_format();
1295
2.08M
    }
1296
1297
2.08M
    _row_store_column_unique_ids.assign(schema.row_store_column_unique_ids().begin(),
1298
2.08M
                                        schema.row_store_column_unique_ids().end());
1299
2.08M
    _deprecated_enable_variant_flatten_nested = schema.enable_variant_flatten_nested();
1300
2.08M
    if (schema.has_storage_format()) {
1301
2.07M
        _storage_format = schema.storage_format();
1302
2.07M
    } else if (schema.is_external_segment_column_meta_used() ||
1303
9.17k
               schema.integer_type_default_use_plain_encoding() ||
1304
9.17k
               schema.binary_plain_encoding_default_impl() ==
1305
8.90k
                       BinaryPlainEncodingTypePB::BINARY_PLAIN_ENCODING_V2) {
1306
        // Old PB without storage_format: any of the three legacy V3-flavor flags implies V3.
1307
1
        _storage_format = TabletStorageFormatPB::TABLET_STORAGE_FORMAT_V3;
1308
9.17k
    } else {
1309
9.17k
        _storage_format = TabletStorageFormatPB::TABLET_STORAGE_FORMAT_V2;
1310
9.17k
    }
1311
2.08M
    update_metadata_size();
1312
2.08M
}
1313
1314
1.26M
void TabletSchema::copy_from(const TabletSchema& tablet_schema) {
1315
1.26M
    TabletSchemaPB tablet_schema_pb;
1316
1.26M
    tablet_schema.to_schema_pb(&tablet_schema_pb);
1317
1.26M
    init_from_pb(tablet_schema_pb);
1318
1.26M
    _table_id = tablet_schema.table_id();
1319
1.26M
    _path_set_info_map = tablet_schema._path_set_info_map;
1320
1.26M
}
1321
1322
78.8k
void TabletSchema::shawdow_copy_without_columns(const TabletSchema& tablet_schema) {
1323
78.8k
    *this = tablet_schema;
1324
78.8k
    _field_path_to_index.clear();
1325
78.8k
    _field_name_to_index.clear();
1326
78.8k
    _field_uniqueid_to_index.clear();
1327
78.8k
    _num_columns = 0;
1328
78.8k
    _num_variant_columns = 0;
1329
78.8k
    _num_null_columns = 0;
1330
78.8k
    _num_key_columns = 0;
1331
78.8k
    _cols.clear();
1332
78.8k
    _delete_sign_idx = -1;
1333
78.8k
    _sequence_col_idx = -1;
1334
78.8k
    _version_col_idx = -1;
1335
78.8k
    _skip_bitmap_col_idx = -1;
1336
78.8k
    _binlog_lsn_col_idx = -1;
1337
78.8k
    _binlog_op_col_idx = -1;
1338
78.8k
    _binlog_tso_col_idx = -1;
1339
78.8k
}
1340
1341
0
void TabletSchema::update_index_info_from(const TabletSchema& tablet_schema) {
1342
0
    for (auto& col : _cols) {
1343
0
        if (col->unique_id() < 0) {
1344
0
            continue;
1345
0
        }
1346
0
        const auto iter = tablet_schema._field_uniqueid_to_index.find(col->unique_id());
1347
0
        if (iter == tablet_schema._field_uniqueid_to_index.end()) {
1348
0
            continue;
1349
0
        }
1350
0
        auto col_idx = iter->second;
1351
0
        if (col_idx < 0 || col_idx >= tablet_schema._cols.size()) {
1352
0
            continue;
1353
0
        }
1354
0
        col->set_is_bf_column(tablet_schema._cols[col_idx]->is_bf_column());
1355
0
    }
1356
0
}
1357
1358
1.27M
std::string TabletSchema::to_key() const {
1359
1.27M
    TabletSchemaPB pb;
1360
1.27M
    to_schema_pb(&pb);
1361
1.27M
    return TabletSchema::deterministic_string_serialize(pb);
1362
1.27M
}
1363
1364
void TabletSchema::build_current_tablet_schema(int64_t index_id, int32_t version,
1365
                                               const OlapTableIndexSchema* index,
1366
56.3k
                                               const TabletSchema& ori_tablet_schema) {
1367
    // copy from ori_tablet_schema
1368
56.3k
    _keys_type = ori_tablet_schema.keys_type();
1369
56.3k
    _num_short_key_columns = ori_tablet_schema.num_short_key_columns();
1370
56.3k
    _num_rows_per_row_block = ori_tablet_schema.num_rows_per_row_block();
1371
56.3k
    _compress_kind = ori_tablet_schema.compress_kind();
1372
1373
    // todo(yixiu): unique_id
1374
56.3k
    _next_column_unique_id = ori_tablet_schema.next_column_unique_id();
1375
56.3k
    _is_in_memory = ori_tablet_schema.is_in_memory();
1376
56.3k
    _disable_auto_compaction = ori_tablet_schema.disable_auto_compaction();
1377
56.3k
    _skip_write_index_on_load = ori_tablet_schema.skip_write_index_on_load();
1378
56.3k
    _sort_type = ori_tablet_schema.sort_type();
1379
56.3k
    _sort_col_num = ori_tablet_schema.sort_col_num();
1380
56.3k
    _row_store_page_size = ori_tablet_schema.row_store_page_size();
1381
56.3k
    _storage_page_size = ori_tablet_schema.storage_page_size();
1382
56.3k
    _storage_dict_page_size = ori_tablet_schema.storage_dict_page_size();
1383
56.3k
    _deprecated_enable_variant_flatten_nested =
1384
56.3k
            ori_tablet_schema.deprecated_variant_flatten_nested();
1385
1386
    // copy from table_schema_param
1387
56.3k
    _schema_version = version;
1388
56.3k
    _num_columns = 0;
1389
56.3k
    _num_variant_columns = 0;
1390
56.3k
    _num_key_columns = 0;
1391
56.3k
    _num_null_columns = 0;
1392
56.3k
    bool has_bf_columns = false;
1393
56.3k
    _cols.clear();
1394
56.3k
    _indexes.clear();
1395
56.3k
    _col_id_suffix_to_index.clear();
1396
56.3k
    _index_by_unique_id_with_pattern.clear();
1397
56.3k
    _field_name_to_index.clear();
1398
56.3k
    _field_uniqueid_to_index.clear();
1399
56.3k
    _delete_sign_idx = -1;
1400
56.3k
    _sequence_col_idx = -1;
1401
56.3k
    _version_col_idx = -1;
1402
56.3k
    _skip_bitmap_col_idx = -1;
1403
56.3k
    _binlog_lsn_col_idx = -1;
1404
56.3k
    _binlog_op_col_idx = -1;
1405
56.3k
    _binlog_tso_col_idx = -1;
1406
56.3k
    _cluster_key_uids.clear();
1407
56.3k
    for (const auto& i : ori_tablet_schema._cluster_key_uids) {
1408
1.41k
        _cluster_key_uids.push_back(i);
1409
1.41k
    }
1410
623k
    for (auto& column : index->columns) {
1411
623k
        if (column->is_key()) {
1412
104k
            _num_key_columns++;
1413
104k
        }
1414
623k
        if (column->is_nullable()) {
1415
341k
            _num_null_columns++;
1416
341k
        }
1417
623k
        if (column->is_bf_column()) {
1418
3.81k
            has_bf_columns = true;
1419
3.81k
        }
1420
623k
        if (column->is_variant_type()) {
1421
2.60k
            ++_num_variant_columns;
1422
2.60k
        }
1423
623k
        if (UNLIKELY(column->name() == DELETE_SIGN)) {
1424
12.9k
            _delete_sign_idx = _num_columns;
1425
610k
        } else if (UNLIKELY(column->name() == SEQUENCE_COL)) {
1426
705
            _sequence_col_idx = _num_columns;
1427
609k
        } else if (UNLIKELY(column->name() == VERSION_COL)) {
1428
12.8k
            _version_col_idx = _num_columns;
1429
596k
        } else if (UNLIKELY(column->name() == SKIP_BITMAP_COL)) {
1430
119
            _skip_bitmap_col_idx = _num_columns;
1431
596k
        } else if (UNLIKELY(column->name() == BINLOG_LSN_COL)) {
1432
0
            _binlog_lsn_col_idx = _num_columns;
1433
596k
        } else if (UNLIKELY(column->name() == BINLOG_OP_COL)) {
1434
0
            _binlog_op_col_idx = _num_columns;
1435
596k
        } else if (UNLIKELY(column->name() == BINLOG_TSO_COL)) {
1436
0
            _binlog_tso_col_idx = _num_columns;
1437
0
        }
1438
        // Reuse TabletColumn object from pool to reduce memory consumption
1439
623k
        TabletColumnPtr new_column;
1440
623k
        ColumnPB column_pb;
1441
623k
        column->to_schema_pb(&column_pb);
1442
623k
        auto pair = TabletColumnObjectPool::instance()->insert(
1443
623k
                deterministic_string_serialize(column_pb));
1444
623k
        new_column = pair.second;
1445
        // Release the handle quickly, because we use shared ptr to manage column
1446
623k
        TabletColumnObjectPool::instance()->release(pair.first);
1447
623k
        _cols.emplace_back(std::move(new_column));
1448
623k
        _field_name_to_index.emplace(StringRef(_cols.back()->name()), _num_columns);
1449
623k
        _field_uniqueid_to_index[_cols.back()->unique_id()] = _num_columns;
1450
623k
        _num_columns++;
1451
623k
    }
1452
1453
56.3k
    for (const auto& i : index->indexes) {
1454
55.1k
        size_t index_pos = _indexes.size();
1455
        // Reuse TabletIndex object from pool to reduce memory consumption
1456
55.1k
        TabletIndexPtr new_index;
1457
55.1k
        TabletIndexPB index_pb;
1458
55.1k
        i->to_schema_pb(&index_pb);
1459
55.1k
        auto pair = TabletColumnObjectPool::instance()->insert_index(
1460
55.1k
                deterministic_string_serialize(index_pb));
1461
55.1k
        new_index = pair.second;
1462
        // Release the handle quickly, because we use shared ptr to manage index
1463
55.1k
        TabletColumnObjectPool::instance()->release(pair.first);
1464
55.1k
        _indexes.emplace_back(std::move(new_index));
1465
55.1k
        for (int32_t col_uid : _indexes.back()->col_unique_ids()) {
1466
55.1k
            if (auto field_pattern = _indexes.back()->field_pattern(); !field_pattern.empty()) {
1467
1.34k
                auto& pattern_to_index_map = _index_by_unique_id_with_pattern[col_uid];
1468
1.34k
                pattern_to_index_map[field_pattern].emplace_back(_indexes.back());
1469
53.7k
            } else {
1470
53.7k
                IndexKey key = std::make_tuple(_indexes.back()->index_type(), col_uid,
1471
53.7k
                                               _indexes.back()->get_index_suffix());
1472
53.7k
                _col_id_suffix_to_index[key].push_back(index_pos);
1473
53.7k
            }
1474
55.1k
        }
1475
55.1k
    }
1476
1477
56.3k
    if (has_bf_columns) {
1478
3.46k
        _has_bf_fpp = true;
1479
3.46k
        _bf_fpp = ori_tablet_schema.bloom_filter_fpp();
1480
52.9k
    } else {
1481
52.9k
        _has_bf_fpp = false;
1482
52.9k
        _bf_fpp = BLOOM_FILTER_DEFAULT_FPP;
1483
52.9k
    }
1484
56.3k
}
1485
1486
8.13k
void TabletSchema::merge_dropped_columns(const TabletSchema& src_schema) {
1487
    // If they are the same tablet schema object, then just return
1488
8.13k
    if (this == &src_schema) {
1489
0
        return;
1490
0
    }
1491
59.8k
    for (const auto& src_col : src_schema.columns()) {
1492
59.8k
        if (_field_uniqueid_to_index.find(src_col->unique_id()) == _field_uniqueid_to_index.end()) {
1493
51
            CHECK(!src_col->is_key())
1494
0
                    << src_col->name() << " is key column, should not be dropped.";
1495
51
            ColumnPB src_col_pb;
1496
            // There are some pointer in tablet column, not sure the reference relation, so
1497
            // that deep copy it.
1498
51
            src_col->to_schema_pb(&src_col_pb);
1499
51
            TabletColumn new_col(src_col_pb);
1500
51
            append_column(new_col, TabletSchema::ColumnType::DROPPED);
1501
51
        }
1502
59.8k
    }
1503
8.13k
}
1504
1505
8.65k
TabletSchemaSPtr TabletSchema::copy_without_variant_extracted_columns() {
1506
8.65k
    TabletSchemaSPtr copy = std::make_shared<TabletSchema>();
1507
8.65k
    copy->shawdow_copy_without_columns(*this);
1508
62.0k
    for (auto& col : this->columns()) {
1509
62.0k
        if (col->is_extracted_column()) {
1510
4.58k
            continue;
1511
4.58k
        }
1512
57.5k
        copy->append_column(*col);
1513
57.5k
    }
1514
8.65k
    return copy;
1515
8.65k
}
1516
1517
// Dropped column is in _field_uniqueid_to_index but not in _field_name_to_index
1518
// Could refer to append_column method
1519
577k
bool TabletSchema::is_dropped_column(const TabletColumn& col) const {
1520
18.4E
    CHECK(_field_uniqueid_to_index.find(col.unique_id()) != _field_uniqueid_to_index.end())
1521
18.4E
            << "could not find col with unique id = " << col.unique_id()
1522
18.4E
            << " and name = " << col.name() << " table_id=" << _table_id;
1523
577k
    auto it = _field_name_to_index.find(StringRef {col.name()});
1524
578k
    return it == _field_name_to_index.end() || _cols[it->second]->unique_id() != col.unique_id();
1525
577k
}
1526
1527
113
void TabletSchema::copy_extracted_columns(const TabletSchema& src_schema) {
1528
113
    std::unordered_set<int32_t> variant_columns;
1529
379
    for (const auto& col : columns()) {
1530
379
        if (col->is_variant_type()) {
1531
236
            variant_columns.insert(col->unique_id());
1532
236
        }
1533
379
    }
1534
312
    for (const TabletColumnPtr& col : src_schema.columns()) {
1535
312
        if (col->is_extracted_column() && variant_columns.contains(col->parent_unique_id())) {
1536
0
            ColumnPB col_pb;
1537
0
            col->to_schema_pb(&col_pb);
1538
0
            TabletColumn new_col(col_pb);
1539
0
            append_column(new_col, ColumnType::VARIANT);
1540
0
        }
1541
312
    }
1542
113
}
1543
1544
106
void TabletSchema::reserve_extracted_columns() {
1545
625
    for (auto it = _cols.begin(); it != _cols.end();) {
1546
519
        if (!(*it)->is_extracted_column()) {
1547
217
            it = _cols.erase(it);
1548
302
        } else {
1549
302
            ++it;
1550
302
        }
1551
519
    }
1552
106
}
1553
1554
3.81M
void TabletSchema::to_schema_pb(TabletSchemaPB* tablet_schema_pb) const {
1555
3.81M
    for (const auto& i : _cluster_key_uids) {
1556
101k
        tablet_schema_pb->add_cluster_key_uids(i);
1557
101k
    }
1558
3.81M
    tablet_schema_pb->set_keys_type(_keys_type);
1559
43.7M
    for (const auto& col : _cols) {
1560
43.7M
        ColumnPB* column = tablet_schema_pb->add_column();
1561
43.7M
        col->to_schema_pb(column);
1562
43.7M
    }
1563
3.81M
    for (const auto& index : _indexes) {
1564
1.61M
        auto* index_pb = tablet_schema_pb->add_index();
1565
1.61M
        index->to_schema_pb(index_pb);
1566
1.61M
    }
1567
3.81M
    tablet_schema_pb->set_num_short_key_columns(cast_set<int32_t>(_num_short_key_columns));
1568
3.81M
    tablet_schema_pb->set_num_rows_per_row_block(cast_set<int32_t>(_num_rows_per_row_block));
1569
3.81M
    tablet_schema_pb->set_compress_kind(_compress_kind);
1570
3.81M
    if (_has_bf_fpp) {
1571
1.59M
        tablet_schema_pb->set_bf_fpp(_bf_fpp);
1572
1.59M
    }
1573
3.81M
    tablet_schema_pb->set_next_column_unique_id(cast_set<uint32_t>(_next_column_unique_id));
1574
3.81M
    tablet_schema_pb->set_is_in_memory(_is_in_memory);
1575
3.81M
    tablet_schema_pb->set_disable_auto_compaction(_disable_auto_compaction);
1576
3.81M
    tablet_schema_pb->set_store_row_column(_store_row_column);
1577
3.81M
    tablet_schema_pb->set_skip_write_index_on_load(_skip_write_index_on_load);
1578
3.81M
    tablet_schema_pb->set_delete_sign_idx(_delete_sign_idx);
1579
3.81M
    tablet_schema_pb->set_sequence_col_idx(_sequence_col_idx);
1580
3.81M
    tablet_schema_pb->set_sort_type(_sort_type);
1581
3.81M
    tablet_schema_pb->set_sort_col_num(cast_set<int32_t>(_sort_col_num));
1582
3.81M
    tablet_schema_pb->set_schema_version(_schema_version);
1583
3.81M
    tablet_schema_pb->set_compression_type(_compression_type);
1584
3.81M
    tablet_schema_pb->set_row_store_page_size(_row_store_page_size);
1585
3.81M
    tablet_schema_pb->set_storage_page_size(_storage_page_size);
1586
3.81M
    tablet_schema_pb->set_storage_dict_page_size(_storage_dict_page_size);
1587
3.81M
    tablet_schema_pb->set_version_col_idx(_version_col_idx);
1588
3.81M
    tablet_schema_pb->set_skip_bitmap_col_idx(_skip_bitmap_col_idx);
1589
3.81M
    tablet_schema_pb->set_binlog_lsn_col_idx(_binlog_lsn_col_idx);
1590
3.81M
    tablet_schema_pb->set_binlog_op_col_idx(_binlog_op_col_idx);
1591
3.81M
    tablet_schema_pb->set_binlog_tso_col_idx(_binlog_tso_col_idx);
1592
3.81M
    tablet_schema_pb->set_inverted_index_storage_format(_inverted_index_storage_format);
1593
3.81M
    tablet_schema_pb->mutable_row_store_column_unique_ids()->Assign(
1594
3.81M
            _row_store_column_unique_ids.begin(), _row_store_column_unique_ids.end());
1595
3.81M
    tablet_schema_pb->set_enable_variant_flatten_nested(_deprecated_enable_variant_flatten_nested);
1596
3.81M
    tablet_schema_pb->set_storage_format(_storage_format);
1597
    // Backward downgrade safety: if a new BE rewrites tablet_meta.json carrying only
1598
    // storage_format and the deployment is then rolled back to an old BE, the old BE
1599
    // does not know the new field and would default-derive V2 for a V3 tablet, causing
1600
    // it to write V2-encoded segments into a V3 tablet. Redundantly emit the three
1601
    // legacy V3-flavor flags so old BEs can recover the format via the prior "any of
1602
    // these implies V3" rule. ~3 bytes per schema PB; only paid for V3 tablets.
1603
3.81M
    if (_storage_format == TabletStorageFormatPB::TABLET_STORAGE_FORMAT_V3) {
1604
1.02M
        tablet_schema_pb->set_is_external_segment_column_meta_used(true);
1605
1.02M
        tablet_schema_pb->set_integer_type_default_use_plain_encoding(true);
1606
1.02M
        tablet_schema_pb->set_binary_plain_encoding_default_impl(
1607
1.02M
                BinaryPlainEncodingTypePB::BINARY_PLAIN_ENCODING_V2);
1608
1.02M
    }
1609
3.81M
    auto column_groups_pb = tablet_schema_pb->mutable_seq_map();
1610
3.81M
    for (const auto& it : _seq_col_uid_to_value_cols_uid) {
1611
684
        uint32_t key = it.first;
1612
684
        ColumnGroupPB* cg_pb = column_groups_pb->add_cg(); // ColumnGroupPB {key: {v1, v2, v3}}
1613
684
        cg_pb->set_sequence_column(key);
1614
1.05k
        for (auto v : it.second) {
1615
1.05k
            cg_pb->add_columns_in_group(v);
1616
1.05k
        }
1617
684
    }
1618
3.81M
}
1619
1620
128k
size_t TabletSchema::row_size() const {
1621
128k
    size_t size = 0;
1622
1.18M
    for (const auto& column : _cols) {
1623
1.18M
        size += column->length();
1624
1.18M
    }
1625
128k
    size += (_num_columns + 7) / 8;
1626
1627
128k
    return size;
1628
128k
}
1629
1630
11.4M
int32_t TabletSchema::field_index(const std::string& field_name) const {
1631
11.4M
    const auto& found = _field_name_to_index.find(StringRef(field_name));
1632
11.4M
    return (found == _field_name_to_index.end()) ? -1 : found->second;
1633
11.4M
}
1634
1635
15.1k
int32_t TabletSchema::field_index(const PathInData& path) const {
1636
15.1k
    const auto& found = _field_path_to_index.find(PathInDataRef(&path));
1637
15.1k
    return (found == _field_path_to_index.end()) ? -1 : found->second;
1638
15.1k
}
1639
1640
36.7M
int32_t TabletSchema::field_index(int32_t col_unique_id) const {
1641
36.7M
    const auto& found = _field_uniqueid_to_index.find(col_unique_id);
1642
36.7M
    return (found == _field_uniqueid_to_index.end()) ? -1 : found->second;
1643
36.7M
}
1644
1645
48.7M
const std::vector<TabletColumnPtr>& TabletSchema::columns() const {
1646
48.7M
    return _cols;
1647
48.7M
}
1648
1649
161M
const TabletColumn& TabletSchema::column(size_t ordinal) const {
1650
18.4E
    DCHECK(ordinal < _num_columns) << "ordinal:" << ordinal << ", _num_columns:" << _num_columns;
1651
161M
    return *_cols[ordinal];
1652
161M
}
1653
1654
1.99M
const TabletColumn& TabletSchema::column_by_uid(int32_t col_unique_id) const {
1655
1.99M
    return *_cols.at(_field_uniqueid_to_index.at(col_unique_id));
1656
1.99M
}
1657
1658
9
TabletColumn& TabletSchema::mutable_column_by_uid(int32_t col_unique_id) {
1659
9
    return *_cols.at(_field_uniqueid_to_index.at(col_unique_id));
1660
9
}
1661
1662
89.8k
TabletColumn& TabletSchema::mutable_column(size_t ordinal) {
1663
89.8k
    return *_cols.at(ordinal);
1664
89.8k
}
1665
1666
1.44M
void TabletSchema::update_indexes_from_thrift(const std::vector<doris::TOlapTableIndex>& tindexes) {
1667
1.44M
    std::vector<TabletIndexPtr> indexes;
1668
1.44M
    for (const auto& tindex : tindexes) {
1669
299k
        TabletIndex index;
1670
299k
        index.init_from_thrift(tindex, *this);
1671
299k
        indexes.emplace_back(std::make_shared<TabletIndex>(std::move(index)));
1672
299k
    }
1673
1.44M
    _indexes = std::move(indexes);
1674
1.44M
    _col_id_suffix_to_index.clear();
1675
1.44M
    _index_by_unique_id_with_pattern.clear();
1676
1.44M
    size_t index_pos = 0;
1677
1.44M
    for (auto& index : _indexes) {
1678
299k
        for (int32_t col_uid : index->col_unique_ids()) {
1679
299k
            if (auto field_pattern = index->field_pattern(); !field_pattern.empty()) {
1680
4.68k
                auto& pattern_to_index_map = _index_by_unique_id_with_pattern[col_uid];
1681
4.68k
                pattern_to_index_map[field_pattern].emplace_back(index);
1682
294k
            } else {
1683
294k
                IndexKey key =
1684
294k
                        std::make_tuple(index->index_type(), col_uid, index->get_index_suffix());
1685
294k
                _col_id_suffix_to_index[key].push_back(index_pos);
1686
294k
            }
1687
299k
        }
1688
299k
        index_pos++;
1689
299k
    }
1690
1.44M
}
1691
1692
5.63k
bool TabletSchema::exist_column(const std::string& field_name) const {
1693
5.63k
    return _field_name_to_index.contains(StringRef {field_name});
1694
5.63k
}
1695
1696
29.5M
bool TabletSchema::has_column_unique_id(int32_t col_unique_id) const {
1697
29.5M
    return _field_uniqueid_to_index.contains(col_unique_id);
1698
29.5M
}
1699
1700
4.18k
Status TabletSchema::have_column(const std::string& field_name) const {
1701
4.18k
    if (!_field_name_to_index.contains(StringRef(field_name))) {
1702
4.04k
        return Status::Error<ErrorCode::INTERNAL_ERROR>(
1703
4.04k
                "Not found field_name, field_name:{}, schema:{}", field_name,
1704
4.04k
                get_all_field_names());
1705
4.04k
    }
1706
132
    return Status::OK();
1707
4.18k
}
1708
1709
5.80k
Result<const TabletColumn*> TabletSchema::column(const std::string& field_name) const {
1710
5.80k
    auto it = _field_name_to_index.find(StringRef {field_name});
1711
5.80k
    if (it == _field_name_to_index.end()) {
1712
0
        DCHECK(false) << "field_name=" << field_name << ", table_id=" << _table_id
1713
0
                      << ", field_name_to_index=" << get_all_field_names();
1714
0
        return ResultError(
1715
0
                Status::InternalError("column not found, name={}, table_id={}, schema_version={}",
1716
0
                                      field_name, _table_id, _schema_version));
1717
0
    }
1718
5.80k
    return _cols[it->second].get();
1719
5.80k
}
1720
1721
void TabletSchema::update_tablet_columns(const TabletSchema& tablet_schema,
1722
14.2k
                                         const std::vector<TColumn>& t_columns) {
1723
14.2k
    copy_from(tablet_schema);
1724
14.2k
    if (!t_columns.empty() && t_columns[0].col_unique_id >= 0) {
1725
14.1k
        clear_columns();
1726
145k
        for (const auto& column : t_columns) {
1727
145k
            append_column(TabletColumn(column));
1728
145k
        }
1729
14.1k
    }
1730
14.2k
}
1731
1732
67
bool TabletSchema::has_inverted_index_with_index_id(int64_t index_id) const {
1733
86
    for (size_t i = 0; i < _indexes.size(); i++) {
1734
48
        if ((_indexes[i]->index_type() == IndexType::INVERTED ||
1735
48
             _indexes[i]->index_type() == IndexType::ANN) &&
1736
48
            _indexes[i]->index_id() == index_id) {
1737
29
            return true;
1738
29
        }
1739
48
    }
1740
38
    return false;
1741
67
}
1742
1743
std::vector<const TabletIndex*> TabletSchema::inverted_indexs(
1744
29.3M
        int32_t col_unique_id, const std::string& suffix_path) const {
1745
29.3M
    std::vector<const TabletIndex*> result;
1746
29.3M
    const std::string escaped_suffix = escape_for_path_name(suffix_path);
1747
29.3M
    auto it = _col_id_suffix_to_index.find(
1748
29.3M
            std::make_tuple(IndexType::INVERTED, col_unique_id, escaped_suffix));
1749
29.3M
    if (it != _col_id_suffix_to_index.end()) {
1750
160k
        for (size_t pos : it->second) {
1751
160k
            if (pos < _indexes.size()) {
1752
160k
                result.push_back(_indexes[pos].get());
1753
160k
            }
1754
160k
        }
1755
159k
    }
1756
29.3M
    return result;
1757
29.3M
}
1758
1759
std::vector<TabletIndexPtr> TabletSchema::inverted_index_by_field_pattern(
1760
12.8k
        int32_t col_unique_id, const std::string& field_pattern) const {
1761
12.8k
    auto id_to_pattern_map = _index_by_unique_id_with_pattern.find(col_unique_id);
1762
12.8k
    if (id_to_pattern_map == _index_by_unique_id_with_pattern.end()) {
1763
6.60k
        return {};
1764
6.60k
    }
1765
6.24k
    auto pattern_to_index_map = id_to_pattern_map->second.find(field_pattern);
1766
6.24k
    if (pattern_to_index_map == id_to_pattern_map->second.end()) {
1767
1.21k
        return {};
1768
1.21k
    }
1769
5.03k
    return pattern_to_index_map->second;
1770
6.24k
}
1771
1772
29.1M
std::vector<const TabletIndex*> TabletSchema::inverted_indexs(const TabletColumn& col) const {
1773
    // Some columns(Float, Double, JSONB ...) from the variant do not support inverted index
1774
29.1M
    if (!segment_v2::IndexColumnWriter::check_support_inverted_index(col)) {
1775
60.7k
        return {};
1776
60.7k
    }
1777
1778
    // TODO use more efficient impl
1779
    // Use parent id if unique not assigned, this could happend when accessing subcolumns of variants
1780
29.0M
    int32_t col_unique_id = col.is_extracted_column() ? col.parent_unique_id() : col.unique_id();
1781
29.0M
    std::vector<const TabletIndex*> result;
1782
29.0M
    if (result = inverted_indexs(col_unique_id, escape_for_path_name(col.suffix_path()));
1783
29.0M
        !result.empty()) {
1784
117k
        return result;
1785
117k
    }
1786
    // variant's typed column has it's own index
1787
28.9M
    else if (col.is_extracted_column() && col.path_info_ptr()->get_is_typed()) {
1788
457
        std::string relative_path = col.path_info_ptr()->copy_pop_front().get_path();
1789
457
        if (_path_set_info_map.find(col_unique_id) == _path_set_info_map.end()) {
1790
0
            return result;
1791
0
        }
1792
457
        const auto& path_set_info = _path_set_info_map.at(col_unique_id);
1793
457
        if (path_set_info.typed_path_set.find(relative_path) ==
1794
457
            path_set_info.typed_path_set.end()) {
1795
0
            return result;
1796
0
        }
1797
457
        for (const auto& index : path_set_info.typed_path_set.at(relative_path).indexes) {
1798
44
            result.push_back(index.get());
1799
44
        }
1800
457
        return result;
1801
457
    }
1802
    // variant's subcolumns has it's own index
1803
28.9M
    else if (col.is_extracted_column()) {
1804
2.44k
        std::string relative_path = col.path_info_ptr()->copy_pop_front().get_path();
1805
2.44k
        if (_path_set_info_map.find(col_unique_id) == _path_set_info_map.end()) {
1806
5
            return result;
1807
5
        }
1808
2.44k
        const auto& path_set_info = _path_set_info_map.at(col_unique_id);
1809
2.44k
        if (path_set_info.subcolumn_indexes.find(relative_path) ==
1810
2.44k
            path_set_info.subcolumn_indexes.end()) {
1811
1.24k
            return result;
1812
1.24k
        }
1813
1.19k
        for (const auto& index : path_set_info.subcolumn_indexes.at(relative_path)) {
1814
38
            result.push_back(index.get());
1815
38
        }
1816
1.19k
    }
1817
28.9M
    return result;
1818
29.0M
}
1819
1820
const TabletIndex* TabletSchema::ann_index(int32_t col_unique_id,
1821
97.5k
                                           const std::string& suffix_path) const {
1822
171k
    for (size_t i = 0; i < _indexes.size(); i++) {
1823
73.7k
        if (_indexes[i]->index_type() == IndexType::ANN) {
1824
219
            for (int32_t id : _indexes[i]->col_unique_ids()) {
1825
219
                if (id == col_unique_id &&
1826
219
                    _indexes[i]->get_index_suffix() == escape_for_path_name(suffix_path)) {
1827
214
                    return _indexes[i].get();
1828
214
                }
1829
219
            }
1830
216
        }
1831
73.7k
    }
1832
97.3k
    return nullptr;
1833
97.5k
}
1834
1835
28.2M
const TabletIndex* TabletSchema::ann_index(const TabletColumn& col) const {
1836
28.2M
    if (!segment_v2::IndexColumnWriter::check_support_ann_index(col)) {
1837
28.1M
        return nullptr;
1838
28.1M
    }
1839
    // TODO use more efficient impl
1840
    // Use parent id if unique not assigned, this could happend when accessing subcolumns of variants
1841
105k
    int32_t col_unique_id = col.is_extracted_column() ? col.parent_unique_id() : col.unique_id();
1842
105k
    return ann_index(col_unique_id, escape_for_path_name(col.suffix_path()));
1843
28.2M
}
1844
1845
0
bool TabletSchema::has_ngram_bf_index(int32_t col_unique_id) const {
1846
0
    IndexKey index_key(IndexType::NGRAM_BF, col_unique_id, "");
1847
0
    auto it = _col_id_suffix_to_index.find(index_key);
1848
0
    return it != _col_id_suffix_to_index.end();
1849
0
}
1850
1851
826k
const TabletIndex* TabletSchema::get_ngram_bf_index(int32_t col_unique_id) const {
1852
    // Get the ngram bf index for the given column unique id
1853
826k
    IndexKey index_key(IndexType::NGRAM_BF, col_unique_id, "");
1854
826k
    auto it = _col_id_suffix_to_index.find(index_key);
1855
826k
    if (it != _col_id_suffix_to_index.end()) {
1856
3.41k
        if (!it->second.empty() && it->second[0] < _indexes.size()) {
1857
3.41k
            return _indexes[it->second[0]].get();
1858
3.41k
        }
1859
3.41k
    }
1860
823k
    return nullptr;
1861
826k
}
1862
1863
const TabletIndex* TabletSchema::get_index(int32_t col_unique_id, IndexType index_type,
1864
14
                                           const std::string& suffix_path) const {
1865
14
    IndexKey index_key(index_type, col_unique_id, suffix_path);
1866
14
    auto it = _col_id_suffix_to_index.find(index_key);
1867
14
    if (it != _col_id_suffix_to_index.end()) {
1868
12
        if (!it->second.empty() && it->second[0] < _indexes.size()) {
1869
12
            return _indexes[it->second[0]].get();
1870
12
        }
1871
12
    }
1872
2
    return nullptr;
1873
14
}
1874
1875
Block TabletSchema::create_block(
1876
        const std::vector<uint32_t>& return_columns,
1877
3.67M
        const std::unordered_set<uint32_t>* tablet_columns_need_convert_null) const {
1878
3.67M
    Block block;
1879
49.1M
    for (int i = 0; i < return_columns.size(); ++i) {
1880
45.4M
        const ColumnId cid = return_columns[i];
1881
45.4M
        const auto& col = *_cols[cid];
1882
45.4M
        bool is_nullable = (tablet_columns_need_convert_null != nullptr &&
1883
45.4M
                            tablet_columns_need_convert_null->find(cid) !=
1884
45.3M
                                    tablet_columns_need_convert_null->end());
1885
45.4M
        auto data_type = DataTypeFactory::instance().create_data_type(col, is_nullable);
1886
45.4M
        if (col.type() == FieldType::OLAP_FIELD_TYPE_STRUCT ||
1887
45.4M
            col.type() == FieldType::OLAP_FIELD_TYPE_MAP ||
1888
45.4M
            col.type() == FieldType::OLAP_FIELD_TYPE_ARRAY) {
1889
128k
            if (_pruned_columns_data_type.contains(col.unique_id())) {
1890
124k
                data_type = _pruned_columns_data_type.at(col.unique_id());
1891
124k
            }
1892
128k
        }
1893
1894
45.4M
        if (_vir_col_idx_to_unique_id.contains(cid)) {
1895
365
            block.insert({ColumnNothing::create(0), data_type, col.name()});
1896
18.4E
            VLOG_DEBUG << fmt::format(
1897
18.4E
                    "Create block from tablet schema, column cid {} is virtual column, col_name: "
1898
18.4E
                    "{}, col_unique_id: {}, type {}",
1899
18.4E
                    cid, col.name(), col.unique_id(), data_type->get_name());
1900
45.4M
        } else {
1901
45.4M
            block.insert({data_type->create_column(), data_type, col.name()});
1902
45.4M
        }
1903
45.4M
    }
1904
3.67M
    return block;
1905
3.67M
}
1906
1907
54.4k
Block TabletSchema::create_block() const {
1908
54.4k
    Block block;
1909
578k
    for (const auto& col : _cols) {
1910
578k
        if (is_dropped_column(*col)) {
1911
13
            continue;
1912
13
        }
1913
1914
578k
        auto data_type = DataTypeFactory::instance().create_data_type(*col);
1915
578k
        if (col->type() == FieldType::OLAP_FIELD_TYPE_STRUCT) {
1916
1.05k
            if (_pruned_columns_data_type.contains(col->unique_id())) {
1917
0
                data_type = _pruned_columns_data_type.at(col->unique_id());
1918
0
            }
1919
1.05k
        }
1920
578k
        block.insert({data_type->create_column(), data_type, col->name()});
1921
578k
    }
1922
54.4k
    return block;
1923
54.4k
}
1924
1925
2.68k
Block TabletSchema::create_block_by_cids(const std::vector<uint32_t>& cids) const {
1926
2.68k
    Block block;
1927
17.7k
    for (const auto& cid : cids) {
1928
17.7k
        const auto& col = *_cols[cid];
1929
17.7k
        auto data_type = DataTypeFactory::instance().create_data_type(col);
1930
17.7k
        if (col.type() == FieldType::OLAP_FIELD_TYPE_STRUCT) {
1931
17
            if (_pruned_columns_data_type.contains(col.unique_id())) {
1932
0
                data_type = _pruned_columns_data_type.at(col.unique_id());
1933
0
            }
1934
17
        }
1935
17.7k
        block.insert({data_type->create_column(), data_type, col.name()});
1936
17.7k
    }
1937
2.68k
    return block;
1938
2.68k
}
1939
1940
845
bool operator==(const TabletColumn& a, const TabletColumn& b) {
1941
845
    if (a._unique_id != b._unique_id) return false;
1942
845
    if (a._col_name != b._col_name) return false;
1943
845
    if (a._type != b._type) return false;
1944
845
    if (a._is_key != b._is_key) return false;
1945
845
    if (a._aggregation != b._aggregation) return false;
1946
845
    if (a._is_nullable != b._is_nullable) return false;
1947
845
    if (a._has_default_value != b._has_default_value) return false;
1948
845
    if (a._has_default_value) {
1949
421
        if (a._default_value != b._default_value) return false;
1950
421
    }
1951
845
    if (a._is_decimal != b._is_decimal) return false;
1952
846
    if (a._is_decimal) {
1953
846
        if (a._precision != b._precision) return false;
1954
846
        if (a._frac != b._frac) return false;
1955
846
    }
1956
845
    if (a._length != b._length) return false;
1957
845
    if (a._index_length != b._index_length) return false;
1958
845
    if (a._is_bf_column != b._is_bf_column) return false;
1959
845
    if (a._column_path == nullptr && a._column_path != nullptr) return false;
1960
845
    if (b._column_path == nullptr && a._column_path != nullptr) return false;
1961
845
    if (b._column_path != nullptr && a._column_path != nullptr &&
1962
845
        *a._column_path != *b._column_path)
1963
0
        return false;
1964
845
    return true;
1965
845
}
1966
1967
846
bool operator!=(const TabletColumn& a, const TabletColumn& b) {
1968
846
    return !(a == b);
1969
846
}
1970
1971
111
bool operator==(const TabletSchema& a, const TabletSchema& b) {
1972
111
    if (a._keys_type != b._keys_type) return false;
1973
111
    if (a._cols.size() != b._cols.size()) return false;
1974
957
    for (int i = 0; i < a._cols.size(); ++i) {
1975
846
        if (*a._cols[i] != *b._cols[i]) return false;
1976
846
    }
1977
111
    if (a._num_columns != b._num_columns) return false;
1978
111
    if (a._num_key_columns != b._num_key_columns) return false;
1979
111
    if (a._num_null_columns != b._num_null_columns) return false;
1980
111
    if (a._num_short_key_columns != b._num_short_key_columns) return false;
1981
111
    if (a._num_rows_per_row_block != b._num_rows_per_row_block) return false;
1982
111
    if (a._compress_kind != b._compress_kind) return false;
1983
111
    if (a._next_column_unique_id != b._next_column_unique_id) return false;
1984
111
    if (a._has_bf_fpp != b._has_bf_fpp) return false;
1985
111
    if (a._has_bf_fpp) {
1986
9
        if (std::abs(a._bf_fpp - b._bf_fpp) > 1e-6) return false;
1987
9
    }
1988
111
    if (a._is_in_memory != b._is_in_memory) return false;
1989
111
    if (a._delete_sign_idx != b._delete_sign_idx) return false;
1990
111
    if (a._sequence_col_idx != b._sequence_col_idx) return false;
1991
111
    if (a._version_col_idx != b._version_col_idx) return false;
1992
111
    if (a._skip_bitmap_col_idx != b._skip_bitmap_col_idx) return false;
1993
111
    if (a._binlog_lsn_col_idx != b._binlog_lsn_col_idx) return false;
1994
111
    if (a._binlog_op_col_idx != b._binlog_op_col_idx) return false;
1995
111
    if (a._binlog_tso_col_idx != b._binlog_tso_col_idx) return false;
1996
111
    if (a._disable_auto_compaction != b._disable_auto_compaction) return false;
1997
111
    if (a._store_row_column != b._store_row_column) return false;
1998
111
    if (a._row_store_page_size != b._row_store_page_size) return false;
1999
111
    if (a._storage_page_size != b._storage_page_size) return false;
2000
111
    if (a._storage_dict_page_size != b._storage_dict_page_size) return false;
2001
111
    if (a._skip_write_index_on_load != b._skip_write_index_on_load) return false;
2002
111
    if (a._deprecated_enable_variant_flatten_nested !=
2003
111
        b._deprecated_enable_variant_flatten_nested) {
2004
0
        return false;
2005
0
    }
2006
111
    if (a._storage_format != b._storage_format) return false;
2007
111
    return true;
2008
111
}
2009
2010
111
bool operator!=(const TabletSchema& a, const TabletSchema& b) {
2011
111
    return !(a == b);
2012
111
}
2013
} // namespace doris