be/src/exec/common/variant_util.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <gen_cpp/Descriptors_types.h> |
21 | | #include <parallel_hashmap/phmap.h> |
22 | | |
23 | | #include <cstddef> |
24 | | #include <cstdint> |
25 | | #include <map> |
26 | | #include <string> |
27 | | #include <string_view> |
28 | | #include <unordered_map> |
29 | | #include <vector> |
30 | | |
31 | | #include "common/status.h" |
32 | | #include "core/column/column.h" |
33 | | #include "core/column/column_variant.h" |
34 | | #include "core/data_type/data_type.h" |
35 | | #include "core/field.h" |
36 | | #include "core/string_ref.h" |
37 | | #include "core/types.h" |
38 | | #include "exprs/aggregate/aggregate_function.h" |
39 | | #include "storage/tablet/tablet_fwd.h" |
40 | | #include "storage/tablet/tablet_schema.h" |
41 | | #include "util/json/json_parser.h" |
42 | | /* Forward declarations of doris types, plus the ColumnString and JsonParser aliases used by the parsing declarations further down. */ |
43 | | namespace doris { |
44 | | class TabletSchema; |
45 | | enum class FieldType; |
46 | | namespace segment_v2 { |
47 | | struct VariantStatisticsPB; |
48 | | } // namespace segment_v2 |
49 | | class Block; |
50 | | class IColumn; |
51 | | struct ColumnWithTypeAndName; |
52 | | class SimdJSONParser; |
53 | | enum class ExtractType; |
54 | | template <typename ParserImpl> |
55 | | class JSONDataParser; |
56 | | template <typename T> |
57 | | class ColumnStr; |
58 | | using ColumnString = ColumnStr<UInt32>; |
59 | | using JsonParser = JSONDataParser<SimdJSONParser>; |
60 | | } // namespace doris |
61 | | /* Path-name constants. NOTE(review): these are non-inline const std::string objects declared at global namespace scope in a header (outside namespace doris), so every including translation unit gets its own copy; consider `inline const` if the project builds as C++17 or later. */ |
62 | | const std::string SPARSE_COLUMN_PATH = "__DORIS_VARIANT_SPARSE__"; |
63 | | const std::string DOC_VALUE_COLUMN_PATH = "__DORIS_VARIANT_DOC_VALUE__"; |
64 | | namespace doris::variant_util { |
65 | | |
66 | | // Convert a restricted glob pattern into a regex (for tests/internal use). Returns a Status; the regex is presumably written to *regex_pattern (definition not visible here). |
67 | | Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern); |
68 | | |
69 | | // Match glob_pattern against candidate_path using RE2; the bool result presumably reports whether it matched. |
70 | | bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path); |
71 | | /* Per-path aggregates keyed by path: an int64_t count of non-null values, and a vector of data types. */ |
72 | | using PathToNoneNullValues = std::unordered_map<std::string, int64_t>; |
73 | | using PathToDataTypes = std::unordered_map<PathInData, std::vector<DataTypePtr>, PathInData::Hash>; |
74 | | /* Returns true unless column.variant_enable_nested_group() is set. NOTE(review): the three should_* predicates in this header currently share the same body. */ |
75 | 47.2k | inline bool should_record_variant_path_stats(const TabletColumn& column) { |
76 | 47.2k | return !column.variant_enable_nested_group(); |
77 | 47.2k | } |
78 | | /* Returns true unless column.variant_enable_nested_group() is set. */ |
79 | 11.3k | inline bool should_write_variant_binary_columns(const TabletColumn& column) { |
80 | 11.3k | return !column.variant_enable_nested_group(); |
81 | 11.3k | } |
82 | | /* Returns true unless column.variant_enable_nested_group() is set. */ |
83 | 12.0k | inline bool should_check_variant_path_stats(const TabletColumn& column) { |
84 | 12.0k | return !column.variant_enable_nested_group(); |
85 | 12.0k | } |
86 | | /* Per-variant-column information; VariantCompactionUtil::aggregate_variant_extended_info takes an output map from an int32_t uid to this struct. NOTE(review): std::unordered_set is used here, but this header does not include <unordered_set> directly. */ |
87 | | struct VariantExtendedInfo { |
88 | | PathToNoneNullValues path_to_none_null_values; // key: path, value: number of non-null values |
89 | | std::unordered_set<std::string> sparse_paths; // sparse paths in this variant column |
90 | | std::unordered_set<std::string> typed_paths; // typed paths in this variant column |
91 | | std::unordered_set<PathInData, PathInData::Hash> |
92 | | nested_paths; // nested paths in this variant column |
93 | | PathToDataTypes path_to_data_types; // key: path, value: data types |
94 | | bool has_nested_group = false; // whether this variant column has a nested group |
95 | | }; |
96 | | |
97 | | /// Returns the number of dimensions of an Array type; 0 if `type` is not an array. |
98 | | size_t get_number_of_dimensions(const IDataType& type); |
99 | | |
100 | | /// Returns the number of dimensions of an Array column; 0 if `column` is not an array. |
101 | | size_t get_number_of_dimensions(const IColumn& column); |
102 | | |
103 | | /// Returns the scalar (innermost element) type of an Array of arbitrary dimensions. |
104 | | DataTypePtr get_base_type_of_array(const DataTypePtr& type); |
105 | | |
106 | | // Cast the column in `arg` to the destination `type`; the cast column is presumably returned through `result` (definition not visible here). |
107 | | Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result); |
108 | | /* Extra column information passed to get_column_by_type(): unique ids plus the path info. */ |
109 | | struct ExtraInfo { |
110 | | // -1 indicates it's not a Frontend generated column |
111 | | int32_t unique_id = -1; |
112 | | int32_t parent_unique_id = -1; |
113 | | PathInData path_info; |
114 | | }; |
115 | | /* Builds a TabletColumn named `name` for `data_type`; presumably applies the ids and path carried in `ext_info` (definition not visible here). */ |
116 | | TabletColumn get_column_by_type(const DataTypePtr& data_type, const std::string& name, |
117 | | const ExtraInfo& ext_info); |
118 | | |
119 | | // Check whether `paths` contains ambiguous paths. |
120 | | // Situation: |
121 | | // a prefix exists with matched names, but not matched structure (is Nested, number of dimensions). NOTE(review): the earlier wording said "throw exception", but the signature returns Status - confirm how the failure is reported. |
122 | | Status check_variant_has_no_ambiguous_paths(const std::vector<PathInData>& paths); |
123 | | |
124 | | // Pick the tablet schema with the highest schema version as the reference. |
125 | | // Then update all variant columns to their least common types. |
126 | | // The final merged schema is the common schema (presumably returned through `result`, since the function itself returns Status). |
127 | | // If base_schema == nullptr, the tablet schema with the max schema version is picked as the base schema. |
128 | | Status get_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas, |
129 | | const TabletSchemaSPtr& base_schema, TabletSchemaSPtr& result, |
130 | | bool check_schema_size = false); |
131 | | |
132 | | // Get least common types for extracted columns which have Path info, |
133 | | // for the variant column with the specified unique id. NOTE(review): std::set is used here, but this header does not include <set> directly. |
134 | | Status update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas, |
135 | | TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id, |
136 | | std::set<PathInData>* path_set); |
137 | | |
138 | | // Inherit attributes like index/agg info from its parent column. |
139 | | void inherit_column_attributes(TabletSchemaSPtr& schema); |
140 | | |
141 | | // source: variant column |
142 | | // target: extracted column from the variant column (target_schema is optional and defaults to nullptr) |
143 | | void inherit_column_attributes(const TabletColumn& source, TabletColumn& target, |
144 | | TabletSchemaSPtr* target_schema = nullptr); |
145 | | |
146 | | // Align variant subcolumn BF inheritance with FE BF-supported types. |
147 | | bool is_bf_supported_by_fe_for_variant_subcolumn(FieldType type); |
148 | | |
149 | | // Get the sorted subcolumns of a variant (returned by value). |
150 | | ColumnVariant::Subcolumns get_sorted_subcolumns(const ColumnVariant::Subcolumns& subcolumns); |
151 | | /* Compares index information of column new_col_idx in new_schema with column old_col_idx in old_schema; presumably returns true when they differ (confirm in the definition). */ |
152 | | bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* old_schema, |
153 | | int32_t new_col_idx, int32_t old_col_idx); |
154 | | |
155 | | // Create the sparse column for `variant`: a ColumnMap<String, String>. |
156 | | TabletColumn create_sparse_column(const TabletColumn& variant); |
157 | | |
158 | | // Create one bucket sparse column: name = variant.name_lower_case() + "." + SPARSE_COLUMN_PATH + ".b{index}" |
159 | | TabletColumn create_sparse_shard_column(const TabletColumn& variant, int bucket_index); |
160 | | /* Creates the doc value column of `variant` for bucket_index; the naming presumably uses DOC_VALUE_COLUMN_PATH (confirm in the definition). */ |
161 | | TabletColumn create_doc_value_column(const TabletColumn& variant, int bucket_index); |
162 | | |
163 | | // Compute bucket id for given path string using SipHash64(path) % bucket_num. |
164 | | uint32_t variant_binary_shard_of(const StringRef& path, uint32_t bucket_num); |
165 | | /* Presumably fills *info with information derived from `field` (definition not visible here). */ |
166 | | void get_field_info(const Field& field, FieldInfo* info); |
167 | | |
168 | | // Inherit index from parent column; this overload describes the sub-column by its field type and suffix path. |
169 | | bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes, |
170 | | TabletIndexes& sub_column_indexes, FieldType column_type, |
171 | | const std::string& suffix_path, bool is_array_nested_type = false); |
172 | | /* Overload that takes the sub-column as a TabletColumn. */ |
173 | | bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes, |
174 | | TabletIndexes& sub_column_indexes, const TabletColumn& column); |
175 | | /* Overload that takes the sub-column as a segment_v2::ColumnMetaPB. */ |
176 | | bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes, |
177 | | TabletIndexes& sub_column_indexes, const segment_v2::ColumnMetaPB& column_pb); |
178 | | /* Takes per-path subcolumn types and typed columns directly; path_set is optional and defaults to nullptr. Presumably the shared implementation behind update_least_common_schema (confirm in the definition). */ |
179 | | Status update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolumns_types, |
180 | | TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id, |
181 | | const std::map<std::string, TabletColumnPtr>& typed_columns, |
182 | | std::set<PathInData>* path_set = nullptr); |
183 | | /* Presumably fills *sub_column_info for `path` under the column with unique id col_unique_id in `schema`; the meaning of the bool result is not visible here. */ |
184 | | bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, |
185 | | const std::string& path, |
186 | | TabletSchema::SubColumnInfo* sub_column_info); |
187 | | /* Static helpers for handling variant columns during compaction; the class declares only static member functions and no data members. */ |
188 | | class VariantCompactionUtil { |
189 | | public: |
190 | | // Get the subpaths and sparse paths for the variant column. |
191 | | static void get_subpaths(int32_t max_subcolumns_count, const PathToNoneNullValues& path_stats, |
192 | | TabletSchema::PathsSetInfo& paths_set_info); |
193 | | |
194 | | // Collect extended info from the variant column; the output map is keyed by an int32_t uid. |
195 | | static Status aggregate_variant_extended_info( |
196 | | const RowsetSharedPtr& rs, |
197 | | std::unordered_map<int32_t, VariantExtendedInfo>* uid_to_variant_extended_info); |
198 | | |
199 | | // Collect path stats from the variant column; the output map is keyed by an int32_t uid. |
200 | | static Status aggregate_path_to_stats( |
201 | | const RowsetSharedPtr& rs, |
202 | | std::unordered_map<int32_t, PathToNoneNullValues>* uid_to_path_stats); |
203 | | |
204 | | // Build the temporary schema for compaction; this reduces the memory usage of compacting variant columns. |
205 | | static Status get_extended_compaction_schema(const std::vector<RowsetSharedPtr>& rowsets, |
206 | | TabletSchemaSPtr& target); |
207 | | |
208 | | // Used to collect all the subcolumn types of variant columns from rowsets. |
209 | | static TabletSchemaSPtr calculate_variant_extended_schema( |
210 | | const std::vector<RowsetSharedPtr>& rowsets, const TabletSchemaSPtr& base_schema); |
211 | | |
212 | | // Check if the path stats are consistent between the input rowsets and the output rowset. |
213 | | // Used to check the correctness of compaction. NOTE(review): the parameter name `intputs` looks like a misspelling of `inputs`. |
214 | | static Status check_path_stats(const std::vector<RowsetSharedPtr>& intputs, |
215 | | RowsetSharedPtr output, BaseTabletSPtr tablet); |
216 | | |
217 | | // Calculate statistics about variant data paths from the encoded sparse column. |
218 | | static void calculate_variant_stats(const IColumn& encoded_sparse_column, |
219 | | segment_v2::VariantStatisticsPB* stats, |
220 | | size_t max_sparse_column_statistics_size, size_t row_pos, |
221 | | size_t num_rows); |
222 | | |
223 | | static void get_compaction_subcolumns_from_subpaths( |
224 | | TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column, |
225 | | const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types, |
226 | | const std::unordered_set<std::string>& sparse_paths, TabletSchemaSPtr& output_schema); |
227 | | |
228 | | static void get_compaction_subcolumns_from_data_types( |
229 | | TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column, |
230 | | const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types, |
231 | | TabletSchemaSPtr& output_schema); |
232 | | |
233 | | static Status get_compaction_typed_columns(const TabletSchemaSPtr& target, |
234 | | const std::unordered_set<std::string>& typed_paths, |
235 | | const TabletColumnPtr parent_column, |
236 | | TabletSchemaSPtr& output_schema, |
237 | | TabletSchema::PathsSetInfo& paths_set_info); |
238 | | |
239 | | static Status get_compaction_nested_columns( |
240 | | const std::unordered_set<PathInData, PathInData::Hash>& nested_paths, |
241 | | const PathToDataTypes& path_to_data_types, const TabletColumnPtr parent_column, |
242 | | TabletSchemaSPtr& output_schema, TabletSchema::PathsSetInfo& paths_set_info); |
243 | | }; |
244 | | |
245 | | // Parse a batch of json strings into a column object; throws doris::Exception on failure. |
246 | | // Used only by unit tests. |
247 | | void parse_json_to_variant(IColumn& column, const ColumnString& raw_json_column, |
248 | | const ParseConfig& config); |
249 | | |
250 | | // Parse variant columns by picking variant positions from `variant_pos` and using provided ParseConfigs. |
251 | | // Used only by unit tests. |
252 | | Status parse_and_materialize_variant_columns(Block& block, const std::vector<uint32_t>& variant_pos, |
253 | | const std::vector<ParseConfig>& configs); |
254 | | |
255 | | // Parse a single json; throws doris::Exception on failure. |
256 | | void parse_json_to_variant(IColumn& column, const StringRef& jsons, JsonParser* parser, |
257 | | const ParseConfig& config); |
258 | | |
259 | | // Parse variant columns by picking variant positions from `column_pos` and generating ParseConfig |
260 | | // based on tablet schema settings (flatten nested / doc snapshot mode). |
261 | | Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& tablet_schema, |
262 | | const std::vector<uint32_t>& column_pos); |
263 | | |
264 | | // Parse doc snapshot column (paths/values/offsets stored in ColumnVariant) into per-path subcolumns. |
265 | | // NOTE: Returned map keys are `std::string_view` pointing into the underlying doc snapshot paths |
266 | | // column, so the input `variant` must outlive the returned map. |
267 | | phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> materialize_docs_to_subcolumns_map( |
268 | | const ColumnVariant& variant); |
269 | | |
270 | | } // namespace doris::variant_util |