be/src/format/parquet/bool_rle_decoder.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "format/parquet/bool_rle_decoder.h" |
19 | | |
20 | | #include <glog/logging.h> |
21 | | |
22 | | #include <algorithm> |
23 | | #include <ostream> |
24 | | #include <string> |
25 | | |
26 | | #include "core/column/column_vector.h" |
27 | | #include "core/types.h" |
28 | | #include "format/parquet/parquet_common.h" |
29 | | #include "util/coding.h" |
30 | | #include "util/slice.h" |
31 | | |
32 | | namespace doris { |
33 | | #include "common/compile_check_begin.h" |
34 | 6 | Status BoolRLEDecoder::set_data(Slice* slice) { |
35 | 6 | _data = slice; |
36 | 6 | _num_bytes = slice->size; |
37 | 6 | _offset = 0; |
38 | 6 | if (_num_bytes < 4) { |
39 | 1 | return Status::IOError("Received invalid length : " + std::to_string(_num_bytes) + |
40 | 1 | " (corrupt data page?)"); |
41 | 1 | } |
42 | | // Load the first 4 bytes in little-endian, which indicates the length |
43 | 5 | const auto* data = reinterpret_cast<const uint8_t*>(_data->data); |
44 | 5 | uint32_t num_bytes = decode_fixed32_le(data); |
45 | 5 | if (num_bytes > static_cast<uint32_t>(_num_bytes - 4)) { |
46 | 0 | return Status::IOError("Received invalid number of bytes : " + std::to_string(num_bytes) + |
47 | 0 | " (corrupt data page?)"); |
48 | 0 | } |
49 | 5 | _num_bytes = num_bytes; |
50 | 5 | auto decoder_data = data + 4; |
51 | 5 | _decoder = RleDecoder<uint8_t>(decoder_data, num_bytes, 1); |
52 | 5 | return Status::OK(); |
53 | 5 | } |
54 | | |
55 | 1 | Status BoolRLEDecoder::skip_values(size_t num_values) { |
56 | 1 | _decoder.Skip(num_values); |
57 | 1 | return Status::OK(); |
58 | 1 | } |
59 | | |
60 | | Status BoolRLEDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, |
61 | 5 | ColumnSelectVector& select_vector, bool is_dict_filter) { |
62 | 5 | if (select_vector.has_filter()) { |
63 | 2 | return _decode_values<true>(doris_column, data_type, select_vector, is_dict_filter); |
64 | 3 | } else { |
65 | 3 | return _decode_values<false>(doris_column, data_type, select_vector, is_dict_filter); |
66 | 3 | } |
67 | 5 | } |
68 | | |
69 | | template <bool has_filter> |
70 | | Status BoolRLEDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, |
71 | 5 | ColumnSelectVector& select_vector, bool is_dict_filter) { |
72 | 5 | auto& column_data = assert_cast<ColumnUInt8*>(doris_column.get())->get_data(); |
73 | 5 | size_t data_index = column_data.size(); |
74 | 5 | column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); |
75 | 5 | size_t max_values = select_vector.num_values() - select_vector.num_nulls(); |
76 | 5 | _values.resize(max_values); |
77 | 5 | if (!_decoder.get_values(_values.data(), max_values)) { |
78 | 0 | return Status::IOError("Can't read enough booleans in rle decoder"); |
79 | 0 | } |
80 | 5 | size_t current_value_idx = 0; |
81 | 5 | ColumnSelectVector::DataReadType read_type; |
82 | 24 | while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) { |
83 | 19 | switch (read_type) { |
84 | 9 | case ColumnSelectVector::CONTENT: { |
85 | 9 | bool value; // Can't use uint8_t directly, we should correct it. |
86 | 36 | for (size_t i = 0; i < run_length; ++i) { |
87 | 27 | DCHECK(current_value_idx < max_values) |
88 | 0 | << current_value_idx << " vs. " << max_values; |
89 | 27 | value = _values[current_value_idx++]; |
90 | 27 | column_data[data_index++] = (UInt8)value; |
91 | 27 | } |
92 | 9 | break; |
93 | 0 | } |
94 | 2 | case ColumnSelectVector::NULL_DATA: { |
95 | 2 | data_index += run_length; |
96 | 2 | break; |
97 | 0 | } |
98 | 8 | case ColumnSelectVector::FILTERED_CONTENT: { |
99 | 8 | current_value_idx += run_length; |
100 | 8 | break; |
101 | 0 | } |
102 | 0 | case ColumnSelectVector::FILTERED_NULL: { |
103 | 0 | break; |
104 | 0 | } |
105 | 19 | } |
106 | 19 | } |
107 | 5 | return Status::OK(); |
108 | 5 | } _ZN5doris14BoolRLEDecoder14_decode_valuesILb1EEENS_6StatusERNS_3COWINS_7IColumnEE11mutable_ptrIS4_EERSt10shared_ptrIKNS_9IDataTypeEERNS_18ColumnSelectVectorEb Line | Count | Source | 71 | 2 | ColumnSelectVector& select_vector, bool is_dict_filter) { | 72 | 2 | auto& column_data = assert_cast<ColumnUInt8*>(doris_column.get())->get_data(); | 73 | 2 | size_t data_index = column_data.size(); | 74 | 2 | column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); | 75 | 2 | size_t max_values = select_vector.num_values() - select_vector.num_nulls(); | 76 | 2 | _values.resize(max_values); | 77 | 2 | if (!_decoder.get_values(_values.data(), max_values)) { | 78 | 0 | return Status::IOError("Can't read enough booleans in rle decoder"); | 79 | 0 | } | 80 | 2 | size_t current_value_idx = 0; | 81 | 2 | ColumnSelectVector::DataReadType read_type; | 82 | 18 | while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) { | 83 | 16 | switch (read_type) { | 84 | 6 | case ColumnSelectVector::CONTENT: { | 85 | 6 | bool value; // Can't use uint8_t directly, we should correct it. | 86 | 12 | for (size_t i = 0; i < run_length; ++i) { | 87 | 6 | DCHECK(current_value_idx < max_values) | 88 | 0 | << current_value_idx << " vs. " << max_values; | 89 | 6 | value = _values[current_value_idx++]; | 90 | 6 | column_data[data_index++] = (UInt8)value; | 91 | 6 | } | 92 | 6 | break; | 93 | 0 | } | 94 | 2 | case ColumnSelectVector::NULL_DATA: { | 95 | 2 | data_index += run_length; | 96 | 2 | break; | 97 | 0 | } | 98 | 8 | case ColumnSelectVector::FILTERED_CONTENT: { | 99 | 8 | current_value_idx += run_length; | 100 | 8 | break; | 101 | 0 | } | 102 | 0 | case ColumnSelectVector::FILTERED_NULL: { | 103 | 0 | break; | 104 | 0 | } | 105 | 16 | } | 106 | 16 | } | 107 | 2 | return Status::OK(); | 108 | 2 | } |
_ZN5doris14BoolRLEDecoder14_decode_valuesILb0EEENS_6StatusERNS_3COWINS_7IColumnEE11mutable_ptrIS4_EERSt10shared_ptrIKNS_9IDataTypeEERNS_18ColumnSelectVectorEb Line | Count | Source | 71 | 3 | ColumnSelectVector& select_vector, bool is_dict_filter) { | 72 | 3 | auto& column_data = assert_cast<ColumnUInt8*>(doris_column.get())->get_data(); | 73 | 3 | size_t data_index = column_data.size(); | 74 | 3 | column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); | 75 | 3 | size_t max_values = select_vector.num_values() - select_vector.num_nulls(); | 76 | 3 | _values.resize(max_values); | 77 | 3 | if (!_decoder.get_values(_values.data(), max_values)) { | 78 | 0 | return Status::IOError("Can't read enough booleans in rle decoder"); | 79 | 0 | } | 80 | 3 | size_t current_value_idx = 0; | 81 | 3 | ColumnSelectVector::DataReadType read_type; | 82 | 6 | while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) { | 83 | 3 | switch (read_type) { | 84 | 3 | case ColumnSelectVector::CONTENT: { | 85 | 3 | bool value; // Can't use uint8_t directly, we should correct it. | 86 | 24 | for (size_t i = 0; i < run_length; ++i) { | 87 | 21 | DCHECK(current_value_idx < max_values) | 88 | 0 | << current_value_idx << " vs. " << max_values; | 89 | 21 | value = _values[current_value_idx++]; | 90 | 21 | column_data[data_index++] = (UInt8)value; | 91 | 21 | } | 92 | 3 | break; | 93 | 0 | } | 94 | 0 | case ColumnSelectVector::NULL_DATA: { | 95 | 0 | data_index += run_length; | 96 | 0 | break; | 97 | 0 | } | 98 | 0 | case ColumnSelectVector::FILTERED_CONTENT: { | 99 | 0 | current_value_idx += run_length; | 100 | 0 | break; | 101 | 0 | } | 102 | 0 | case ColumnSelectVector::FILTERED_NULL: { | 103 | 0 | break; | 104 | 0 | } | 105 | 3 | } | 106 | 3 | } | 107 | 3 | return Status::OK(); | 108 | 3 | } |
|
109 | | #include "common/compile_check_end.h" |
110 | | |
111 | | } // namespace doris |