be/src/format/parquet/bool_rle_decoder.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "format/parquet/bool_rle_decoder.h" |
19 | | |
20 | | #include <glog/logging.h> |
21 | | |
22 | | #include <algorithm> |
23 | | #include <ostream> |
24 | | #include <string> |
25 | | |
26 | | #include "core/column/column_vector.h" |
27 | | #include "core/types.h" |
28 | | #include "format/parquet/parquet_common.h" |
29 | | #include "util/coding.h" |
30 | | #include "util/slice.h" |
31 | | |
32 | | namespace doris { |
33 | 6 | Status BoolRLEDecoder::set_data(Slice* slice) { |
34 | 6 | _data = slice; |
35 | 6 | _num_bytes = slice->size; |
36 | 6 | _offset = 0; |
37 | 6 | if (_num_bytes < 4) { |
38 | 1 | return Status::IOError("Received invalid length : " + std::to_string(_num_bytes) + |
39 | 1 | " (corrupt data page?)"); |
40 | 1 | } |
41 | | // Load the first 4 bytes in little-endian, which indicates the length |
42 | 5 | const auto* data = reinterpret_cast<const uint8_t*>(_data->data); |
43 | 5 | uint32_t num_bytes = decode_fixed32_le(data); |
44 | 5 | if (num_bytes > static_cast<uint32_t>(_num_bytes - 4)) { |
45 | 0 | return Status::IOError("Received invalid number of bytes : " + std::to_string(num_bytes) + |
46 | 0 | " (corrupt data page?)"); |
47 | 0 | } |
48 | 5 | _num_bytes = num_bytes; |
49 | 5 | auto decoder_data = data + 4; |
50 | 5 | _decoder = RleDecoder<uint8_t>(decoder_data, num_bytes, 1); |
51 | 5 | return Status::OK(); |
52 | 5 | } |
53 | | |
54 | 1 | Status BoolRLEDecoder::skip_values(size_t num_values) { |
55 | 1 | _decoder.Skip(num_values); |
56 | 1 | return Status::OK(); |
57 | 1 | } |
58 | | |
59 | | Status BoolRLEDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, |
60 | 5 | ColumnSelectVector& select_vector, bool is_dict_filter) { |
61 | 5 | if (select_vector.has_filter()) { |
62 | 2 | return _decode_values<true>(doris_column, data_type, select_vector, is_dict_filter); |
63 | 3 | } else { |
64 | 3 | return _decode_values<false>(doris_column, data_type, select_vector, is_dict_filter); |
65 | 3 | } |
66 | 5 | } |
67 | | |
68 | | template <bool has_filter> |
69 | | Status BoolRLEDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, |
70 | 5 | ColumnSelectVector& select_vector, bool is_dict_filter) { |
71 | 5 | auto& column_data = assert_cast<ColumnUInt8*>(doris_column.get())->get_data(); |
72 | 5 | size_t data_index = column_data.size(); |
73 | 5 | column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); |
74 | 5 | size_t max_values = select_vector.num_values() - select_vector.num_nulls(); |
75 | 5 | _values.resize(max_values); |
76 | 5 | if (!_decoder.get_values(_values.data(), max_values)) { |
77 | 0 | return Status::IOError("Can't read enough booleans in rle decoder"); |
78 | 0 | } |
79 | 5 | size_t current_value_idx = 0; |
80 | 5 | ColumnSelectVector::DataReadType read_type; |
81 | 24 | while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) { |
82 | 19 | switch (read_type) { |
83 | 9 | case ColumnSelectVector::CONTENT: { |
84 | 9 | bool value; // Can't use uint8_t directly, we should correct it. |
85 | 36 | for (size_t i = 0; i < run_length; ++i) { |
86 | 27 | DCHECK(current_value_idx < max_values) |
87 | 0 | << current_value_idx << " vs. " << max_values; |
88 | 27 | value = _values[current_value_idx++]; |
89 | 27 | column_data[data_index++] = (UInt8)value; |
90 | 27 | } |
91 | 9 | break; |
92 | 0 | } |
93 | 2 | case ColumnSelectVector::NULL_DATA: { |
94 | 2 | data_index += run_length; |
95 | 2 | break; |
96 | 0 | } |
97 | 8 | case ColumnSelectVector::FILTERED_CONTENT: { |
98 | 8 | current_value_idx += run_length; |
99 | 8 | break; |
100 | 0 | } |
101 | 0 | case ColumnSelectVector::FILTERED_NULL: { |
102 | 0 | break; |
103 | 0 | } |
104 | 19 | } |
105 | 19 | } |
106 | 5 | return Status::OK(); |
107 | 5 | } _ZN5doris14BoolRLEDecoder14_decode_valuesILb1EEENS_6StatusERNS_3COWINS_7IColumnEE11mutable_ptrIS4_EERSt10shared_ptrIKNS_9IDataTypeEERNS_18ColumnSelectVectorEb Line | Count | Source | 70 | 2 | ColumnSelectVector& select_vector, bool is_dict_filter) { | 71 | 2 | auto& column_data = assert_cast<ColumnUInt8*>(doris_column.get())->get_data(); | 72 | 2 | size_t data_index = column_data.size(); | 73 | 2 | column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); | 74 | 2 | size_t max_values = select_vector.num_values() - select_vector.num_nulls(); | 75 | 2 | _values.resize(max_values); | 76 | 2 | if (!_decoder.get_values(_values.data(), max_values)) { | 77 | 0 | return Status::IOError("Can't read enough booleans in rle decoder"); | 78 | 0 | } | 79 | 2 | size_t current_value_idx = 0; | 80 | 2 | ColumnSelectVector::DataReadType read_type; | 81 | 18 | while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) { | 82 | 16 | switch (read_type) { | 83 | 6 | case ColumnSelectVector::CONTENT: { | 84 | 6 | bool value; // Can't use uint8_t directly, we should correct it. | 85 | 12 | for (size_t i = 0; i < run_length; ++i) { | 86 | 6 | DCHECK(current_value_idx < max_values) | 87 | 0 | << current_value_idx << " vs. " << max_values; | 88 | 6 | value = _values[current_value_idx++]; | 89 | 6 | column_data[data_index++] = (UInt8)value; | 90 | 6 | } | 91 | 6 | break; | 92 | 0 | } | 93 | 2 | case ColumnSelectVector::NULL_DATA: { | 94 | 2 | data_index += run_length; | 95 | 2 | break; | 96 | 0 | } | 97 | 8 | case ColumnSelectVector::FILTERED_CONTENT: { | 98 | 8 | current_value_idx += run_length; | 99 | 8 | break; | 100 | 0 | } | 101 | 0 | case ColumnSelectVector::FILTERED_NULL: { | 102 | 0 | break; | 103 | 0 | } | 104 | 16 | } | 105 | 16 | } | 106 | 2 | return Status::OK(); | 107 | 2 | } |
_ZN5doris14BoolRLEDecoder14_decode_valuesILb0EEENS_6StatusERNS_3COWINS_7IColumnEE11mutable_ptrIS4_EERSt10shared_ptrIKNS_9IDataTypeEERNS_18ColumnSelectVectorEb Line | Count | Source | 70 | 3 | ColumnSelectVector& select_vector, bool is_dict_filter) { | 71 | 3 | auto& column_data = assert_cast<ColumnUInt8*>(doris_column.get())->get_data(); | 72 | 3 | size_t data_index = column_data.size(); | 73 | 3 | column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); | 74 | 3 | size_t max_values = select_vector.num_values() - select_vector.num_nulls(); | 75 | 3 | _values.resize(max_values); | 76 | 3 | if (!_decoder.get_values(_values.data(), max_values)) { | 77 | 0 | return Status::IOError("Can't read enough booleans in rle decoder"); | 78 | 0 | } | 79 | 3 | size_t current_value_idx = 0; | 80 | 3 | ColumnSelectVector::DataReadType read_type; | 81 | 6 | while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) { | 82 | 3 | switch (read_type) { | 83 | 3 | case ColumnSelectVector::CONTENT: { | 84 | 3 | bool value; // Can't use uint8_t directly, we should correct it. | 85 | 24 | for (size_t i = 0; i < run_length; ++i) { | 86 | 21 | DCHECK(current_value_idx < max_values) | 87 | 0 | << current_value_idx << " vs. " << max_values; | 88 | 21 | value = _values[current_value_idx++]; | 89 | 21 | column_data[data_index++] = (UInt8)value; | 90 | 21 | } | 91 | 3 | break; | 92 | 0 | } | 93 | 0 | case ColumnSelectVector::NULL_DATA: { | 94 | 0 | data_index += run_length; | 95 | 0 | break; | 96 | 0 | } | 97 | 0 | case ColumnSelectVector::FILTERED_CONTENT: { | 98 | 0 | current_value_idx += run_length; | 99 | 0 | break; | 100 | 0 | } | 101 | 0 | case ColumnSelectVector::FILTERED_NULL: { | 102 | 0 | break; | 103 | 0 | } | 104 | 3 | } | 105 | 3 | } | 106 | 3 | return Status::OK(); | 107 | 3 | } |
|
108 | | |
109 | | } // namespace doris |