be/src/format/text/text_reader.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "format/text/text_reader.h" |
19 | | |
20 | | #include <gen_cpp/PlanNodes_types.h> |
21 | | #include <gen_cpp/Types_types.h> |
22 | | #include <glog/logging.h> |
23 | | |
24 | | #include <cstddef> |
25 | | #include <vector> |
26 | | |
27 | | #include "common/compiler_util.h" // IWYU pragma: keep |
28 | | #include "common/status.h" |
29 | | #include "core/block/block.h" |
30 | | #include "exec/scan/scanner.h" |
31 | | #include "format/csv/csv_reader.h" |
32 | | #include "format/file_reader/new_plain_text_line_reader.h" |
33 | | #include "format/line_reader.h" |
34 | | #include "io/file_factory.h" |
35 | | #include "io/fs/buffered_reader.h" |
36 | | #include "io/fs/file_reader.h" |
37 | | #include "io/fs/s3_file_reader.h" |
38 | | #include "runtime/descriptors.h" |
39 | | #include "runtime/runtime_state.h" |
40 | | |
41 | | namespace doris { |
42 | | |
43 | 252k | void HiveTextFieldSplitter::do_split(const Slice& line, std::vector<Slice>* splitted_values) { |
44 | 252k | if (_value_sep_len == 1) { |
45 | 252k | _split_field_single_char(line, splitted_values); |
46 | 18.4E | } else { |
47 | 18.4E | _split_field_multi_char(line, splitted_values); |
48 | 18.4E | } |
49 | 252k | } |
50 | | |
51 | | void HiveTextFieldSplitter::_split_field_single_char(const Slice& line, |
52 | 252k | std::vector<Slice>* splitted_values) { |
53 | 252k | const char* data = line.data; |
54 | 252k | const size_t size = line.size; |
55 | 252k | size_t value_start = 0; |
56 | 135M | for (size_t i = 0; i < size; ++i) { |
57 | 135M | if (data[i] == _value_sep[0]) { |
58 | | // hive will escape the field separator in string |
59 | 9.38M | if (_escape_char != 0 && i > 0 && data[i - 1] == _escape_char) { |
60 | 53 | continue; |
61 | 53 | } |
62 | 9.38M | process_value_func(data, value_start, i - value_start, _trimming_char, splitted_values); |
63 | 9.38M | value_start = i + _value_sep_len; |
64 | 9.38M | } |
65 | 135M | } |
66 | 252k | process_value_func(data, value_start, size - value_start, _trimming_char, splitted_values); |
67 | 252k | } |
68 | | |
69 | | void HiveTextFieldSplitter::_split_field_multi_char(const Slice& line, |
70 | 17 | std::vector<Slice>* splitted_values) { |
71 | 17 | const char* data = line.data; |
72 | 17 | const size_t size = line.size; |
73 | 17 | size_t start = 0; |
74 | | |
75 | 17 | std::vector<int> next(_value_sep_len); |
76 | 17 | next[0] = -1; |
77 | 50 | for (int i = 1, j = -1; i < (int)_value_sep_len; i++) { |
78 | 33 | while (j >= 0 && _value_sep[i] != _value_sep[j + 1]) { |
79 | 0 | j = next[j]; |
80 | 0 | } |
81 | 33 | if (_value_sep[i] == _value_sep[j + 1]) { |
82 | 22 | j++; |
83 | 22 | } |
84 | 33 | next[i] = j; |
85 | 33 | } |
86 | | |
87 | | // KMP search |
88 | 198 | for (int i = 0, j = -1; i < (int)size; i++) { |
89 | 200 | while (j >= 0 && data[i] != _value_sep[j + 1]) { |
90 | 19 | j = next[j]; |
91 | 19 | } |
92 | 181 | if (data[i] == _value_sep[j + 1]) { |
93 | 90 | j++; |
94 | 90 | } |
95 | 181 | if (j == (int)_value_sep_len - 1) { |
96 | 34 | size_t curpos = i - _value_sep_len + 1; |
97 | 34 | if (_escape_char != 0 && curpos > 0 && data[curpos - 1] == _escape_char) { |
98 | 2 | j = next[j]; |
99 | 2 | continue; |
100 | 2 | } |
101 | | |
102 | 32 | if (curpos >= start) { |
103 | 25 | process_value_func(data, start, curpos - start, _trimming_char, splitted_values); |
104 | 25 | start = curpos + _value_sep_len; |
105 | 25 | } |
106 | | |
107 | 32 | j = next[j]; |
108 | 32 | } |
109 | 181 | } |
110 | 17 | process_value_func(data, start, size - start, _trimming_char, splitted_values); |
111 | 17 | } |
112 | | |
113 | | TextReader::TextReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounter* counter, |
114 | | const TFileScanRangeParams& params, const TFileRangeDesc& range, |
115 | | const std::vector<SlotDescriptor*>& file_slot_descs, io::IOContext* io_ctx) |
116 | 4.57k | : CsvReader(state, profile, counter, params, range, file_slot_descs, io_ctx) {} |
117 | | |
118 | 4.57k | Status TextReader::_init_options() { |
119 | | // get column_separator and line_delimiter |
120 | 4.57k | _value_separator = _params.file_attributes.text_params.column_separator; |
121 | 4.57k | _value_separator_length = _value_separator.size(); |
122 | 4.57k | _line_delimiter = _params.file_attributes.text_params.line_delimiter; |
123 | 4.57k | _line_delimiter_length = _line_delimiter.size(); |
124 | | |
125 | 4.57k | if (_params.file_attributes.text_params.__isset.escape) { |
126 | 3.42k | _escape = _params.file_attributes.text_params.escape; |
127 | 3.42k | } |
128 | 4.57k | _options.escape_char = _escape; |
129 | | |
130 | 4.57k | _options.collection_delim = _params.file_attributes.text_params.collection_delimiter[0]; |
131 | 4.57k | _options.map_key_delim = _params.file_attributes.text_params.mapkv_delimiter[0]; |
132 | | |
133 | 4.57k | _options.null_format = _params.file_attributes.text_params.null_format.data(); |
134 | 4.57k | _options.null_len = _params.file_attributes.text_params.null_format.length(); |
135 | | |
136 | 4.57k | return Status::OK(); |
137 | 4.57k | } |
138 | | |
139 | 6.77M | Status TextReader::_deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice) { |
140 | 6.77M | return serde->deserialize_one_cell_from_hive_text(column, slice, _options); |
141 | 6.77M | } |
142 | | |
143 | 4.55k | Status TextReader::_create_line_reader() { |
144 | 4.55k | std::shared_ptr<TextLineReaderContextIf> text_line_reader_ctx; |
145 | | |
146 | 4.55k | text_line_reader_ctx = std::make_shared<PlainTextLineReaderCtx>(_line_delimiter, |
147 | 4.55k | _line_delimiter_length, false); |
148 | | |
149 | 4.55k | _fields_splitter = std::make_unique<HiveTextFieldSplitter>( |
150 | 4.55k | false, false, _value_separator, _value_separator_length, -1, _escape); |
151 | | |
152 | 4.55k | _line_reader = |
153 | 4.55k | NewPlainTextLineReader::create_unique(_profile, _file_reader, _decompressor.get(), |
154 | 4.55k | text_line_reader_ctx, _size, _start_offset); |
155 | | |
156 | 4.55k | return Status::OK(); |
157 | 4.55k | } |
158 | | |
159 | 391k | Status TextReader::_validate_line(const Slice& line, bool* success) { |
160 | | // text file do not need utf8 check |
161 | 391k | *success = true; |
162 | 391k | return Status::OK(); |
163 | 391k | } |
164 | | |
165 | 1.61M | Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice) { |
166 | 1.61M | auto& null_column = assert_cast<ColumnNullable&>(column); |
167 | 1.61M | if (slice.compare(Slice(_options.null_format, _options.null_len)) == 0) { |
168 | 1.04k | null_column.insert_data(nullptr, 0); |
169 | 1.04k | return Status::OK(); |
170 | 1.04k | } |
171 | 1.61M | static DataTypeStringSerDe stringSerDe(TYPE_STRING); |
172 | 1.61M | auto st = stringSerDe.deserialize_one_cell_from_hive_text(null_column.get_nested_column(), |
173 | 1.61M | slice, _options); |
174 | 1.61M | if (!st.ok()) { |
175 | | // fill null if fail |
176 | 0 | null_column.insert_data(nullptr, 0); // 0 is meaningless here |
177 | 0 | return Status::OK(); |
178 | 0 | } |
179 | | // fill not null if success |
180 | 1.61M | null_column.get_null_map_data().push_back(0); |
181 | 1.61M | return Status::OK(); |
182 | 1.61M | } |
183 | | |
184 | | } // namespace doris |