be/src/format/text/text_reader.cpp

Source
﻿// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "format/text/text_reader.h"

#include <gen_cpp/PlanNodes_types.h>
#include <gen_cpp/Types_types.h>
#include <glog/logging.h>

#include <cstddef>
#include <vector>

#include "common/compiler_util.h" // IWYU pragma: keep
#include "common/status.h"
#include "core/block/block.h"
#include "exec/scan/scanner.h"
#include "format/csv/csv_reader.h"
#include "format/file_reader/new_plain_text_line_reader.h"
#include "format/line_reader.h"
#include "io/file_factory.h"
#include "io/fs/buffered_reader.h"
#include "io/fs/file_reader.h"
#include "io/fs/s3_file_reader.h"
#include "runtime/descriptors.h"
#include "runtime/runtime_state.h"

namespace doris {

// Splits one line into field slices, appending them to `splitted_values`.
// A separator of exactly one byte takes the single-character scan; every other
// separator length is routed to the multi-character (KMP) scan.
void HiveTextFieldSplitter::do_split(const Slice& line, std::vector<Slice>* splitted_values) {
    if (_value_sep_len != 1) {
        _split_field_multi_char(line, splitted_values);
        return;
    }
    _split_field_single_char(line, splitted_values);
}

// Splits `line` on the one-byte separator `_value_sep[0]`.
//
// Each field is reported through `process_value_func(data, offset, length,
// _trimming_char, splitted_values)`; the final call covers whatever follows the
// last separator (possibly an empty range).
//
// Escape handling: when `_escape_char` is non-zero, an escape byte consumes the
// byte that follows it, so that following byte is never treated as a separator.
// Escape bytes are paired from left to right, which means an escaped escape
// (e.g. `\\` before the separator) does NOT escape the separator. Merely looking
// at "is the previous byte an escape char" gets this case wrong and glues two
// fields together when a value ends with the escape character itself.
void HiveTextFieldSplitter::_split_field_single_char(const Slice& line,
                                                     std::vector<Slice>* splitted_values) {
    const char* data = line.data;
    const size_t size = line.size;
    const char sep = _value_sep[0];
    size_t value_start = 0;
    size_t i = 0;
    while (i < size) {
        if (data[i] == sep) {
            // Unescaped separator: close the current field and start the next one
            // right after it (_value_sep_len is 1 on this path).
            process_value_func(data, value_start, i - value_start, _trimming_char, splitted_values);
            value_start = i + _value_sep_len;
            ++i;
        } else if (_escape_char != 0 && data[i] == _escape_char && i + 1 < size) {
            // Skip the escape byte together with the byte it escapes. A trailing
            // escape byte with nothing after it is treated as ordinary data below.
            i += 2;
        } else {
            ++i;
        }
    }
    process_value_func(data, value_start, size - value_start, _trimming_char, splitted_values);
}

// Splits `line` on the multi-byte separator `_value_sep` using a KMP search.
//
// Each field is reported through `process_value_func(data, offset, length,
// _trimming_char, splitted_values)`; the final call covers whatever follows the
// last accepted separator.
//
// A separator occurrence is ignored when `_escape_char` is non-zero and the byte
// immediately before the occurrence equals `_escape_char`. Occurrences that
// overlap a separator already consumed (start before `start`) are ignored too.
void HiveTextFieldSplitter::_split_field_multi_char(const Slice& line,
                                                    std::vector<Slice>* splitted_values) {
    const char* data = line.data;
    const size_t size = line.size;

    // do_split() sends every length other than 1 here, including 0. An empty
    // separator cannot match anything, and building the failure table for it
    // would write past an empty vector, so report the whole line as one field.
    if (_value_sep_len == 0) {
        process_value_func(data, 0, size, _trimming_char, splitted_values);
        return;
    }

    const int sep_len = static_cast<int>(_value_sep_len);
    const int last = sep_len - 1;

    // KMP failure table: next[k] is the index of the last byte of the longest
    // proper prefix of _value_sep[0..k] that is also a suffix of it, or -1.
    std::vector<int> next(_value_sep_len);
    next[0] = -1;
    for (int i = 1, j = -1; i < sep_len; i++) {
        while (j >= 0 && _value_sep[i] != _value_sep[j + 1]) {
            j = next[j];
        }
        if (_value_sep[i] == _value_sep[j + 1]) {
            j++;
        }
        next[i] = j;
    }

    // KMP search. The data index is a size_t so that lines longer than INT_MAX
    // are still scanned; only the pattern state `j` (bounded by the separator
    // length) is kept as an int.
    size_t start = 0;
    int j = -1;
    for (size_t i = 0; i < size; ++i) {
        while (j >= 0 && data[i] != _value_sep[j + 1]) {
            j = next[j];
        }
        if (data[i] == _value_sep[j + 1]) {
            j++;
        }
        if (j != last) {
            continue;
        }

        // Full match ending at i; at least sep_len bytes were consumed, so this
        // subtraction cannot wrap.
        const size_t curpos = i + 1 - _value_sep_len;
        const bool escaped =
                _escape_char != 0 && curpos > 0 && data[curpos - 1] == _escape_char;
        if (!escaped && curpos >= start) {
            process_value_func(data, start, curpos - start, _trimming_char, splitted_values);
            start = curpos + _value_sep_len;
        }

        // Fall back in the pattern so the search continues after this match.
        j = next[j];
    }
    process_value_func(data, start, size - start, _trimming_char, splitted_values);
}

// Constructs a TextReader by forwarding every argument, unchanged and in the
// same order, to the CsvReader base-class constructor. The constructor body is
// empty; text-specific settings are filled in later by _init_options().
TextReader::TextReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounter* counter,
                       const TFileScanRangeParams& params, const TFileRangeDesc& range,
                       const std::vector<SlotDescriptor*>& file_slot_descs, io::IOContext* io_ctx)
        : CsvReader(state, profile, counter, params, range, file_slot_descs, io_ctx) {}

// Copies the text-format settings out of the scan parameters
// (_params.file_attributes.text_params) into this reader's members and into
// _options. Always returns Status::OK().
Status TextReader::_init_options() {
    const auto& text_params = _params.file_attributes.text_params;

    // Field separator and line delimiter, with their byte lengths cached.
    _value_separator = text_params.column_separator;
    _value_separator_length = _value_separator.size();
    _line_delimiter = text_params.line_delimiter;
    _line_delimiter_length = _line_delimiter.size();

    // The escape character is optional; _escape keeps its existing value when unset.
    if (text_params.__isset.escape) {
        _escape = text_params.escape;
    }
    _options.escape_char = _escape;

    // Only the first byte of each nested-type delimiter is used.
    _options.collection_delim = text_params.collection_delimiter[0];
    _options.map_key_delim = text_params.mapkv_delimiter[0];

    // _options.null_format points into the parameter struct's string; it is not copied.
    const auto& null_format = text_params.null_format;
    _options.null_format = null_format.data();
    _options.null_len = null_format.length();

    return Status::OK();
}

// Deserializes one cell into `column` by delegating to the serde's Hive-text
// entry point with this reader's _options; the serde's status is returned as is.
Status TextReader::_deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice) {
    Status result = serde->deserialize_one_cell_from_hive_text(column, slice, _options);
    return result;
}

// Builds the per-file reading pipeline: a plain-text line-reader context for the
// configured line delimiter, a HiveTextFieldSplitter for the configured field
// separator and escape character, and the line reader itself. Always returns
// Status::OK().
Status TextReader::_create_line_reader() {
    std::shared_ptr<TextLineReaderContextIf> line_ctx = std::make_shared<PlainTextLineReaderCtx>(
            _line_delimiter, _line_delimiter_length, false);

    // Both boolean flags are false, and -1 is passed in the argument slot that
    // precedes the escape character.
    _fields_splitter = std::make_unique<HiveTextFieldSplitter>(
            false, false, _value_separator, _value_separator_length, -1, _escape);

    _line_reader = NewPlainTextLineReader::create_unique(
            _profile, _file_reader, _decompressor.get(), line_ctx, _size, _start_offset);

    return Status::OK();
}

// Line validation hook for text files. `line` is not inspected: the function
// unconditionally sets `*success` to true and returns Status::OK().
Status TextReader::_validate_line(const Slice& line, bool* success) {
    // Text files do not need a UTF-8 check.
    *success = true;
    return Status::OK();
}

// Deserializes one string cell into a nullable column (`column` is cast to
// ColumnNullable). A NULL row is inserted when the slice equals the configured
// null format or when the string serde reports a failure; otherwise the nested
// column receives the value and a 0 is appended to the null map. The function
// itself always returns Status::OK().
Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice) {
    auto& nullable_col = assert_cast<ColumnNullable&>(column);

    const Slice null_token(_options.null_format, _options.null_len);
    if (slice.compare(null_token) != 0) {
        static DataTypeStringSerDe stringSerDe(TYPE_STRING);
        auto parse_status = stringSerDe.deserialize_one_cell_from_hive_text(
                nullable_col.get_nested_column(), slice, _options);
        if (parse_status.ok()) {
            // Value was written to the nested column: mark the row as not null.
            nullable_col.get_null_map_data().push_back(0);
            return Status::OK();
        }
    }

    // Either the null literal matched or parsing failed: store NULL.
    // The length argument (0) is meaningless here.
    nullable_col.insert_data(nullptr, 0);
    return Status::OK();
}

} // namespace doris

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17
18		#include "format/text/text_reader.h"
19
20		#include <gen_cpp/PlanNodes_types.h>
21		#include <gen_cpp/Types_types.h>
22		#include <glog/logging.h>
23
24		#include <cstddef>
25		#include <vector>
26
27		#include "common/compiler_util.h" // IWYU pragma: keep
28		#include "common/status.h"
29		#include "core/block/block.h"
30		#include "exec/scan/scanner.h"
31		#include "format/csv/csv_reader.h"
32		#include "format/file_reader/new_plain_text_line_reader.h"
33		#include "format/line_reader.h"
34		#include "io/file_factory.h"
35		#include "io/fs/buffered_reader.h"
36		#include "io/fs/file_reader.h"
37		#include "io/fs/s3_file_reader.h"
38		#include "runtime/descriptors.h"
39		#include "runtime/runtime_state.h"
40
41		namespace doris {
42
43	252k	void HiveTextFieldSplitter::do_split(const Slice& line, std::vector<Slice>* splitted_values) {
44	252k	if (_value_sep_len == 1) {
45	252k	_split_field_single_char(line, splitted_values);
46	18.4E	} else {
47	18.4E	_split_field_multi_char(line, splitted_values);
48	18.4E	}
49	252k	}
50
51		void HiveTextFieldSplitter::_split_field_single_char(const Slice& line,
52	252k	std::vector<Slice>* splitted_values) {
53	252k	const char* data = line.data;
54	252k	const size_t size = line.size;
55	252k	size_t value_start = 0;
56	135M	for (size_t i = 0; i < size; ++i) {
57	135M	if (data[i] == _value_sep[0]) {
58		// hive will escape the field separator in string
59	9.38M	if (_escape_char != 0 && i > 0 && data[i - 1] == _escape_char) {
60	53	continue;
61	53	}
62	9.38M	process_value_func(data, value_start, i - value_start, _trimming_char, splitted_values);
63	9.38M	value_start = i + _value_sep_len;
64	9.38M	}
65	135M	}
66	252k	process_value_func(data, value_start, size - value_start, _trimming_char, splitted_values);
67	252k	}
68
69		void HiveTextFieldSplitter::_split_field_multi_char(const Slice& line,
70	17	std::vector<Slice>* splitted_values) {
71	17	const char* data = line.data;
72	17	const size_t size = line.size;
73	17	size_t start = 0;
74
75	17	std::vector<int> next(_value_sep_len);
76	17	next[0] = -1;
77	50	for (int i = 1, j = -1; i < (int)_value_sep_len; i++) {
78	33	while (j >= 0 && _value_sep[i] != _value_sep[j + 1]) {
79	0	j = next[j];
80	0	}
81	33	if (_value_sep[i] == _value_sep[j + 1]) {
82	22	j++;
83	22	}
84	33	next[i] = j;
85	33	}
86
87		// KMP search
88	198	for (int i = 0, j = -1; i < (int)size; i++) {
89	200	while (j >= 0 && data[i] != _value_sep[j + 1]) {
90	19	j = next[j];
91	19	}
92	181	if (data[i] == _value_sep[j + 1]) {
93	90	j++;
94	90	}
95	181	if (j == (int)_value_sep_len - 1) {
96	34	size_t curpos = i - _value_sep_len + 1;
97	34	if (_escape_char != 0 && curpos > 0 && data[curpos - 1] == _escape_char) {
98	2	j = next[j];
99	2	continue;
100	2	}
101
102	32	if (curpos >= start) {
103	25	process_value_func(data, start, curpos - start, _trimming_char, splitted_values);
104	25	start = curpos + _value_sep_len;
105	25	}
106
107	32	j = next[j];
108	32	}
109	181	}
110	17	process_value_func(data, start, size - start, _trimming_char, splitted_values);
111	17	}
112
113		TextReader::TextReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounter* counter,
114		const TFileScanRangeParams& params, const TFileRangeDesc& range,
115		const std::vector<SlotDescriptor*>& file_slot_descs, io::IOContext* io_ctx)
116	4.57k	: CsvReader(state, profile, counter, params, range, file_slot_descs, io_ctx) {}
117
118	4.57k	Status TextReader::_init_options() {
119		// get column_separator and line_delimiter
120	4.57k	_value_separator = _params.file_attributes.text_params.column_separator;
121	4.57k	_value_separator_length = _value_separator.size();
122	4.57k	_line_delimiter = _params.file_attributes.text_params.line_delimiter;
123	4.57k	_line_delimiter_length = _line_delimiter.size();
124
125	4.57k	if (_params.file_attributes.text_params.__isset.escape) {
126	3.42k	_escape = _params.file_attributes.text_params.escape;
127	3.42k	}
128	4.57k	_options.escape_char = _escape;
129
130	4.57k	_options.collection_delim = _params.file_attributes.text_params.collection_delimiter[0];
131	4.57k	_options.map_key_delim = _params.file_attributes.text_params.mapkv_delimiter[0];
132
133	4.57k	_options.null_format = _params.file_attributes.text_params.null_format.data();
134	4.57k	_options.null_len = _params.file_attributes.text_params.null_format.length();
135
136	4.57k	return Status::OK();
137	4.57k	}
138
139	6.77M	Status TextReader::_deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice) {
140	6.77M	return serde->deserialize_one_cell_from_hive_text(column, slice, _options);
141	6.77M	}
142
143	4.55k	Status TextReader::_create_line_reader() {
144	4.55k	std::shared_ptr<TextLineReaderContextIf> text_line_reader_ctx;
145
146	4.55k	text_line_reader_ctx = std::make_shared<PlainTextLineReaderCtx>(_line_delimiter,
147	4.55k	_line_delimiter_length, false);
148
149	4.55k	_fields_splitter = std::make_unique<HiveTextFieldSplitter>(
150	4.55k	false, false, _value_separator, _value_separator_length, -1, _escape);
151
152	4.55k	_line_reader =
153	4.55k	NewPlainTextLineReader::create_unique(_profile, _file_reader, _decompressor.get(),
154	4.55k	text_line_reader_ctx, _size, _start_offset);
155
156	4.55k	return Status::OK();
157	4.55k	}
158
159	391k	Status TextReader::_validate_line(const Slice& line, bool* success) {
160		// text file do not need utf8 check
161	391k	*success = true;
162	391k	return Status::OK();
163	391k	}
164
165	1.61M	Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice) {
166	1.61M	auto& null_column = assert_cast<ColumnNullable&>(column);
167	1.61M	if (slice.compare(Slice(_options.null_format, _options.null_len)) == 0) {
168	1.04k	null_column.insert_data(nullptr, 0);
169	1.04k	return Status::OK();
170	1.04k	}
171	1.61M	static DataTypeStringSerDe stringSerDe(TYPE_STRING);
172	1.61M	auto st = stringSerDe.deserialize_one_cell_from_hive_text(null_column.get_nested_column(),
173	1.61M	slice, _options);
174	1.61M	if (!st.ok()) {
175		// fill null if fail
176	0	null_column.insert_data(nullptr, 0); // 0 is meaningless here
177	0	return Status::OK();
178	0	}
179		// fill not null if success
180	1.61M	null_column.get_null_map_data().push_back(0);
181	1.61M	return Status::OK();
182	1.61M	}
183
184		} // namespace doris

Coverage Report

Created: 2026-04-15 12:28