Coverage Report

Created: 2026-03-16 01:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format/native/native_reader.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "format/native/native_reader.h"
19
20
#include <gen_cpp/data.pb.h>
21
22
#include "core/block/block.h"
23
#include "format/native/native_format.h"
24
#include "io/file_factory.h"
25
#include "io/fs/buffered_reader.h"
26
#include "io/fs/file_reader.h"
27
#include "io/fs/tracing_file_reader.h"
28
#include "runtime/runtime_profile.h"
29
#include "runtime/runtime_state.h"
30
31
namespace doris {
32
33
#include "common/compile_check_begin.h"
34
35
// Constructs a reader for a single Doris Native file scan range.
//
// The constructor only records its arguments in the corresponding members; no file is
// opened and nothing is read here. The file is opened by init_reader().
//
//   profile - runtime profile handed to the file reader factory in init_reader().
//   params  - scan-level parameters (file type fallback, properties, hdfs params, brokers).
//   range   - the file range to read (path, optional size / fs name / mtime / file type).
//   io_ctx  - optional IO context; when non-null the file reader is wrapped for tracing.
//   state   - runtime state used to derive the file reader options.
//
// NOTE(review): whether `params` and `range` are copied or held by reference depends on the
// member declarations in the header, which are not visible here - confirm caller lifetimes.
NativeReader::NativeReader(RuntimeProfile* profile, const TFileScanRangeParams& params,
                           const TFileRangeDesc& range, io::IOContext* io_ctx,
                           RuntimeState* state)
        : _profile(profile),
          _scan_params(params),
          _scan_range(range),
          _io_ctx(io_ctx),
          _state(state) {}
42
43
9
// Releases the underlying file reader through close(). The Status returned by close() is
// discarded on purpose because a destructor has no way to report it.
NativeReader::~NativeReader() {
    static_cast<void>(close());
}
46
47
namespace {
48
49
// Verifies that `file_reader` refers to a Doris Native file and reports the offset of the
// first byte after the fixed-size file header.
//
// Expected file layout:
//   [magic bytes "DORISN1\0"][uint32_t format_version][uint64_t block_size]...
// Only the magic bytes and the format version are consumed here; the length-prefixed
// blocks that follow are left for the caller.
//
// Out-params (note that they are written even when an error is returned):
//   file_size      - size reported by `file_reader`.
//   current_offset - 0 on entry, sizeof(header) once the header has been accepted.
//   eof            - true if the file is empty or holds nothing after the header.
//
// Returns InternalError when the file is shorter than the header, the header cannot be
// read completely, the magic bytes do not match, or the version differs from
// DORIS_NATIVE_FORMAT_VERSION. Errors from read_at() are propagated unchanged.
//
// The reader is accepted by const reference so that a call does not copy the smart
// pointer; the function never needs to share ownership.
Status validate_and_consume_header(const io::FileReaderSPtr& file_reader,
                                   const TFileRangeDesc& range, int64_t* file_size,
                                   int64_t* current_offset, bool* eof) {
    *file_size = file_reader->size();
    *current_offset = 0;
    *eof = (*file_size == 0);

    static constexpr size_t HEADER_SIZE = sizeof(DORIS_NATIVE_MAGIC) + sizeof(uint32_t);
    // HEADER_SIZE is non-zero, so this comparison also rejects the empty-file case.
    if (*file_size < static_cast<int64_t>(HEADER_SIZE)) {
        return Status::InternalError(
                "invalid Doris Native file {}, file size {} is smaller than header size {}",
                range.path, *file_size, HEADER_SIZE);
    }

    char header[HEADER_SIZE];
    Slice header_slice(header, sizeof(header));
    size_t bytes_read = 0;
    RETURN_IF_ERROR(file_reader->read_at(0, header_slice, &bytes_read));
    if (bytes_read != sizeof(header)) {
        return Status::InternalError(
                "failed to read Doris Native header from file {}, expect {} bytes, got {} bytes",
                range.path, sizeof(header), bytes_read);
    }

    if (memcmp(header, DORIS_NATIVE_MAGIC, sizeof(DORIS_NATIVE_MAGIC)) != 0) {
        return Status::InternalError("invalid Doris Native magic header in file {}", range.path);
    }

    // memcpy instead of a pointer cast: the version field is not guaranteed to be aligned.
    // The value is interpreted in the host byte order.
    uint32_t version = 0;
    memcpy(&version, header + sizeof(DORIS_NATIVE_MAGIC), sizeof(uint32_t));
    if (version != DORIS_NATIVE_FORMAT_VERSION) {
        return Status::InternalError(
                "unsupported Doris Native format version {} in file {}, expect {}", version,
                range.path, DORIS_NATIVE_FORMAT_VERSION);
    }

    *current_offset = sizeof(header);
    *eof = (*file_size == *current_offset);
    return Status::OK();
}
91
92
} // namespace
93
94
88
// Opens the file described by the scan range and validates the Doris Native header.
//
// The call is idempotent: once `_file_reader` is set, later calls return OK immediately.
// On success `_file_size`, `_current_offset` and `_eof` describe the position right after
// the file header.
//
// `_file_reader` is published only after the header has been validated. If it were set
// before validation, a failed validation would leave a non-null reader behind; a later
// call would then take the early-return path and block reads would start at offset 0,
// i.e. inside the magic bytes. With the reader left unset, a retry re-runs the whole
// initialization and reports the same error again.
//
// Returns the error of the file reader factory or of the header validation.
Status NativeReader::init_reader() {
    if (_file_reader != nullptr) {
        return Status::OK();
    }

    // Create underlying file reader. For now we always use random access mode.
    io::FileSystemProperties system_properties;
    io::FileDescription file_description;
    // -1 stands for "size not provided by the scan range".
    file_description.file_size = -1;
    if (_scan_range.__isset.file_size) {
        file_description.file_size = _scan_range.file_size;
    }
    file_description.path = _scan_range.path;
    if (_scan_range.__isset.fs_name) {
        file_description.fs_name = _scan_range.fs_name;
    }
    if (_scan_range.__isset.modification_time) {
        file_description.mtime = _scan_range.modification_time;
    } else {
        file_description.mtime = 0;
    }

    if (_scan_range.__isset.file_type) {
        // For compatibility with older FE.
        system_properties.system_type = _scan_range.file_type;
    } else {
        system_properties.system_type = _scan_params.file_type;
    }
    system_properties.properties = _scan_params.properties;
    system_properties.hdfs_params = _scan_params.hdfs_params;
    if (_scan_params.__isset.broker_addresses) {
        system_properties.broker_addresses.assign(_scan_params.broker_addresses.begin(),
                                                  _scan_params.broker_addresses.end());
    }

    io::FileReaderOptions reader_options =
            FileFactory::get_reader_options(_state, file_description);
    auto reader_res = io::DelegateReader::create_file_reader(
            _profile, system_properties, file_description, reader_options,
            io::DelegateReader::AccessMode::RANDOM, _io_ctx);
    if (!reader_res.has_value()) {
        return reader_res.error();
    }

    // Keep the reader local until the header check has passed (see function comment).
    io::FileReaderSPtr file_reader = reader_res.value();
    if (_io_ctx) {
        // Wrap the reader so that reads are accounted in the IO context's reader stats.
        file_reader =
                std::make_shared<io::TracingFileReader>(file_reader, _io_ctx->file_reader_stats);
    }

    RETURN_IF_ERROR(validate_and_consume_header(file_reader, _scan_range, &_file_size,
                                                &_current_offset, &_eof));
    _file_reader = file_reader;
    return Status::OK();
}
148
149
91
// Reads the next serialized block from the file and deserializes it into `block`.
//
//   block     - destination; filled by Block::deserialize(). Afterwards every column that
//               is not nullable is wrapped into a nullable column/type.
//   read_rows - set to the number of rows in `block`, or 0 at end of file.
//   eof       - set to true only when no block was produced because the file is exhausted.
//               A call that returns the last block still reports false; the following
//               call reports true.
//
// If a schema query (get_columns / get_parsed_schema) already buffered the first block,
// that buffer is parsed in place instead of being copied, and it is released as soon as
// the block has been handed out, so it is not retained for the reader's lifetime.
//
// Returns InternalError for an empty block that is not at end of file or for a payload
// that cannot be parsed as PBlock; errors from init_reader(), _read_next_pblock(),
// schema initialization and Block::deserialize() are propagated.
Status NativeReader::get_next_block(Block* block, size_t* read_rows, bool* eof) {
    if (_eof) {
        *read_rows = 0;
        *eof = true;
        return Status::OK();
    }

    RETURN_IF_ERROR(init_reader());

    // True when the first block was loaded for schema probing and not yet returned.
    const bool use_probed_block = _first_block_loaded && !_first_block_consumed;

    std::string read_buf;
    bool local_eof = false;
    if (!use_probed_block) {
        RETURN_IF_ERROR(_read_next_pblock(&read_buf, &local_eof));
    }
    // Bind by reference: the probed buffer is used where it lives, without a copy.
    const std::string& payload = use_probed_block ? _first_block_buf : read_buf;

    if (payload.empty()) {
        // If we reach EOF and also read no data for this call, the whole file is
        // considered finished.
        if (local_eof) {
            *read_rows = 0;
            *eof = true;
            _eof = true;
            return Status::OK();
        }
        // If buffer is empty but we have not reached EOF yet, treat this as an error.
        return Status::InternalError("read empty native block from file {}", _scan_range.path);
    }

    PBlock pblock;
    if (!pblock.ParseFromArray(payload.data(), static_cast<int>(payload.size()))) {
        return Status::InternalError("Failed to parse native PBlock from file {}",
                                     _scan_range.path);
    }

    // Initialize schema from first block if not done yet.
    if (!_schema_inited) {
        RETURN_IF_ERROR(_init_schema_from_pblock(pblock));
    }

    size_t uncompressed_bytes = 0;
    int64_t decompress_time = 0;
    RETURN_IF_ERROR(block->deserialize(pblock, &uncompressed_bytes, &decompress_time));

    // For external file scan / TVF scenarios, unify all columns as nullable to match
    // GenericReader/SlotDescriptor convention. This ensures schema consistency when
    // some writers emit non-nullable columns.
    for (size_t i = 0; i < block->columns(); ++i) {
        auto& col_with_type = block->get_by_position(i);
        if (!col_with_type.type->is_nullable()) {
            col_with_type.column = make_nullable(col_with_type.column);
            col_with_type.type = make_nullable(col_with_type.type);
        }
    }

    *read_rows = block->rows();
    *eof = false;

    if (use_probed_block) {
        _first_block_consumed = true;
        // The schema is initialized at this point (either earlier or a few lines above),
        // so the buffered bytes are no longer needed by the schema queries. `payload`
        // must not be used past this point.
        _first_block_buf.clear();
        _first_block_buf.shrink_to_fit();
    }

    // If we reached the physical end of file, mark eof for subsequent calls.
    if (_current_offset >= _file_size) {
        _eof = true;
    }

    return Status::OK();
}
221
222
// Reports the file schema as a name -> type map.
//
//   name_to_type - receives one entry per schema column (existing entries with the same
//                  name are kept, since emplace does not overwrite).
//   missing_cols - always cleared; this reader never reports missing columns here.
//
// If the schema is not known yet, the first block of the file is read into
// `_first_block_buf` (unless already buffered) and its column metadata is used. The
// buffered block stays available so that get_next_block() can return it later.
//
// Returns EndOfFile when the file contains no block at all, InternalError when the first
// block is empty without reaching end of file or cannot be parsed as PBlock, and
// propagates errors from init_reader() / _read_next_pblock().
Status NativeReader::get_columns(std::unordered_map<std::string, DataTypePtr>* name_to_type,
                                 std::unordered_set<std::string>* missing_cols) {
    missing_cols->clear();
    RETURN_IF_ERROR(init_reader());

    if (!_schema_inited) {
        // Load first block lazily to initialize schema.
        if (!_first_block_loaded) {
            bool reached_end = false;
            RETURN_IF_ERROR(_read_next_pblock(&_first_block_buf, &reached_end));
            if (_first_block_buf.empty()) {
                // Treat file as empty only if we reach EOF and there is no block data
                // at all.
                if (reached_end) {
                    return Status::EndOfFile("empty native file {}", _scan_range.path);
                }
                // Non-EOF but empty buffer means corrupted native file.
                return Status::InternalError("first native block is empty {}", _scan_range.path);
            }
            _first_block_loaded = true;
        }

        PBlock first_pblock;
        const bool parsed = first_pblock.ParseFromArray(
                _first_block_buf.data(), static_cast<int>(_first_block_buf.size()));
        if (!parsed) {
            return Status::InternalError("Failed to parse native PBlock for schema from file {}",
                                         _scan_range.path);
        }
        RETURN_IF_ERROR(_init_schema_from_pblock(first_pblock));
    }

    const size_t num_cols = _schema_col_names.size();
    for (size_t idx = 0; idx != num_cols; ++idx) {
        name_to_type->emplace(_schema_col_names[idx], _schema_col_types[idx]);
    }
    return Status::OK();
}
257
258
0
// Prepares the reader for schema queries. This currently does nothing beyond
// init_reader(): it opens the file and validates the header, and returns that status.
Status NativeReader::init_schema_reader() {
    Status init_status = init_reader();
    if (!init_status.ok()) {
        return init_status;
    }
    return Status::OK();
}
262
263
// Reports the file schema as two parallel vectors.
//
//   col_names - overwritten with the schema column names, in file order.
//   col_types - overwritten with the matching (nullable) column types.
//
// If the schema is not known yet, the first block of the file is read into
// `_first_block_buf` (unless already buffered) and its column metadata is used. The
// buffered block stays available so that get_next_block() can return it later.
//
// Returns EndOfFile when the file contains no block at all, InternalError when the first
// block is empty without reaching end of file or cannot be parsed as PBlock, and
// propagates errors from init_reader() / _read_next_pblock().
Status NativeReader::get_parsed_schema(std::vector<std::string>* col_names,
                                       std::vector<DataTypePtr>* col_types) {
    RETURN_IF_ERROR(init_reader());

    if (!_schema_inited) {
        if (!_first_block_loaded) {
            bool hit_file_end = false;
            RETURN_IF_ERROR(_read_next_pblock(&_first_block_buf, &hit_file_end));
            const bool no_data = _first_block_buf.empty();
            // Treat file as empty only if we reach EOF and there is no block data at all.
            if (no_data && hit_file_end) {
                return Status::EndOfFile("empty native file {}", _scan_range.path);
            }
            // Non-EOF but empty buffer means corrupted native file.
            if (no_data) {
                return Status::InternalError("first native block is empty {}", _scan_range.path);
            }
            _first_block_loaded = true;
        }

        PBlock schema_pblock;
        const int payload_size = static_cast<int>(_first_block_buf.size());
        if (!schema_pblock.ParseFromArray(_first_block_buf.data(), payload_size)) {
            return Status::InternalError("Failed to parse native PBlock for schema from file {}",
                                         _scan_range.path);
        }
        RETURN_IF_ERROR(_init_schema_from_pblock(schema_pblock));
    }

    *col_names = _schema_col_names;
    *col_types = _schema_col_types;
    return Status::OK();
}
295
296
9
// Drops this reader's reference to the underlying file reader and always returns OK.
// No other state is touched: `_eof`, `_current_offset`, the cached schema and the
// buffered first block keep their values.
Status NativeReader::close() {
    _file_reader.reset();
    return Status::OK();
}
300
301
87
// Reads the next length-prefixed block payload starting at `_current_offset`.
//
// On-disk framing handled here: [uint64_t len][len bytes of serialized PBlock].
// The length is read as raw bytes into a uint64_t, i.e. in host byte order.
//
//   buff - cleared first; receives the payload bytes. Stays empty when there is no more
//          data or when the block length is 0.
//   eof  - set to true when `_current_offset` has reached `_file_size` (before or after
//          reading), or when the length prefix read returns 0 bytes.
//
// `_current_offset` is advanced past everything that was consumed.
//
// The length prefix comes from the file and is therefore untrusted: it is checked against
// the number of bytes left in the file before the buffer is allocated. Without that
// check a corrupt or truncated file could request an arbitrarily large allocation.
//
// Returns InternalError for a short read of the length prefix or the body, or for a
// length that exceeds the remaining file size; read_at() / init_reader() errors are
// propagated.
Status NativeReader::_read_next_pblock(std::string* buff, bool* eof) {
    *eof = false;
    buff->clear();

    if (_file_reader == nullptr) {
        RETURN_IF_ERROR(init_reader());
    }

    if (_current_offset >= _file_size) {
        *eof = true;
        return Status::OK();
    }

    uint64_t len = 0;
    Slice len_slice(reinterpret_cast<char*>(&len), sizeof(len));
    size_t bytes_read = 0;
    RETURN_IF_ERROR(_file_reader->read_at(_current_offset, len_slice, &bytes_read));
    if (bytes_read == 0) {
        *eof = true;
        return Status::OK();
    }
    if (bytes_read != sizeof(len)) {
        return Status::InternalError(
                "Failed to read native block length from file {}, expect {}, "
                "actual {}",
                _scan_range.path, sizeof(len), bytes_read);
    }

    _current_offset += sizeof(len);
    if (len == 0) {
        // Empty block, nothing to read.
        *eof = (_current_offset >= _file_size);
        return Status::OK();
    }

    // Reject a length that cannot fit into the rest of the file before allocating.
    const uint64_t remaining = (_current_offset < _file_size)
                                       ? static_cast<uint64_t>(_file_size - _current_offset)
                                       : 0;
    if (len > remaining) {
        return Status::InternalError(
                "invalid native block length {} in file {}, only {} bytes remain", len,
                _scan_range.path, remaining);
    }

    buff->assign(len, '\0');
    Slice data_slice(buff->data(), len);
    bytes_read = 0;
    RETURN_IF_ERROR(_file_reader->read_at(_current_offset, data_slice, &bytes_read));
    if (bytes_read != len) {
        return Status::InternalError(
                "Failed to read native block body from file {}, expect {}, "
                "actual {}",
                _scan_range.path, len, bytes_read);
    }

    _current_offset += len;
    *eof = (_current_offset >= _file_size);
    return Status::OK();
}
351
352
8
// Rebuilds the cached schema (`_schema_col_names` / `_schema_col_types`) from the column
// metadata of `pblock` and marks the schema as initialized.
//
// Every column type is wrapped with make_nullable(), so the cached types are nullable
// regardless of what the writer recorded. Previously cached entries are discarded.
// Always returns OK.
Status NativeReader::_init_schema_from_pblock(const PBlock& pblock) {
    _schema_col_names.clear();
    _schema_col_types.clear();

    for (const auto& col_meta : pblock.column_metas()) {
        DataTypePtr nullable_type =
                make_nullable(DataTypeFactory::instance().create_data_type(col_meta));
        VLOG_DEBUG << "init_schema_from_pblock, name=" << col_meta.name()
                   << ", type=" << nullable_type->get_name();
        _schema_col_names.emplace_back(col_meta.name());
        _schema_col_types.emplace_back(nullable_type);
    }

    _schema_inited = true;
    return Status::OK();
}
366
367
#include "common/compile_check_end.h"
368
369
} // namespace doris