/root/doris/be/src/exec/decompressor.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
#include "exec/decompressor.h"

#include <strings.h>

#include <algorithm>
#include <limits>
#include <memory>
#include <ostream>

#include "common/logging.h"
#include "common/status.h"
#include "gutil/endian.h"
#include "gutil/strings/substitute.h"
29 | | |
30 | | namespace doris { |
31 | | |
32 | | Status Decompressor::create_decompressor(CompressType type, |
33 | 0 | std::unique_ptr<Decompressor>* decompressor) { |
34 | 0 | switch (type) { |
35 | 0 | case CompressType::UNCOMPRESSED: Branch (35:5): [True: 0, False: 0]
|
36 | 0 | decompressor->reset(nullptr); |
37 | 0 | break; |
38 | 0 | case CompressType::GZIP: Branch (38:5): [True: 0, False: 0]
|
39 | 0 | decompressor->reset(new GzipDecompressor(false)); |
40 | 0 | break; |
41 | 0 | case CompressType::DEFLATE: Branch (41:5): [True: 0, False: 0]
|
42 | 0 | decompressor->reset(new GzipDecompressor(true)); |
43 | 0 | break; |
44 | 0 | case CompressType::BZIP2: Branch (44:5): [True: 0, False: 0]
|
45 | 0 | decompressor->reset(new Bzip2Decompressor()); |
46 | 0 | break; |
47 | 0 | case CompressType::ZSTD: Branch (47:5): [True: 0, False: 0]
|
48 | 0 | decompressor->reset(new ZstdDecompressor()); |
49 | 0 | break; |
50 | 0 | case CompressType::LZ4FRAME: Branch (50:5): [True: 0, False: 0]
|
51 | 0 | decompressor->reset(new Lz4FrameDecompressor()); |
52 | 0 | break; |
53 | 0 | case CompressType::LZ4BLOCK: Branch (53:5): [True: 0, False: 0]
|
54 | 0 | decompressor->reset(new Lz4BlockDecompressor()); |
55 | 0 | break; |
56 | 0 | case CompressType::SNAPPYBLOCK: Branch (56:5): [True: 0, False: 0]
|
57 | 0 | decompressor->reset(new SnappyBlockDecompressor()); |
58 | 0 | break; |
59 | 0 | case CompressType::LZOP: Branch (59:5): [True: 0, False: 0]
|
60 | 0 | decompressor->reset(new LzopDecompressor()); |
61 | 0 | break; |
62 | 0 | default: Branch (62:5): [True: 0, False: 0]
|
63 | 0 | return Status::InternalError("Unknown compress type: {}", type); |
64 | 0 | } |
65 | | |
66 | 0 | Status st = Status::OK(); |
67 | 0 | if (*decompressor != nullptr) { Branch (67:9): [True: 0, False: 0]
|
68 | 0 | st = (*decompressor)->init(); |
69 | 0 | } |
70 | |
|
71 | 0 | return st; |
72 | 0 | } |
73 | | |
74 | | Status Decompressor::create_decompressor(TFileCompressType::type type, |
75 | 0 | std::unique_ptr<Decompressor>* decompressor) { |
76 | 0 | CompressType compress_type; |
77 | 0 | switch (type) { |
78 | 0 | case TFileCompressType::PLAIN: Branch (78:5): [True: 0, False: 0]
|
79 | 0 | case TFileCompressType::UNKNOWN: Branch (79:5): [True: 0, False: 0]
|
80 | 0 | compress_type = CompressType::UNCOMPRESSED; |
81 | 0 | break; |
82 | 0 | case TFileCompressType::GZ: Branch (82:5): [True: 0, False: 0]
|
83 | 0 | compress_type = CompressType::GZIP; |
84 | 0 | break; |
85 | 0 | case TFileCompressType::LZO: Branch (85:5): [True: 0, False: 0]
|
86 | 0 | case TFileCompressType::LZOP: Branch (86:5): [True: 0, False: 0]
|
87 | 0 | compress_type = CompressType::LZOP; |
88 | 0 | break; |
89 | 0 | case TFileCompressType::BZ2: Branch (89:5): [True: 0, False: 0]
|
90 | 0 | compress_type = CompressType::BZIP2; |
91 | 0 | break; |
92 | 0 | case TFileCompressType::ZSTD: Branch (92:5): [True: 0, False: 0]
|
93 | 0 | compress_type = CompressType::ZSTD; |
94 | 0 | break; |
95 | 0 | case TFileCompressType::LZ4FRAME: Branch (95:5): [True: 0, False: 0]
|
96 | 0 | compress_type = CompressType::LZ4FRAME; |
97 | 0 | break; |
98 | 0 | case TFileCompressType::LZ4BLOCK: Branch (98:5): [True: 0, False: 0]
|
99 | 0 | compress_type = CompressType::LZ4BLOCK; |
100 | 0 | break; |
101 | 0 | case TFileCompressType::DEFLATE: Branch (101:5): [True: 0, False: 0]
|
102 | 0 | compress_type = CompressType::DEFLATE; |
103 | 0 | break; |
104 | 0 | case TFileCompressType::SNAPPYBLOCK: Branch (104:5): [True: 0, False: 0]
|
105 | 0 | compress_type = CompressType::SNAPPYBLOCK; |
106 | 0 | break; |
107 | 0 | default: Branch (107:5): [True: 0, False: 0]
|
108 | 0 | return Status::InternalError<false>("unknown compress type: {}", type); |
109 | 0 | } |
110 | 0 | RETURN_IF_ERROR(Decompressor::create_decompressor(compress_type, decompressor)); Line | Count | Source | 637 | 0 | do { \ | 638 | 0 | Status _status_ = (stmt); \ | 639 | 0 | if (UNLIKELY(!_status_.ok())) { \Line | Count | Source | 36 | 0 | #define UNLIKELY(expr) __builtin_expect(!!(expr), 0) Branch (36:24): [True: 0, False: 0]
|
| 640 | 0 | return _status_; \ | 641 | 0 | } \ | 642 | 0 | } while (false) Branch (642:14): [Folded - Ignored]
|
|
111 | | |
112 | 0 | return Status::OK(); |
113 | 0 | } |
114 | | |
115 | | Status Decompressor::create_decompressor(TFileFormatType::type type, |
116 | 0 | std::unique_ptr<Decompressor>* decompressor) { |
117 | 0 | CompressType compress_type; |
118 | 0 | switch (type) { |
119 | 0 | case TFileFormatType::FORMAT_PROTO: Branch (119:5): [True: 0, False: 0]
|
120 | 0 | [[fallthrough]]; |
121 | 0 | case TFileFormatType::FORMAT_CSV_PLAIN: Branch (121:5): [True: 0, False: 0]
|
122 | 0 | compress_type = CompressType::UNCOMPRESSED; |
123 | 0 | break; |
124 | 0 | case TFileFormatType::FORMAT_CSV_GZ: Branch (124:5): [True: 0, False: 0]
|
125 | 0 | compress_type = CompressType::GZIP; |
126 | 0 | break; |
127 | 0 | case TFileFormatType::FORMAT_CSV_BZ2: Branch (127:5): [True: 0, False: 0]
|
128 | 0 | compress_type = CompressType::BZIP2; |
129 | 0 | break; |
130 | 0 | case TFileFormatType::FORMAT_CSV_LZ4FRAME: Branch (130:5): [True: 0, False: 0]
|
131 | 0 | compress_type = CompressType::LZ4FRAME; |
132 | 0 | break; |
133 | 0 | case TFileFormatType::FORMAT_CSV_LZ4BLOCK: Branch (133:5): [True: 0, False: 0]
|
134 | 0 | compress_type = CompressType::LZ4BLOCK; |
135 | 0 | break; |
136 | 0 | case TFileFormatType::FORMAT_CSV_LZOP: Branch (136:5): [True: 0, False: 0]
|
137 | 0 | compress_type = CompressType::LZOP; |
138 | 0 | break; |
139 | 0 | case TFileFormatType::FORMAT_CSV_DEFLATE: Branch (139:5): [True: 0, False: 0]
|
140 | 0 | compress_type = CompressType::DEFLATE; |
141 | 0 | break; |
142 | 0 | case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: Branch (142:5): [True: 0, False: 0]
|
143 | 0 | compress_type = CompressType::SNAPPYBLOCK; |
144 | 0 | break; |
145 | 0 | default: Branch (145:5): [True: 0, False: 0]
|
146 | 0 | return Status::InternalError<false>("unknown compress type: {}", type); |
147 | 0 | } |
148 | 0 | RETURN_IF_ERROR(Decompressor::create_decompressor(compress_type, decompressor)); Line | Count | Source | 637 | 0 | do { \ | 638 | 0 | Status _status_ = (stmt); \ | 639 | 0 | if (UNLIKELY(!_status_.ok())) { \Line | Count | Source | 36 | 0 | #define UNLIKELY(expr) __builtin_expect(!!(expr), 0) Branch (36:24): [True: 0, False: 0]
|
| 640 | 0 | return _status_; \ | 641 | 0 | } \ | 642 | 0 | } while (false) Branch (642:14): [Folded - Ignored]
|
|
149 | | |
150 | 0 | return Status::OK(); |
151 | 0 | } |
152 | | |
153 | 0 | uint32_t Decompressor::_read_int32(uint8_t* buf) { |
154 | 0 | return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]; |
155 | 0 | } |
156 | | |
157 | 0 | std::string Decompressor::debug_info() { |
158 | 0 | return "Decompressor"; |
159 | 0 | } |
160 | | |
161 | | // Gzip |
162 | | GzipDecompressor::GzipDecompressor(bool is_deflate) |
163 | | : Decompressor(is_deflate ? CompressType::DEFLATE : CompressType::GZIP), |
164 | 0 | _is_deflate(is_deflate) {} |
165 | | |
166 | 0 | GzipDecompressor::~GzipDecompressor() { |
167 | 0 | (void)inflateEnd(&_z_strm); |
168 | 0 | } |
169 | | |
170 | 0 | Status GzipDecompressor::init() { |
171 | 0 | _z_strm = {}; |
172 | 0 | _z_strm.zalloc = Z_NULL; |
173 | 0 | _z_strm.zfree = Z_NULL; |
174 | 0 | _z_strm.opaque = Z_NULL; |
175 | |
|
176 | 0 | int window_bits = _is_deflate ? WINDOW_BITS : (WINDOW_BITS | DETECT_CODEC); Branch (176:23): [True: 0, False: 0]
|
177 | 0 | int ret = inflateInit2(&_z_strm, window_bits); |
178 | 0 | if (ret < 0) { Branch (178:9): [True: 0, False: 0]
|
179 | 0 | return Status::InternalError("Failed to do gzip decompress. status code: {}", ret); |
180 | 0 | } |
181 | | |
182 | 0 | return Status::OK(); |
183 | 0 | } |
184 | | |
185 | | Status GzipDecompressor::decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, |
186 | | uint8_t* output, size_t output_max_len, |
187 | | size_t* decompressed_len, bool* stream_end, |
188 | 0 | size_t* more_input_bytes, size_t* more_output_bytes) { |
189 | | // 1. set input and output |
190 | 0 | _z_strm.next_in = input; |
191 | 0 | _z_strm.avail_in = input_len; |
192 | 0 | _z_strm.next_out = output; |
193 | 0 | _z_strm.avail_out = output_max_len; |
194 | |
|
195 | 0 | while (_z_strm.avail_out > 0 && _z_strm.avail_in > 0) { Branch (195:12): [True: 0, False: 0]
Branch (195:37): [True: 0, False: 0]
|
196 | 0 | *stream_end = false; |
197 | | // inflate() performs one or both of the following actions: |
198 | | // Decompress more input starting at next_in and update next_in and avail_in |
199 | | // accordingly. |
200 | | // Provide more output starting at next_out and update next_out and avail_out |
201 | | // accordingly. |
202 | | // inflate() returns Z_OK if some progress has been made (more input processed |
203 | | // or more output produced) |
204 | |
|
205 | 0 | int ret = inflate(&_z_strm, Z_NO_FLUSH); |
206 | 0 | *input_bytes_read = input_len - _z_strm.avail_in; |
207 | 0 | *decompressed_len = output_max_len - _z_strm.avail_out; |
208 | |
|
209 | 0 | VLOG_TRACE << "gzip dec ret: " << ret << " input_bytes_read: " << *input_bytes_read Line | Count | Source | 40 | 0 | #define VLOG_TRACE VLOG(10) |
|
210 | 0 | << " decompressed_len: " << *decompressed_len; |
211 | |
|
212 | 0 | if (ret == Z_BUF_ERROR) { Branch (212:13): [True: 0, False: 0]
|
213 | | // Z_BUF_ERROR indicates that inflate() could not consume more input or |
214 | | // produce more output. inflate() can be called again with more output space |
215 | | // or more available input |
216 | | // ATTN: even if ret == Z_OK, decompressed_len may also be zero |
217 | 0 | return Status::OK(); |
218 | 0 | } else if (ret == Z_STREAM_END) { Branch (218:20): [True: 0, False: 0]
|
219 | 0 | *stream_end = true; |
220 | | // reset _z_strm to continue decoding a subsequent gzip stream |
221 | 0 | ret = inflateReset(&_z_strm); |
222 | 0 | if (ret != Z_OK) { Branch (222:17): [True: 0, False: 0]
|
223 | 0 | if (_is_deflate) { Branch (223:21): [True: 0, False: 0]
|
224 | 0 | return Status::InternalError("Failed to do deflate decompress. return code: {}", |
225 | 0 | ret); |
226 | 0 | } else { |
227 | 0 | return Status::InternalError("Failed to do gzip decompress. return code: {}", |
228 | 0 | ret); |
229 | 0 | } |
230 | 0 | } |
231 | 0 | } else if (ret != Z_OK) { Branch (231:20): [True: 0, False: 0]
|
232 | 0 | if (_is_deflate) { Branch (232:17): [True: 0, False: 0]
|
233 | 0 | return Status::InternalError("Failed to do deflate decompress. return code: {}", |
234 | 0 | ret); |
235 | 0 | } else { |
236 | 0 | return Status::InternalError("Failed to do gzip decompress. return code: {}", ret); |
237 | 0 | } |
238 | 0 | } else { |
239 | | // here ret must be Z_OK. |
240 | | // we continue if avail_out and avail_in > 0. |
241 | | // this means 'inflate' is not done yet. |
242 | 0 | } |
243 | 0 | } |
244 | | |
245 | 0 | return Status::OK(); |
246 | 0 | } |
247 | | |
248 | 0 | std::string GzipDecompressor::debug_info() { |
249 | 0 | std::stringstream ss; |
250 | 0 | ss << "GzipDecompressor." |
251 | 0 | << " is_deflate: " << _is_deflate; |
252 | 0 | return ss.str(); |
253 | 0 | } |
254 | | |
255 | | // Bzip2 |
256 | 0 | Bzip2Decompressor::~Bzip2Decompressor() { |
257 | 0 | BZ2_bzDecompressEnd(&_bz_strm); |
258 | 0 | } |
259 | | |
260 | 0 | Status Bzip2Decompressor::init() { |
261 | 0 | bzero(&_bz_strm, sizeof(_bz_strm)); |
262 | 0 | int ret = BZ2_bzDecompressInit(&_bz_strm, 0, 0); |
263 | 0 | if (ret != BZ_OK) { Branch (263:9): [True: 0, False: 0]
|
264 | 0 | return Status::InternalError("Failed to do bz2 decompress. status code: {}", ret); |
265 | 0 | } |
266 | | |
267 | 0 | return Status::OK(); |
268 | 0 | } |
269 | | |
270 | | Status Bzip2Decompressor::decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, |
271 | | uint8_t* output, size_t output_max_len, |
272 | | size_t* decompressed_len, bool* stream_end, |
273 | 0 | size_t* more_input_bytes, size_t* more_output_bytes) { |
274 | | // 1. set input and output |
275 | 0 | _bz_strm.next_in = const_cast<char*>(reinterpret_cast<const char*>(input)); |
276 | 0 | _bz_strm.avail_in = input_len; |
277 | 0 | _bz_strm.next_out = reinterpret_cast<char*>(output); |
278 | 0 | _bz_strm.avail_out = output_max_len; |
279 | |
|
280 | 0 | while (_bz_strm.avail_out > 0 && _bz_strm.avail_in > 0) { Branch (280:12): [True: 0, False: 0]
Branch (280:38): [True: 0, False: 0]
|
281 | 0 | *stream_end = false; |
282 | | // decompress |
283 | 0 | int ret = BZ2_bzDecompress(&_bz_strm); |
284 | 0 | *input_bytes_read = input_len - _bz_strm.avail_in; |
285 | 0 | *decompressed_len = output_max_len - _bz_strm.avail_out; |
286 | |
|
287 | 0 | if (ret == BZ_DATA_ERROR || ret == BZ_DATA_ERROR_MAGIC) { Branch (287:13): [True: 0, False: 0]
Branch (287:37): [True: 0, False: 0]
|
288 | 0 | LOG(INFO) << "input_bytes_read: " << *input_bytes_read |
289 | 0 | << " decompressed_len: " << *decompressed_len; |
290 | 0 | return Status::InternalError("Failed to do bz2 decompress. status code: {}", ret); |
291 | 0 | } else if (ret == BZ_STREAM_END) { Branch (291:20): [True: 0, False: 0]
|
292 | 0 | *stream_end = true; |
293 | 0 | ret = BZ2_bzDecompressEnd(&_bz_strm); |
294 | 0 | if (ret != BZ_OK) { Branch (294:17): [True: 0, False: 0]
|
295 | 0 | return Status::InternalError("Failed to do bz2 decompress. status code: {}", ret); |
296 | 0 | } |
297 | | |
298 | 0 | ret = BZ2_bzDecompressInit(&_bz_strm, 0, 0); |
299 | 0 | if (ret != BZ_OK) { Branch (299:17): [True: 0, False: 0]
|
300 | 0 | return Status::InternalError("Failed to do bz2 decompress. status code: {}", ret); |
301 | 0 | } |
302 | 0 | } else if (ret != BZ_OK) { Branch (302:20): [True: 0, False: 0]
|
303 | 0 | return Status::InternalError("Failed to bz2 decompress. status code: {}", ret); |
304 | 0 | } else { |
305 | | // continue |
306 | 0 | } |
307 | 0 | } |
308 | | |
309 | 0 | return Status::OK(); |
310 | 0 | } |
311 | | |
312 | 0 | std::string Bzip2Decompressor::debug_info() { |
313 | 0 | std::stringstream ss; |
314 | 0 | ss << "Bzip2Decompressor."; |
315 | 0 | return ss.str(); |
316 | 0 | } |
317 | | |
318 | 0 | ZstdDecompressor::~ZstdDecompressor() { |
319 | 0 | ZSTD_freeDStream(_zstd_strm); |
320 | 0 | } |
321 | | |
322 | 0 | Status ZstdDecompressor::init() { |
323 | 0 | _zstd_strm = ZSTD_createDStream(); |
324 | 0 | if (!_zstd_strm) { Branch (324:9): [True: 0, False: 0]
|
325 | 0 | std::stringstream ss; |
326 | 0 | return Status::InternalError("ZSTD_dctx creation error"); |
327 | 0 | } |
328 | 0 | auto ret = ZSTD_initDStream(_zstd_strm); |
329 | 0 | if (ZSTD_isError(ret)) { Branch (329:9): [True: 0, False: 0]
|
330 | 0 | return Status::InternalError("ZSTD_initDStream error: {}", ZSTD_getErrorName(ret)); |
331 | 0 | } |
332 | 0 | return Status::OK(); |
333 | 0 | } |
334 | | |
335 | | Status ZstdDecompressor::decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, |
336 | | uint8_t* output, size_t output_max_len, |
337 | | size_t* decompressed_len, bool* stream_end, |
338 | 0 | size_t* more_input_bytes, size_t* more_output_bytes) { |
339 | | // 1. set input and output |
340 | 0 | ZSTD_inBuffer inputBuffer = {input, input_len, 0}; |
341 | 0 | ZSTD_outBuffer outputBuffer = {output, output_max_len, 0}; |
342 | | |
343 | | // decompress |
344 | 0 | int ret = ZSTD_decompressStream(_zstd_strm, &outputBuffer, &inputBuffer); |
345 | 0 | *input_bytes_read = inputBuffer.pos; |
346 | 0 | *decompressed_len = outputBuffer.pos; |
347 | |
|
348 | 0 | if (ZSTD_isError(ret)) { Branch (348:9): [True: 0, False: 0]
|
349 | 0 | return Status::InternalError("Failed to do zstd decompress: {}", ZSTD_getErrorName(ret)); |
350 | 0 | } |
351 | | |
352 | 0 | *stream_end = ret == 0; |
353 | 0 | return Status::OK(); |
354 | 0 | } |
355 | | |
356 | 0 | std::string ZstdDecompressor::debug_info() { |
357 | 0 | std::stringstream ss; |
358 | 0 | ss << "ZstdDecompressor."; |
359 | 0 | return ss.str(); |
360 | 0 | } |
361 | | |
362 | | // Lz4Frame |
363 | | // Lz4 version: 1.7.5 |
364 | | // define LZ4F_VERSION = 100 |
365 | | const unsigned Lz4FrameDecompressor::DORIS_LZ4F_VERSION = 100; |
366 | | |
367 | 0 | Lz4FrameDecompressor::~Lz4FrameDecompressor() { |
368 | 0 | LZ4F_freeDecompressionContext(_dctx); |
369 | 0 | } |
370 | | |
371 | 0 | Status Lz4FrameDecompressor::init() { |
372 | 0 | size_t ret = LZ4F_createDecompressionContext(&_dctx, DORIS_LZ4F_VERSION); |
373 | 0 | if (LZ4F_isError(ret)) { Branch (373:9): [True: 0, False: 0]
|
374 | 0 | std::stringstream ss; |
375 | 0 | ss << "LZ4F_dctx creation error: " << std::string(LZ4F_getErrorName(ret)); |
376 | 0 | return Status::InternalError(ss.str()); |
377 | 0 | } |
378 | | |
379 | | // init as -1 |
380 | 0 | _expect_dec_buf_size = -1; |
381 | |
|
382 | 0 | return Status::OK(); |
383 | 0 | } |
384 | | |
385 | | Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, |
386 | | uint8_t* output, size_t output_max_len, |
387 | | size_t* decompressed_len, bool* stream_end, |
388 | 0 | size_t* more_input_bytes, size_t* more_output_bytes) { |
389 | 0 | uint8_t* src = input; |
390 | 0 | size_t remaining_input_size = input_len; |
391 | 0 | size_t ret = 1; |
392 | 0 | *input_bytes_read = 0; |
393 | |
|
394 | 0 | if (_expect_dec_buf_size == -1) { Branch (394:9): [True: 0, False: 0]
|
395 | | // init expected decompress buf size, and check if output_max_len is large enough |
396 | | // ATTN: _expect_dec_buf_size is uninit, which means this is the first time to call |
397 | | // decompress(), so *input* should point to the head of the compressed file, |
398 | | // where lz4 header section is there. |
399 | |
|
400 | 0 | if (input_len < 15) { Branch (400:13): [True: 0, False: 0]
|
401 | 0 | return Status::InternalError( |
402 | 0 | "Lz4 header size is between 7 and 15 bytes. " |
403 | 0 | "but input size is only: {}", |
404 | 0 | input_len); |
405 | 0 | } |
406 | | |
407 | 0 | LZ4F_frameInfo_t info; |
408 | 0 | ret = LZ4F_getFrameInfo(_dctx, &info, (void*)src, &remaining_input_size); |
409 | 0 | if (LZ4F_isError(ret)) { Branch (409:13): [True: 0, False: 0]
|
410 | 0 | return Status::InternalError("LZ4F_getFrameInfo error: {}", |
411 | 0 | std::string(LZ4F_getErrorName(ret))); |
412 | 0 | } |
413 | | |
414 | 0 | _expect_dec_buf_size = get_block_size(&info); |
415 | 0 | if (_expect_dec_buf_size == -1) { Branch (415:13): [True: 0, False: 0]
|
416 | 0 | return Status::InternalError( |
417 | 0 | "Impossible lz4 block size unless more block sizes are allowed {}", |
418 | 0 | std::string(LZ4F_getErrorName(ret))); |
419 | 0 | } |
420 | | |
421 | 0 | *input_bytes_read = remaining_input_size; |
422 | |
|
423 | 0 | src += remaining_input_size; |
424 | 0 | remaining_input_size = input_len - remaining_input_size; |
425 | |
|
426 | 0 | LOG(INFO) << "lz4 block size: " << _expect_dec_buf_size; |
427 | 0 | } |
428 | | |
429 | | // decompress |
430 | 0 | size_t output_len = output_max_len; |
431 | 0 | ret = LZ4F_decompress(_dctx, (void*)output, &output_len, (void*)src, &remaining_input_size, |
432 | 0 | /* LZ4F_decompressOptions_t */ nullptr); |
433 | 0 | if (LZ4F_isError(ret)) { Branch (433:9): [True: 0, False: 0]
|
434 | 0 | return Status::InternalError("Decompression error: {}", |
435 | 0 | std::string(LZ4F_getErrorName(ret))); |
436 | 0 | } |
437 | | |
438 | | // update |
439 | 0 | *input_bytes_read += remaining_input_size; |
440 | 0 | *decompressed_len = output_len; |
441 | 0 | if (ret == 0) { Branch (441:9): [True: 0, False: 0]
|
442 | 0 | *stream_end = true; |
443 | 0 | } else { |
444 | 0 | *stream_end = false; |
445 | 0 | } |
446 | |
|
447 | 0 | return Status::OK(); |
448 | 0 | } |
449 | | |
450 | 0 | std::string Lz4FrameDecompressor::debug_info() { |
451 | 0 | std::stringstream ss; |
452 | 0 | ss << "Lz4FrameDecompressor." |
453 | 0 | << " expect dec buf size: " << _expect_dec_buf_size |
454 | 0 | << " Lz4 Frame Version: " << DORIS_LZ4F_VERSION; |
455 | 0 | return ss.str(); |
456 | 0 | } |
457 | | |
458 | 0 | size_t Lz4FrameDecompressor::get_block_size(const LZ4F_frameInfo_t* info) { |
459 | 0 | switch (info->blockSizeID) { |
460 | 0 | case LZ4F_default: Branch (460:5): [True: 0, False: 0]
|
461 | 0 | case LZ4F_max64KB: Branch (461:5): [True: 0, False: 0]
|
462 | 0 | return 1 << 16; |
463 | 0 | case LZ4F_max256KB: Branch (463:5): [True: 0, False: 0]
|
464 | 0 | return 1 << 18; |
465 | 0 | case LZ4F_max1MB: Branch (465:5): [True: 0, False: 0]
|
466 | 0 | return 1 << 20; |
467 | 0 | case LZ4F_max4MB: Branch (467:5): [True: 0, False: 0]
|
468 | 0 | return 1 << 22; |
469 | 0 | default: Branch (469:5): [True: 0, False: 0]
|
470 | | // error |
471 | 0 | return -1; |
472 | 0 | } |
473 | 0 | } |
474 | | |
475 | | /// Lz4BlockDecompressor |
476 | 0 | Status Lz4BlockDecompressor::init() { |
477 | 0 | return Status::OK(); |
478 | 0 | } |
479 | | |
480 | | // Hadoop lz4codec source : |
481 | | // https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/Lz4Codec.cc |
482 | | // Example: |
483 | | // OriginData(The original data will be divided into several large data block.) : |
484 | | // large data block1 | large data block2 | large data block3 | .... |
485 | | // The large data block will be divided into several small data block. |
486 | | // Suppose a large data block is divided into three small blocks: |
487 | | // large data block1: | small block1 | small block2 | small block3 | |
488 | | // CompressData: <A [B1 compress(small block1) ] [B2 compress(small block1) ] [B3 compress(small block1)]> |
489 | | // |
490 | | // A : original length of the current block of large data block. |
491 | | // sizeof(A) = 4 bytes. |
492 | | // A = length(small block1) + length(small block2) + length(small block3) |
493 | | // Bx : length of small data block bx. |
494 | | // sizeof(Bx) = 4 bytes. |
495 | | // Bx = length(compress(small blockx)) |
496 | | Status Lz4BlockDecompressor::decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, |
497 | | uint8_t* output, size_t output_max_len, |
498 | | size_t* decompressed_len, bool* stream_end, |
499 | 0 | size_t* more_input_bytes, size_t* more_output_bytes) { |
500 | 0 | auto* input_ptr = input; |
501 | 0 | auto* output_ptr = output; |
502 | |
|
503 | 0 | while (input_len > 0) { Branch (503:12): [True: 0, False: 0]
|
504 | 0 | if (input_len < sizeof(uint32_t)) { Branch (504:13): [True: 0, False: 0]
|
505 | 0 | *more_input_bytes = sizeof(uint32_t) - input_len; |
506 | 0 | break; |
507 | 0 | } |
508 | | |
509 | | //if faild, fall back to large block begin |
510 | 0 | auto* large_block_input_ptr = input_ptr; |
511 | 0 | auto* large_block_output_ptr = output_ptr; |
512 | |
|
513 | 0 | uint32_t remaining_decompressed_large_block_len = BigEndian::Load32(input_ptr); |
514 | |
|
515 | 0 | input_ptr += sizeof(uint32_t); |
516 | 0 | input_len -= sizeof(uint32_t); |
517 | |
|
518 | 0 | std::size_t remaining_output_len = output_max_len - *decompressed_len; |
519 | |
|
520 | 0 | if (remaining_output_len < remaining_decompressed_large_block_len) { Branch (520:13): [True: 0, False: 0]
|
521 | | // Need more output buffer |
522 | 0 | *more_output_bytes = remaining_decompressed_large_block_len - remaining_output_len; |
523 | 0 | input_ptr = large_block_input_ptr; |
524 | 0 | output_ptr = large_block_output_ptr; |
525 | |
|
526 | 0 | break; |
527 | 0 | } |
528 | | |
529 | 0 | std::size_t decompressed_large_block_len = 0; |
530 | 0 | while (remaining_decompressed_large_block_len > 0) { Branch (530:16): [True: 0, False: 0]
|
531 | | // Check that input length should not be negative. |
532 | 0 | if (input_len < sizeof(uint32_t)) { Branch (532:17): [True: 0, False: 0]
|
533 | 0 | *more_input_bytes = sizeof(uint32_t) - input_len; |
534 | 0 | break; |
535 | 0 | } |
536 | | |
537 | | // Read the length of the next lz4 compressed block. |
538 | 0 | size_t compressed_small_block_len = BigEndian::Load32(input_ptr); |
539 | |
|
540 | 0 | input_ptr += sizeof(uint32_t); |
541 | 0 | input_len -= sizeof(uint32_t); |
542 | |
|
543 | 0 | if (compressed_small_block_len == 0) { Branch (543:17): [True: 0, False: 0]
|
544 | 0 | continue; |
545 | 0 | } |
546 | | |
547 | 0 | if (compressed_small_block_len > input_len) { Branch (547:17): [True: 0, False: 0]
|
548 | | // Need more input buffer |
549 | 0 | *more_input_bytes = compressed_small_block_len - input_len; |
550 | 0 | break; |
551 | 0 | } |
552 | | |
553 | | // Decompress this block. |
554 | 0 | auto decompressed_small_block_len = LZ4_decompress_safe( |
555 | 0 | reinterpret_cast<const char*>(input_ptr), reinterpret_cast<char*>(output_ptr), |
556 | 0 | compressed_small_block_len, remaining_output_len); |
557 | 0 | if (decompressed_small_block_len < 0) { Branch (557:17): [True: 0, False: 0]
|
558 | 0 | return Status::InvalidArgument("Failed to do Lz4Block decompress, error = {}", |
559 | 0 | LZ4F_getErrorName(decompressed_small_block_len)); |
560 | 0 | } |
561 | 0 | input_ptr += compressed_small_block_len; |
562 | 0 | input_len -= compressed_small_block_len; |
563 | |
|
564 | 0 | output_ptr += decompressed_small_block_len; |
565 | 0 | remaining_decompressed_large_block_len -= decompressed_small_block_len; |
566 | 0 | decompressed_large_block_len += decompressed_small_block_len; |
567 | 0 | }; |
568 | |
|
569 | 0 | if (*more_input_bytes != 0) { Branch (569:13): [True: 0, False: 0]
|
570 | | // Need more input buffer |
571 | 0 | input_ptr = large_block_input_ptr; |
572 | 0 | output_ptr = large_block_output_ptr; |
573 | 0 | break; |
574 | 0 | } |
575 | | |
576 | 0 | *decompressed_len += decompressed_large_block_len; |
577 | 0 | } |
578 | 0 | *input_bytes_read += (input_ptr - input); |
579 | | // If no more input and output need, means this is the end of a compressed block |
580 | 0 | *stream_end = (*more_input_bytes == 0 && *more_output_bytes == 0); Branch (580:20): [True: 0, False: 0]
Branch (580:46): [True: 0, False: 0]
|
581 | |
|
582 | 0 | return Status::OK(); |
583 | 0 | } |
584 | | |
585 | 0 | std::string Lz4BlockDecompressor::debug_info() { |
586 | 0 | std::stringstream ss; |
587 | 0 | ss << "Lz4BlockDecompressor."; |
588 | 0 | return ss.str(); |
589 | 0 | } |
590 | | |
591 | | /// SnappyBlockDecompressor |
592 | 0 | Status SnappyBlockDecompressor::init() { |
593 | 0 | return Status::OK(); |
594 | 0 | } |
595 | | |
596 | | // Hadoop snappycodec source : |
597 | | // https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/SnappyCodec.cc |
598 | | // Example: |
599 | | // OriginData(The original data will be divided into several large data block.) : |
600 | | // large data block1 | large data block2 | large data block3 | .... |
601 | | // The large data block will be divided into several small data block. |
602 | | // Suppose a large data block is divided into three small blocks: |
603 | | // large data block1: | small block1 | small block2 | small block3 | |
604 | | // CompressData: <A [B1 compress(small block1) ] [B2 compress(small block1) ] [B3 compress(small block1)]> |
605 | | // |
606 | | // A : original length of the current block of large data block. |
607 | | // sizeof(A) = 4 bytes. |
608 | | // A = length(small block1) + length(small block2) + length(small block3) |
609 | | // Bx : length of small data block bx. |
610 | | // sizeof(Bx) = 4 bytes. |
611 | | // Bx = length(compress(small blockx)) |
612 | | Status SnappyBlockDecompressor::decompress(uint8_t* input, size_t input_len, |
613 | | size_t* input_bytes_read, uint8_t* output, |
614 | | size_t output_max_len, size_t* decompressed_len, |
615 | | bool* stream_end, size_t* more_input_bytes, |
616 | 0 | size_t* more_output_bytes) { |
617 | 0 | auto* input_ptr = input; |
618 | 0 | auto* output_ptr = output; |
619 | |
|
620 | 0 | while (input_len > 0) { Branch (620:12): [True: 0, False: 0]
|
621 | 0 | if (input_len < sizeof(uint32_t)) { Branch (621:13): [True: 0, False: 0]
|
622 | 0 | *more_input_bytes = sizeof(uint32_t) - input_len; |
623 | 0 | break; |
624 | 0 | } |
625 | | |
626 | | //if faild, fall back to large block begin |
627 | 0 | auto* large_block_input_ptr = input_ptr; |
628 | 0 | auto* large_block_output_ptr = output_ptr; |
629 | |
|
630 | 0 | uint32_t remaining_decompressed_large_block_len = BigEndian::Load32(input_ptr); |
631 | |
|
632 | 0 | input_ptr += sizeof(uint32_t); |
633 | 0 | input_len -= sizeof(uint32_t); |
634 | |
|
635 | 0 | std::size_t remaining_output_len = output_max_len - *decompressed_len; |
636 | |
|
637 | 0 | if (remaining_output_len < remaining_decompressed_large_block_len) { Branch (637:13): [True: 0, False: 0]
|
638 | | // Need more output buffer |
639 | 0 | *more_output_bytes = remaining_decompressed_large_block_len - remaining_output_len; |
640 | 0 | input_ptr = large_block_input_ptr; |
641 | 0 | output_ptr = large_block_output_ptr; |
642 | |
|
643 | 0 | break; |
644 | 0 | } |
645 | | |
646 | 0 | std::size_t decompressed_large_block_len = 0; |
647 | 0 | while (remaining_decompressed_large_block_len > 0) { Branch (647:16): [True: 0, False: 0]
|
648 | | // Check that input length should not be negative. |
649 | 0 | if (input_len < sizeof(uint32_t)) { Branch (649:17): [True: 0, False: 0]
|
650 | 0 | *more_input_bytes = sizeof(uint32_t) - input_len; |
651 | 0 | break; |
652 | 0 | } |
653 | | |
654 | | // Read the length of the next snappy compressed block. |
655 | 0 | size_t compressed_small_block_len = BigEndian::Load32(input_ptr); |
656 | |
|
657 | 0 | input_ptr += sizeof(uint32_t); |
658 | 0 | input_len -= sizeof(uint32_t); |
659 | |
|
660 | 0 | if (compressed_small_block_len == 0) { Branch (660:17): [True: 0, False: 0]
|
661 | 0 | continue; |
662 | 0 | } |
663 | | |
664 | 0 | if (compressed_small_block_len > input_len) { Branch (664:17): [True: 0, False: 0]
|
665 | | // Need more input buffer |
666 | 0 | *more_input_bytes = compressed_small_block_len - input_len; |
667 | 0 | break; |
668 | 0 | } |
669 | | |
670 | | // Decompress this block. |
671 | 0 | size_t decompressed_small_block_len; |
672 | 0 | if (!snappy::GetUncompressedLength(reinterpret_cast<const char*>(input_ptr), Branch (672:17): [True: 0, False: 0]
|
673 | 0 | compressed_small_block_len, |
674 | 0 | &decompressed_small_block_len)) { |
675 | 0 | return Status::InternalError("Failed to do snappy decompress."); |
676 | 0 | } |
677 | 0 | if (!snappy::RawUncompress(reinterpret_cast<const char*>(input_ptr), Branch (677:17): [True: 0, False: 0]
|
678 | 0 | compressed_small_block_len, |
679 | 0 | reinterpret_cast<char*>(output_ptr))) { |
680 | 0 | return Status::InternalError( |
681 | 0 | "Failed to do snappy decompress. uncompressed_len: {}, compressed_len: {}", |
682 | 0 | decompressed_small_block_len, compressed_small_block_len); |
683 | 0 | } |
684 | 0 | input_ptr += compressed_small_block_len; |
685 | 0 | input_len -= compressed_small_block_len; |
686 | |
|
687 | 0 | output_ptr += decompressed_small_block_len; |
688 | 0 | remaining_decompressed_large_block_len -= decompressed_small_block_len; |
689 | 0 | decompressed_large_block_len += decompressed_small_block_len; |
690 | 0 | }; |
691 | |
|
692 | 0 | if (*more_input_bytes != 0) { Branch (692:13): [True: 0, False: 0]
|
693 | | // Need more input buffer |
694 | 0 | input_ptr = large_block_input_ptr; |
695 | 0 | output_ptr = large_block_output_ptr; |
696 | 0 | break; |
697 | 0 | } |
698 | | |
699 | 0 | *decompressed_len += decompressed_large_block_len; |
700 | 0 | } |
701 | 0 | *input_bytes_read += (input_ptr - input); |
702 | | // If no more input and output need, means this is the end of a compressed block |
703 | 0 | *stream_end = (*more_input_bytes == 0 && *more_output_bytes == 0); Branch (703:20): [True: 0, False: 0]
Branch (703:46): [True: 0, False: 0]
|
704 | |
|
705 | 0 | return Status::OK(); |
706 | 0 | } |
707 | | |
708 | 0 | std::string SnappyBlockDecompressor::debug_info() { |
709 | 0 | std::stringstream ss; |
710 | 0 | ss << "SnappyBlockDecompressor."; |
711 | 0 | return ss.str(); |
712 | 0 | } |
713 | | |
714 | | } // namespace doris |