Coverage Report

Created: 2026-03-13 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_compress.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
#include <glog/logging.h>
18
19
#include <array>
20
#include <cctype>
21
#include <cstddef>
22
#include <cstring>
23
#include <functional>
24
#include <memory>
25
#include <string>
26
#include <utility>
27
28
#include "common/status.h"
29
#include "core/assert_cast.h"
30
#include "core/block/block.h"
31
#include "core/block/column_numbers.h"
32
#include "core/block/column_with_type_and_name.h"
33
#include "core/column/column.h"
34
#include "core/column/column_nullable.h"
35
#include "core/column/column_string.h"
36
#include "core/column/column_vector.h"
37
#include "core/data_type/data_type.h"
38
#include "core/data_type/data_type_nullable.h"
39
#include "core/data_type/data_type_number.h"
40
#include "core/data_type/data_type_string.h"
41
#include "core/types.h"
42
#include "exprs/aggregate/aggregate_function.h"
43
#include "exprs/function/function.h"
44
#include "exprs/function/simple_function_factory.h"
45
#include "util/block_compression.h"
46
#include "util/faststring.h"
47
48
namespace doris {
49
class FunctionContext;
50
} // namespace doris
51
52
namespace doris {
53
#include "common/compile_check_begin.h"
54
// Size in bytes of the header that compress() writes in front of every non-empty
// compressed value and that uncompress() reads back. The header holds the length
// of the original (uncompressed) string, copied byte-for-byte from a uint32_t.
static constexpr int COMPRESS_STR_LENGTH = 4;
55
56
class FunctionCompress : public IFunction {
57
public:
58
    static constexpr auto name = "compress";
59
28
    static FunctionPtr create() { return std::make_shared<FunctionCompress>(); }
60
61
1
    String get_name() const override { return name; }
62
63
19
    size_t get_number_of_arguments() const override { return 1; }
64
65
19
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
66
19
        return std::make_shared<DataTypeString>();
67
19
    }
68
69
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
70
22
                        uint32_t result, size_t input_rows_count) const override {
71
        // Get the compression algorithm object
72
22
        BlockCompressionCodec* compression_codec;
73
22
        RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::ZLIB,
74
22
                                                    &compression_codec));
75
76
22
        const auto& arg_column =
77
22
                assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column);
78
22
        auto result_column = ColumnString::create();
79
80
22
        auto& arg_data = arg_column.get_chars();
81
22
        auto& arg_offset = arg_column.get_offsets();
82
22
        const char* arg_begin = reinterpret_cast<const char*>(arg_data.data());
83
84
22
        auto& col_data = result_column->get_chars();
85
22
        auto& col_offset = result_column->get_offsets();
86
22
        col_offset.resize(input_rows_count);
87
88
22
        faststring compressed_str;
89
22
        Slice data;
90
91
        // When the original string is large, the result is roughly this value
92
22
        size_t total = arg_offset[input_rows_count - 1];
93
22
        col_data.reserve(total / 1000);
94
95
45
        for (size_t row = 0; row < input_rows_count; row++) {
96
23
            uint32_t length = arg_offset[row] - arg_offset[row - 1];
97
23
            data = Slice(arg_begin + arg_offset[row - 1], length);
98
99
23
            size_t idx = col_data.size();
100
23
            if (!length) { // data is ''
101
7
                col_offset[row] = col_offset[row - 1];
102
7
                continue;
103
7
            }
104
105
            // Z_MEM_ERROR and Z_BUF_ERROR are already handled in compress, making sure st is always Z_OK
106
16
            RETURN_IF_ERROR(compression_codec->compress(data, &compressed_str));
107
16
            col_data.resize(col_data.size() + COMPRESS_STR_LENGTH + compressed_str.size());
108
109
16
            std::memcpy(col_data.data() + idx, &length, sizeof(length));
110
16
            idx += COMPRESS_STR_LENGTH;
111
112
            // The length of compress_str is not known in advance, so it cannot be compressed directly into col_data
113
16
            unsigned char* src = compressed_str.data();
114
295
            for (size_t i = 0; i < compressed_str.size(); idx++, i++, src++) {
115
279
                col_data[idx] = *src;
116
279
            }
117
16
            col_offset[row] =
118
16
                    col_offset[row - 1] + COMPRESS_STR_LENGTH + (int)compressed_str.size();
119
16
        }
120
121
22
        block.replace_by_position(result, std::move(result_column));
122
22
        return Status::OK();
123
22
    }
124
};
125
126
class FunctionUncompress : public IFunction {
127
public:
128
    static constexpr auto name = "uncompress";
129
27
    static FunctionPtr create() { return std::make_shared<FunctionUncompress>(); }
130
131
1
    String get_name() const override { return name; }
132
133
18
    size_t get_number_of_arguments() const override { return 1; }
134
135
18
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
136
18
        return make_nullable(std::make_shared<DataTypeString>());
137
18
    }
138
139
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
140
24
                        uint32_t result, size_t input_rows_count) const override {
141
        // Get the compression algorithm object
142
24
        BlockCompressionCodec* compression_codec;
143
24
        RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::ZLIB,
144
24
                                                    &compression_codec));
145
146
24
        const auto& arg_column =
147
24
                assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column);
148
149
24
        auto& arg_data = arg_column.get_chars();
150
24
        auto& arg_offset = arg_column.get_offsets();
151
24
        const char* arg_begin = reinterpret_cast<const char*>(arg_data.data());
152
153
24
        auto result_column = ColumnString::create();
154
24
        auto& col_data = result_column->get_chars();
155
24
        auto& col_offset = result_column->get_offsets();
156
24
        col_offset.resize(input_rows_count);
157
158
24
        auto null_column = ColumnUInt8::create(input_rows_count);
159
24
        auto& null_map = null_column->get_data();
160
161
24
        std::string uncompressed;
162
24
        Slice data;
163
24
        Slice uncompressed_slice;
164
165
24
        size_t total = arg_offset[input_rows_count - 1];
166
24
        col_data.reserve(total * 1000);
167
168
49
        for (size_t row = 0; row < input_rows_count; row++) {
169
25
            null_map[row] = false;
170
25
            data = Slice(arg_begin + arg_offset[row - 1], arg_offset[row] - arg_offset[row - 1]);
171
25
            size_t data_length = arg_offset[row] - arg_offset[row - 1];
172
173
25
            if (data_length == 0) { // The original data is ''
174
5
                col_offset[row] = col_offset[row - 1];
175
5
                continue;
176
5
            }
177
178
20
            union {
179
20
                char bytes[COMPRESS_STR_LENGTH];
180
20
                uint32_t value;
181
20
            } length;
182
20
            std::memcpy(length.bytes, data.data, COMPRESS_STR_LENGTH);
183
184
20
            size_t idx = col_data.size();
185
20
            col_data.resize(col_data.size() + length.value);
186
20
            uncompressed_slice = Slice(col_data.data() + idx, length.value);
187
188
20
            Slice compressed_data(data.data + COMPRESS_STR_LENGTH, data.size - COMPRESS_STR_LENGTH);
189
20
            auto st = compression_codec->decompress(compressed_data, &uncompressed_slice);
190
191
20
            if (!st.ok()) {                                      // is not a legal compressed string
192
3
                col_data.resize(col_data.size() - length.value); // remove compressed_data
193
3
                col_offset[row] = col_offset[row - 1];
194
3
                null_map[row] = true;
195
3
                continue;
196
3
            }
197
17
            col_offset[row] = col_offset[row - 1] + length.value;
198
17
        }
199
200
24
        block.replace_by_position(
201
24
                result, ColumnNullable::create(std::move(result_column), std::move(null_column)));
202
24
        return Status::OK();
203
24
    }
204
};
205
206
8
/// Adds the `compress` and `uncompress` scalar functions to `factory`.
void register_function_compress(SimpleFunctionFactory& factory) {
    // compress first, then its inverse
    factory.register_function<FunctionCompress>();
    factory.register_function<FunctionUncompress>();
}
210
#include "common/compile_check_end.h"
211
} // namespace doris