Coverage Report

Created: 2026-04-10 18:35

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_compress.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
#include <glog/logging.h>
18
19
#include <array>
20
#include <cctype>
21
#include <cstddef>
22
#include <cstring>
23
#include <functional>
24
#include <memory>
25
#include <string>
26
#include <utility>
27
28
#include "common/status.h"
29
#include "core/assert_cast.h"
30
#include "core/block/block.h"
31
#include "core/block/column_numbers.h"
32
#include "core/block/column_with_type_and_name.h"
33
#include "core/column/column.h"
34
#include "core/column/column_nullable.h"
35
#include "core/column/column_string.h"
36
#include "core/column/column_vector.h"
37
#include "core/data_type/data_type.h"
38
#include "core/data_type/data_type_nullable.h"
39
#include "core/data_type/data_type_number.h"
40
#include "core/data_type/data_type_string.h"
41
#include "core/types.h"
42
#include "exprs/aggregate/aggregate_function.h"
43
#include "exprs/function/function.h"
44
#include "exprs/function/simple_function_factory.h"
45
#include "util/block_compression.h"
46
#include "util/faststring.h"
47
48
namespace doris {
49
class FunctionContext;
50
} // namespace doris
51
52
namespace doris {
53
// Size in bytes of the header that precedes every compressed payload.
// FunctionCompress writes the uncompressed length (a uint32_t) into these bytes
// and FunctionUncompress reads it back before decompressing.
static constexpr int COMPRESS_STR_LENGTH = 4;
54
55
class FunctionCompress : public IFunction {
56
public:
57
    static constexpr auto name = "compress";
58
27
    static FunctionPtr create() { return std::make_shared<FunctionCompress>(); }
59
60
1
    String get_name() const override { return name; }
61
62
20
    size_t get_number_of_arguments() const override { return 1; }
63
64
20
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
65
20
        return std::make_shared<DataTypeString>();
66
20
    }
67
68
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
69
22
                        uint32_t result, size_t input_rows_count) const override {
70
        // Get the compression algorithm object
71
22
        BlockCompressionCodec* compression_codec;
72
22
        RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::ZLIB,
73
22
                                                    &compression_codec));
74
75
22
        const auto& arg_column =
76
22
                assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column);
77
22
        auto result_column = ColumnString::create();
78
79
22
        auto& arg_data = arg_column.get_chars();
80
22
        auto& arg_offset = arg_column.get_offsets();
81
22
        const char* arg_begin = reinterpret_cast<const char*>(arg_data.data());
82
83
22
        auto& col_data = result_column->get_chars();
84
22
        auto& col_offset = result_column->get_offsets();
85
22
        col_offset.resize(input_rows_count);
86
87
22
        faststring compressed_str;
88
22
        Slice data;
89
90
        // When the original string is large, the result is roughly this value
91
22
        size_t total = arg_offset[input_rows_count - 1];
92
22
        col_data.reserve(total / 1000);
93
94
45
        for (size_t row = 0; row < input_rows_count; row++) {
95
23
            uint32_t length = arg_offset[row] - arg_offset[row - 1];
96
23
            data = Slice(arg_begin + arg_offset[row - 1], length);
97
98
23
            size_t idx = col_data.size();
99
23
            if (!length) { // data is ''
100
7
                col_offset[row] = col_offset[row - 1];
101
7
                continue;
102
7
            }
103
104
            // Z_MEM_ERROR and Z_BUF_ERROR are already handled in compress, making sure st is always Z_OK
105
16
            RETURN_IF_ERROR(compression_codec->compress(data, &compressed_str));
106
16
            col_data.resize(col_data.size() + COMPRESS_STR_LENGTH + compressed_str.size());
107
108
16
            std::memcpy(col_data.data() + idx, &length, sizeof(length));
109
16
            idx += COMPRESS_STR_LENGTH;
110
111
            // The length of compress_str is not known in advance, so it cannot be compressed directly into col_data
112
16
            unsigned char* src = compressed_str.data();
113
295
            for (size_t i = 0; i < compressed_str.size(); idx++, i++, src++) {
114
279
                col_data[idx] = *src;
115
279
            }
116
16
            col_offset[row] =
117
16
                    col_offset[row - 1] + COMPRESS_STR_LENGTH + (int)compressed_str.size();
118
16
        }
119
120
22
        block.replace_by_position(result, std::move(result_column));
121
22
        return Status::OK();
122
22
    }
123
};
124
125
class FunctionUncompress : public IFunction {
126
public:
127
    static constexpr auto name = "uncompress";
128
25
    static FunctionPtr create() { return std::make_shared<FunctionUncompress>(); }
129
130
1
    String get_name() const override { return name; }
131
132
18
    size_t get_number_of_arguments() const override { return 1; }
133
134
18
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
135
18
        return make_nullable(std::make_shared<DataTypeString>());
136
18
    }
137
138
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
139
24
                        uint32_t result, size_t input_rows_count) const override {
140
        // Get the compression algorithm object
141
24
        BlockCompressionCodec* compression_codec;
142
24
        RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::ZLIB,
143
24
                                                    &compression_codec));
144
145
24
        const auto& arg_column =
146
24
                assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column);
147
148
24
        auto& arg_data = arg_column.get_chars();
149
24
        auto& arg_offset = arg_column.get_offsets();
150
24
        const char* arg_begin = reinterpret_cast<const char*>(arg_data.data());
151
152
24
        auto result_column = ColumnString::create();
153
24
        auto& col_data = result_column->get_chars();
154
24
        auto& col_offset = result_column->get_offsets();
155
24
        col_offset.resize(input_rows_count);
156
157
24
        auto null_column = ColumnUInt8::create(input_rows_count);
158
24
        auto& null_map = null_column->get_data();
159
160
24
        std::string uncompressed;
161
24
        Slice data;
162
24
        Slice uncompressed_slice;
163
164
24
        size_t total = arg_offset[input_rows_count - 1];
165
24
        col_data.reserve(total * 1000);
166
167
49
        for (size_t row = 0; row < input_rows_count; row++) {
168
25
            null_map[row] = false;
169
25
            data = Slice(arg_begin + arg_offset[row - 1], arg_offset[row] - arg_offset[row - 1]);
170
25
            size_t data_length = arg_offset[row] - arg_offset[row - 1];
171
172
25
            if (data_length == 0) { // The original data is ''
173
5
                col_offset[row] = col_offset[row - 1];
174
5
                continue;
175
5
            }
176
177
20
            union {
178
20
                char bytes[COMPRESS_STR_LENGTH];
179
20
                uint32_t value;
180
20
            } length;
181
20
            std::memcpy(length.bytes, data.data, COMPRESS_STR_LENGTH);
182
183
20
            size_t idx = col_data.size();
184
20
            col_data.resize(col_data.size() + length.value);
185
20
            uncompressed_slice = Slice(col_data.data() + idx, length.value);
186
187
20
            Slice compressed_data(data.data + COMPRESS_STR_LENGTH, data.size - COMPRESS_STR_LENGTH);
188
20
            auto st = compression_codec->decompress(compressed_data, &uncompressed_slice);
189
190
20
            if (!st.ok()) {                                      // is not a legal compressed string
191
3
                col_data.resize(col_data.size() - length.value); // remove compressed_data
192
3
                col_offset[row] = col_offset[row - 1];
193
3
                null_map[row] = true;
194
3
                continue;
195
3
            }
196
17
            col_offset[row] = col_offset[row - 1] + length.value;
197
17
        }
198
199
24
        block.replace_by_position(
200
24
                result, ColumnNullable::create(std::move(result_column), std::move(null_column)));
201
24
        return Status::OK();
202
24
    }
203
};
204
205
6
/// Registers the `compress` and `uncompress` scalar functions with `factory`.
void register_function_compress(SimpleFunctionFactory& factory) {
    factory.register_function<FunctionCompress>();
    factory.register_function<FunctionUncompress>();
}
209
} // namespace doris