Coverage Report

Created: 2026-04-11 00:05

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_compress.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
#include <glog/logging.h>
18
19
#include <array>
20
#include <cctype>
21
#include <cstddef>
22
#include <cstring>
23
#include <functional>
24
#include <memory>
25
#include <string>
26
#include <utility>
27
28
#include "common/status.h"
29
#include "core/assert_cast.h"
30
#include "core/block/block.h"
31
#include "core/block/column_numbers.h"
32
#include "core/block/column_with_type_and_name.h"
33
#include "core/column/column.h"
34
#include "core/column/column_nullable.h"
35
#include "core/column/column_string.h"
36
#include "core/column/column_vector.h"
37
#include "core/data_type/data_type.h"
38
#include "core/data_type/data_type_nullable.h"
39
#include "core/data_type/data_type_number.h"
40
#include "core/data_type/data_type_string.h"
41
#include "core/types.h"
42
#include "exprs/aggregate/aggregate_function.h"
43
#include "exprs/function/function.h"
44
#include "exprs/function/simple_function_factory.h"
45
#include "util/block_compression.h"
46
#include "util/faststring.h"
47
48
namespace doris {
49
class FunctionContext;
50
} // namespace doris
51
52
namespace doris {
53
// Size in bytes of the length header that FunctionCompress writes in front of each compressed
// payload (the uncompressed length, as a uint32_t) and that FunctionUncompress reads back.
static constexpr int COMPRESS_STR_LENGTH = 4;
54
55
class FunctionCompress : public IFunction {
56
public:
57
    static constexpr auto name = "compress";
58
2
    static FunctionPtr create() { return std::make_shared<FunctionCompress>(); }
59
60
1
    String get_name() const override { return name; }
61
62
0
    size_t get_number_of_arguments() const override { return 1; }
63
64
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
65
0
        return std::make_shared<DataTypeString>();
66
0
    }
67
68
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
69
0
                        uint32_t result, size_t input_rows_count) const override {
70
        // Get the compression algorithm object
71
0
        BlockCompressionCodec* compression_codec;
72
0
        RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::ZLIB,
73
0
                                                    &compression_codec));
74
75
0
        const auto& arg_column =
76
0
                assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column);
77
0
        auto result_column = ColumnString::create();
78
79
0
        auto& arg_data = arg_column.get_chars();
80
0
        auto& arg_offset = arg_column.get_offsets();
81
0
        const char* arg_begin = reinterpret_cast<const char*>(arg_data.data());
82
83
0
        auto& col_data = result_column->get_chars();
84
0
        auto& col_offset = result_column->get_offsets();
85
0
        col_offset.resize(input_rows_count);
86
87
0
        faststring compressed_str;
88
0
        Slice data;
89
90
        // When the original string is large, the result is roughly this value
91
0
        size_t total = arg_offset[input_rows_count - 1];
92
0
        col_data.reserve(total / 1000);
93
94
0
        for (size_t row = 0; row < input_rows_count; row++) {
95
0
            uint32_t length = arg_offset[row] - arg_offset[row - 1];
96
0
            data = Slice(arg_begin + arg_offset[row - 1], length);
97
98
0
            size_t idx = col_data.size();
99
0
            if (!length) { // data is ''
100
0
                col_offset[row] = col_offset[row - 1];
101
0
                continue;
102
0
            }
103
104
            // Z_MEM_ERROR and Z_BUF_ERROR are already handled in compress, making sure st is always Z_OK
105
0
            RETURN_IF_ERROR(compression_codec->compress(data, &compressed_str));
106
0
            col_data.resize(col_data.size() + COMPRESS_STR_LENGTH + compressed_str.size());
107
108
0
            std::memcpy(col_data.data() + idx, &length, sizeof(length));
109
0
            idx += COMPRESS_STR_LENGTH;
110
111
            // The length of compress_str is not known in advance, so it cannot be compressed directly into col_data
112
0
            unsigned char* src = compressed_str.data();
113
0
            for (size_t i = 0; i < compressed_str.size(); idx++, i++, src++) {
114
0
                col_data[idx] = *src;
115
0
            }
116
0
            col_offset[row] =
117
0
                    col_offset[row - 1] + COMPRESS_STR_LENGTH + (int)compressed_str.size();
118
0
        }
119
120
0
        block.replace_by_position(result, std::move(result_column));
121
0
        return Status::OK();
122
0
    }
123
};
124
125
class FunctionUncompress : public IFunction {
126
public:
127
    static constexpr auto name = "uncompress";
128
2
    static FunctionPtr create() { return std::make_shared<FunctionUncompress>(); }
129
130
1
    String get_name() const override { return name; }
131
132
0
    size_t get_number_of_arguments() const override { return 1; }
133
134
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
135
0
        return make_nullable(std::make_shared<DataTypeString>());
136
0
    }
137
138
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
139
0
                        uint32_t result, size_t input_rows_count) const override {
140
        // Get the compression algorithm object
141
0
        BlockCompressionCodec* compression_codec;
142
0
        RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::ZLIB,
143
0
                                                    &compression_codec));
144
145
0
        const auto& arg_column =
146
0
                assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column);
147
148
0
        auto& arg_data = arg_column.get_chars();
149
0
        auto& arg_offset = arg_column.get_offsets();
150
0
        const char* arg_begin = reinterpret_cast<const char*>(arg_data.data());
151
152
0
        auto result_column = ColumnString::create();
153
0
        auto& col_data = result_column->get_chars();
154
0
        auto& col_offset = result_column->get_offsets();
155
0
        col_offset.resize(input_rows_count);
156
157
0
        auto null_column = ColumnUInt8::create(input_rows_count);
158
0
        auto& null_map = null_column->get_data();
159
160
0
        std::string uncompressed;
161
0
        Slice data;
162
0
        Slice uncompressed_slice;
163
164
0
        size_t total = arg_offset[input_rows_count - 1];
165
0
        col_data.reserve(total * 1000);
166
167
0
        for (size_t row = 0; row < input_rows_count; row++) {
168
0
            null_map[row] = false;
169
0
            data = Slice(arg_begin + arg_offset[row - 1], arg_offset[row] - arg_offset[row - 1]);
170
0
            size_t data_length = arg_offset[row] - arg_offset[row - 1];
171
172
0
            if (data_length == 0) { // The original data is ''
173
0
                col_offset[row] = col_offset[row - 1];
174
0
                continue;
175
0
            }
176
177
0
            union {
178
0
                char bytes[COMPRESS_STR_LENGTH];
179
0
                uint32_t value;
180
0
            } length;
181
0
            std::memcpy(length.bytes, data.data, COMPRESS_STR_LENGTH);
182
183
0
            size_t idx = col_data.size();
184
0
            col_data.resize(col_data.size() + length.value);
185
0
            uncompressed_slice = Slice(col_data.data() + idx, length.value);
186
187
0
            Slice compressed_data(data.data + COMPRESS_STR_LENGTH, data.size - COMPRESS_STR_LENGTH);
188
0
            auto st = compression_codec->decompress(compressed_data, &uncompressed_slice);
189
190
0
            if (!st.ok()) {                                      // is not a legal compressed string
191
0
                col_data.resize(col_data.size() - length.value); // remove compressed_data
192
0
                col_offset[row] = col_offset[row - 1];
193
0
                null_map[row] = true;
194
0
                continue;
195
0
            }
196
0
            col_offset[row] = col_offset[row - 1] + length.value;
197
0
        }
198
199
0
        block.replace_by_position(
200
0
                result, ColumnNullable::create(std::move(result_column), std::move(null_column)));
201
0
        return Status::OK();
202
0
    }
203
};
204
205
1
/// Registers FunctionCompress and FunctionUncompress with the given function factory.
void register_function_compress(SimpleFunctionFactory& factory) {
    // Registration order matches the declaration order of the two classes above.
    factory.register_function<FunctionCompress>();
    factory.register_function<FunctionUncompress>();
}
209
} // namespace doris