Coverage Report

Created: 2026-03-14 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_compress.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
#include <glog/logging.h>
18
19
#include <array>
20
#include <cctype>
21
#include <cstddef>
22
#include <cstring>
23
#include <functional>
24
#include <memory>
25
#include <string>
26
#include <utility>
27
28
#include "common/status.h"
29
#include "core/assert_cast.h"
30
#include "core/block/block.h"
31
#include "core/block/column_numbers.h"
32
#include "core/block/column_with_type_and_name.h"
33
#include "core/column/column.h"
34
#include "core/column/column_nullable.h"
35
#include "core/column/column_string.h"
36
#include "core/column/column_vector.h"
37
#include "core/data_type/data_type.h"
38
#include "core/data_type/data_type_nullable.h"
39
#include "core/data_type/data_type_number.h"
40
#include "core/data_type/data_type_string.h"
41
#include "core/types.h"
42
#include "exprs/aggregate/aggregate_function.h"
43
#include "exprs/function/function.h"
44
#include "exprs/function/simple_function_factory.h"
45
#include "util/block_compression.h"
46
#include "util/faststring.h"
47
48
namespace doris {
49
class FunctionContext;
50
} // namespace doris
51
52
namespace doris {
53
#include "common/compile_check_begin.h"
54
// Size in bytes of the length prefix stored in front of every compressed
// value: compress() writes the original (uncompressed) length as a uint32_t
// and advances by this many bytes; uncompress() reads the same number of bytes
// back before handing the remainder to the codec.
static constexpr int COMPRESS_STR_LENGTH = 4;
55
56
class FunctionCompress : public IFunction {
57
public:
58
    static constexpr auto name = "compress";
59
2
    static FunctionPtr create() { return std::make_shared<FunctionCompress>(); }
60
61
1
    String get_name() const override { return name; }
62
63
0
    size_t get_number_of_arguments() const override { return 1; }
64
65
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
66
0
        return std::make_shared<DataTypeString>();
67
0
    }
68
69
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
70
0
                        uint32_t result, size_t input_rows_count) const override {
71
        // Get the compression algorithm object
72
0
        BlockCompressionCodec* compression_codec;
73
0
        RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::ZLIB,
74
0
                                                    &compression_codec));
75
76
0
        const auto& arg_column =
77
0
                assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column);
78
0
        auto result_column = ColumnString::create();
79
80
0
        auto& arg_data = arg_column.get_chars();
81
0
        auto& arg_offset = arg_column.get_offsets();
82
0
        const char* arg_begin = reinterpret_cast<const char*>(arg_data.data());
83
84
0
        auto& col_data = result_column->get_chars();
85
0
        auto& col_offset = result_column->get_offsets();
86
0
        col_offset.resize(input_rows_count);
87
88
0
        faststring compressed_str;
89
0
        Slice data;
90
91
        // When the original string is large, the result is roughly this value
92
0
        size_t total = arg_offset[input_rows_count - 1];
93
0
        col_data.reserve(total / 1000);
94
95
0
        for (size_t row = 0; row < input_rows_count; row++) {
96
0
            uint32_t length = arg_offset[row] - arg_offset[row - 1];
97
0
            data = Slice(arg_begin + arg_offset[row - 1], length);
98
99
0
            size_t idx = col_data.size();
100
0
            if (!length) { // data is ''
101
0
                col_offset[row] = col_offset[row - 1];
102
0
                continue;
103
0
            }
104
105
            // Z_MEM_ERROR and Z_BUF_ERROR are already handled in compress, making sure st is always Z_OK
106
0
            RETURN_IF_ERROR(compression_codec->compress(data, &compressed_str));
107
0
            col_data.resize(col_data.size() + COMPRESS_STR_LENGTH + compressed_str.size());
108
109
0
            std::memcpy(col_data.data() + idx, &length, sizeof(length));
110
0
            idx += COMPRESS_STR_LENGTH;
111
112
            // The length of compress_str is not known in advance, so it cannot be compressed directly into col_data
113
0
            unsigned char* src = compressed_str.data();
114
0
            for (size_t i = 0; i < compressed_str.size(); idx++, i++, src++) {
115
0
                col_data[idx] = *src;
116
0
            }
117
0
            col_offset[row] =
118
0
                    col_offset[row - 1] + COMPRESS_STR_LENGTH + (int)compressed_str.size();
119
0
        }
120
121
0
        block.replace_by_position(result, std::move(result_column));
122
0
        return Status::OK();
123
0
    }
124
};
125
126
class FunctionUncompress : public IFunction {
127
public:
128
    static constexpr auto name = "uncompress";
129
2
    static FunctionPtr create() { return std::make_shared<FunctionUncompress>(); }
130
131
1
    String get_name() const override { return name; }
132
133
0
    size_t get_number_of_arguments() const override { return 1; }
134
135
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
136
0
        return make_nullable(std::make_shared<DataTypeString>());
137
0
    }
138
139
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
140
0
                        uint32_t result, size_t input_rows_count) const override {
141
        // Get the compression algorithm object
142
0
        BlockCompressionCodec* compression_codec;
143
0
        RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::ZLIB,
144
0
                                                    &compression_codec));
145
146
0
        const auto& arg_column =
147
0
                assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column);
148
149
0
        auto& arg_data = arg_column.get_chars();
150
0
        auto& arg_offset = arg_column.get_offsets();
151
0
        const char* arg_begin = reinterpret_cast<const char*>(arg_data.data());
152
153
0
        auto result_column = ColumnString::create();
154
0
        auto& col_data = result_column->get_chars();
155
0
        auto& col_offset = result_column->get_offsets();
156
0
        col_offset.resize(input_rows_count);
157
158
0
        auto null_column = ColumnUInt8::create(input_rows_count);
159
0
        auto& null_map = null_column->get_data();
160
161
0
        std::string uncompressed;
162
0
        Slice data;
163
0
        Slice uncompressed_slice;
164
165
0
        size_t total = arg_offset[input_rows_count - 1];
166
0
        col_data.reserve(total * 1000);
167
168
0
        for (size_t row = 0; row < input_rows_count; row++) {
169
0
            null_map[row] = false;
170
0
            data = Slice(arg_begin + arg_offset[row - 1], arg_offset[row] - arg_offset[row - 1]);
171
0
            size_t data_length = arg_offset[row] - arg_offset[row - 1];
172
173
0
            if (data_length == 0) { // The original data is ''
174
0
                col_offset[row] = col_offset[row - 1];
175
0
                continue;
176
0
            }
177
178
0
            union {
179
0
                char bytes[COMPRESS_STR_LENGTH];
180
0
                uint32_t value;
181
0
            } length;
182
0
            std::memcpy(length.bytes, data.data, COMPRESS_STR_LENGTH);
183
184
0
            size_t idx = col_data.size();
185
0
            col_data.resize(col_data.size() + length.value);
186
0
            uncompressed_slice = Slice(col_data.data() + idx, length.value);
187
188
0
            Slice compressed_data(data.data + COMPRESS_STR_LENGTH, data.size - COMPRESS_STR_LENGTH);
189
0
            auto st = compression_codec->decompress(compressed_data, &uncompressed_slice);
190
191
0
            if (!st.ok()) {                                      // is not a legal compressed string
192
0
                col_data.resize(col_data.size() - length.value); // remove compressed_data
193
0
                col_offset[row] = col_offset[row - 1];
194
0
                null_map[row] = true;
195
0
                continue;
196
0
            }
197
0
            col_offset[row] = col_offset[row - 1] + length.value;
198
0
        }
199
200
0
        block.replace_by_position(
201
0
                result, ColumnNullable::create(std::move(result_column), std::move(null_column)));
202
0
        return Status::OK();
203
0
    }
204
};
205
206
1
/// Adds the `compress` / `uncompress` function pair to `factory` so they can be
/// looked up by name.
void register_function_compress(SimpleFunctionFactory& factory) {
    // Forward direction: string -> length-prefixed ZLIB bytes.
    factory.register_function<FunctionCompress>();
    // Reverse direction: length-prefixed ZLIB bytes -> nullable string.
    factory.register_function<FunctionUncompress>();
}
210
#include "common/compile_check_end.h"
211
} // namespace doris