Coverage Report

Created: 2026-04-11 14:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_string_mask.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <fmt/format.h>
19
20
#include <algorithm>
21
#include <cstddef>
22
23
#include "common/status.h"
24
#include "core/assert_cast.h"
25
#include "core/block/block.h"
26
#include "core/block/column_numbers.h"
27
#include "core/column/column_const.h"
28
#include "core/column/column_string.h"
29
#include "core/column/column_vector.h"
30
#include "core/data_type/data_type_string.h"
31
#include "core/memcpy_small.h"
32
#include "core/string_ref.h"
33
#include "exprs/function/function.h"
34
#include "exprs/function/function_helpers.h"
35
#include "exprs/function/simple_function_factory.h"
36
#include "exprs/function_context.h"
37
38
namespace doris {
39
#include "common/compile_check_avoid_begin.h"
40
41
template <bool Reverse>
42
class FunctionMaskPartial;
43
44
class FunctionMask : public IFunction {
45
public:
46
    static constexpr auto name = "mask";
47
    static constexpr unsigned char DEFAULT_UPPER_MASK = 'X';
48
    static constexpr unsigned char DEFAULT_LOWER_MASK = 'x';
49
    static constexpr unsigned char DEFAULT_NUMBER_MASK = 'n';
50
5
    String get_name() const override { return name; }
51
59
    static FunctionPtr create() { return std::make_shared<FunctionMask>(); }
52
53
50
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
54
50
        return std::make_shared<DataTypeString>();
55
50
    }
56
57
0
    size_t get_number_of_arguments() const override { return 0; }
58
59
144
    ColumnNumbers get_arguments_that_are_always_constant() const override { return {1, 2, 3}; }
60
61
51
    bool is_variadic() const override { return true; }
62
63
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
64
95
                        uint32_t result, size_t input_rows_count) const override {
65
95
        DCHECK_GE(arguments.size(), 1);
66
95
        DCHECK_LE(arguments.size(), 4);
67
68
95
        char upper = DEFAULT_UPPER_MASK, lower = DEFAULT_LOWER_MASK, number = DEFAULT_NUMBER_MASK;
69
70
95
        auto res = ColumnString::create();
71
95
        const auto& source_column =
72
95
                assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column);
73
74
95
        if (arguments.size() > 1) {
75
38
            const auto& col = *block.get_by_position(arguments[1]).column;
76
38
            auto string_ref = col.get_data_at(0);
77
38
            if (string_ref.size > 0) {
78
38
                upper = *string_ref.data;
79
38
            }
80
38
        }
81
82
95
        if (arguments.size() > 2) {
83
22
            const auto& col = *block.get_by_position(arguments[2]).column;
84
22
            auto string_ref = col.get_data_at(0);
85
22
            if (string_ref.size > 0) {
86
22
                lower = *string_ref.data;
87
22
            }
88
22
        }
89
90
95
        if (arguments.size() > 3) {
91
12
            const auto& col = *block.get_by_position(arguments[3]).column;
92
12
            auto string_ref = col.get_data_at(0);
93
12
            if (string_ref.size > 0) {
94
12
                number = *string_ref.data;
95
12
            }
96
12
        }
97
98
95
        if (arguments.size() > 4) {
99
0
            return Status::InvalidArgument(
100
0
                    fmt::format("too many arguments for function {}", get_name()));
101
0
        }
102
103
95
        vector_mask(source_column, *res, upper, lower, number);
104
105
95
        block.get_by_position(result).column = std::move(res);
106
107
95
        return Status::OK();
108
95
    }
109
    friend class FunctionMaskPartial<true>;
110
    friend class FunctionMaskPartial<false>;
111
112
private:
113
    static void vector_mask(const ColumnString& source, ColumnString& result, const char upper,
114
163
                            const char lower, const char number) {
115
163
        result.get_chars().resize(source.get_chars().size());
116
163
        result.get_offsets().resize(source.get_offsets().size());
117
163
        memcpy_small_allow_read_write_overflow15(
118
163
                result.get_offsets().data(), source.get_offsets().data(),
119
163
                source.get_offsets().size() * sizeof(ColumnString::Offset));
120
121
163
        const unsigned char* src = source.get_chars().data();
122
163
        const size_t size = source.get_chars().size();
123
163
        unsigned char* res = result.get_chars().data();
124
163
        mask(src, size, upper, lower, number, res);
125
163
    }
126
127
    static void mask(const unsigned char* __restrict src, const size_t size,
128
                     const unsigned char upper, const unsigned char lower,
129
314
                     const unsigned char number, unsigned char* __restrict res) {
130
3.67k
        for (size_t i = 0; i != size; ++i) {
131
3.36k
            auto c = src[i];
132
3.36k
            if (c >= 'A' && c <= 'Z') {
133
260
                res[i] = upper;
134
3.10k
            } else if (c >= 'a' && c <= 'z') {
135
1.81k
                res[i] = lower;
136
1.81k
            } else if (c >= '0' && c <= '9') {
137
1.12k
                res[i] = number;
138
1.12k
            } else {
139
158
                res[i] = c;
140
158
            }
141
3.36k
        }
142
314
    }
143
};
144
145
template <bool Reverse>
146
class FunctionMaskPartial : public IFunction {
147
public:
148
    static constexpr auto name = Reverse ? "mask_last_n" : "mask_first_n";
149
0
    String get_name() const override { return name; }
Unexecuted instantiation: _ZNK5doris19FunctionMaskPartialILb1EE8get_nameB5cxx11Ev
Unexecuted instantiation: _ZNK5doris19FunctionMaskPartialILb0EE8get_nameB5cxx11Ev
150
120
    static FunctionPtr create() { return std::make_shared<FunctionMaskPartial>(); }
_ZN5doris19FunctionMaskPartialILb1EE6createEv
Line
Count
Source
150
60
    static FunctionPtr create() { return std::make_shared<FunctionMaskPartial>(); }
_ZN5doris19FunctionMaskPartialILb0EE6createEv
Line
Count
Source
150
60
    static FunctionPtr create() { return std::make_shared<FunctionMaskPartial>(); }
151
152
102
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
153
102
        return std::make_shared<DataTypeString>();
154
102
    }
_ZNK5doris19FunctionMaskPartialILb1EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS6_EE
Line
Count
Source
152
51
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
153
51
        return std::make_shared<DataTypeString>();
154
51
    }
_ZNK5doris19FunctionMaskPartialILb0EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS6_EE
Line
Count
Source
152
51
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
153
51
        return std::make_shared<DataTypeString>();
154
51
    }
155
156
0
    size_t get_number_of_arguments() const override { return 0; }
Unexecuted instantiation: _ZNK5doris19FunctionMaskPartialILb1EE23get_number_of_argumentsEv
Unexecuted instantiation: _ZNK5doris19FunctionMaskPartialILb0EE23get_number_of_argumentsEv
157
158
104
    bool is_variadic() const override { return true; }
_ZNK5doris19FunctionMaskPartialILb1EE11is_variadicEv
Line
Count
Source
158
52
    bool is_variadic() const override { return true; }
_ZNK5doris19FunctionMaskPartialILb0EE11is_variadicEv
Line
Count
Source
158
52
    bool is_variadic() const override { return true; }
159
160
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
161
206
                        uint32_t result, size_t input_rows_count) const override {
162
206
        auto res = ColumnString::create();
163
206
        auto col = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
164
206
        const auto& source_column = assert_cast<const ColumnString&>(*col);
165
166
206
        if (arguments.size() == 1) { // no 2nd arg, just mask all
167
70
            FunctionMask::vector_mask(source_column, *res, FunctionMask::DEFAULT_UPPER_MASK,
168
70
                                      FunctionMask::DEFAULT_LOWER_MASK,
169
70
                                      FunctionMask::DEFAULT_NUMBER_MASK);
170
136
        } else {
171
136
            const auto& [col_2nd, is_const] =
172
136
                    unpack_if_const(block.get_by_position(arguments[1]).column);
173
174
136
            const auto& col_n = assert_cast<const ColumnInt32&>(*col_2nd);
175
176
136
            if (is_const) {
177
64
                RETURN_IF_ERROR(vector<true>(source_column, col_n, *res));
178
72
            } else {
179
72
                RETURN_IF_ERROR(vector<false>(source_column, col_n, *res));
180
72
            }
181
136
        }
182
183
196
        block.get_by_position(result).column = std::move(res);
184
185
196
        return Status::OK();
186
206
    }
_ZNK5doris19FunctionMaskPartialILb1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
Line
Count
Source
161
103
                        uint32_t result, size_t input_rows_count) const override {
162
103
        auto res = ColumnString::create();
163
103
        auto col = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
164
103
        const auto& source_column = assert_cast<const ColumnString&>(*col);
165
166
103
        if (arguments.size() == 1) { // no 2nd arg, just mask all
167
35
            FunctionMask::vector_mask(source_column, *res, FunctionMask::DEFAULT_UPPER_MASK,
168
35
                                      FunctionMask::DEFAULT_LOWER_MASK,
169
35
                                      FunctionMask::DEFAULT_NUMBER_MASK);
170
68
        } else {
171
68
            const auto& [col_2nd, is_const] =
172
68
                    unpack_if_const(block.get_by_position(arguments[1]).column);
173
174
68
            const auto& col_n = assert_cast<const ColumnInt32&>(*col_2nd);
175
176
68
            if (is_const) {
177
32
                RETURN_IF_ERROR(vector<true>(source_column, col_n, *res));
178
36
            } else {
179
36
                RETURN_IF_ERROR(vector<false>(source_column, col_n, *res));
180
36
            }
181
68
        }
182
183
98
        block.get_by_position(result).column = std::move(res);
184
185
98
        return Status::OK();
186
103
    }
_ZNK5doris19FunctionMaskPartialILb0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
Line
Count
Source
161
103
                        uint32_t result, size_t input_rows_count) const override {
162
103
        auto res = ColumnString::create();
163
103
        auto col = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
164
103
        const auto& source_column = assert_cast<const ColumnString&>(*col);
165
166
103
        if (arguments.size() == 1) { // no 2nd arg, just mask all
167
35
            FunctionMask::vector_mask(source_column, *res, FunctionMask::DEFAULT_UPPER_MASK,
168
35
                                      FunctionMask::DEFAULT_LOWER_MASK,
169
35
                                      FunctionMask::DEFAULT_NUMBER_MASK);
170
68
        } else {
171
68
            const auto& [col_2nd, is_const] =
172
68
                    unpack_if_const(block.get_by_position(arguments[1]).column);
173
174
68
            const auto& col_n = assert_cast<const ColumnInt32&>(*col_2nd);
175
176
68
            if (is_const) {
177
32
                RETURN_IF_ERROR(vector<true>(source_column, col_n, *res));
178
36
            } else {
179
36
                RETURN_IF_ERROR(vector<false>(source_column, col_n, *res));
180
36
            }
181
68
        }
182
183
98
        block.get_by_position(result).column = std::move(res);
184
185
98
        return Status::OK();
186
103
    }
187
188
private:
189
    template <bool is_const>
190
138
    static Status vector(const ColumnString& src, const ColumnInt32& col_n, ColumnString& result) {
191
138
        const auto num_rows = src.size();
192
138
        const auto* chars = src.get_chars().data();
193
138
        const auto* offsets = src.get_offsets().data();
194
138
        result.get_chars().resize(src.get_chars().size());
195
138
        result.get_offsets().resize(src.get_offsets().size());
196
138
        memcpy_small_allow_read_write_overflow15(
197
138
                result.get_offsets().data(), src.get_offsets().data(),
198
138
                src.get_offsets().size() * sizeof(ColumnString::Offset));
199
138
        auto* res = result.get_chars().data();
200
201
138
        const auto& col_n_data = col_n.get_data();
202
203
286
        for (ssize_t i = 0; i != num_rows; ++i) {
204
158
            auto offset = offsets[i - 1];
205
158
            int len = offsets[i] - offset;
206
158
            const int n = col_n_data[index_check_const<is_const>(i)];
207
208
158
            if (n < 0) [[unlikely]] {
209
10
                return Status::InvalidArgument(
210
10
                        "function {} only accept non-negative input for 2nd argument but got {}",
211
10
                        name, n);
212
10
            }
213
214
148
            if constexpr (Reverse) {
215
74
                auto start = std::max(len - n, 0);
216
74
                if (start > 0) {
217
48
                    memcpy(&res[offset], &chars[offset], start);
218
48
                }
219
74
                offset += start;
220
74
            } else {
221
74
                if (n < len) {
222
48
                    memcpy(&res[offset + n], &chars[offset + n], len - n);
223
48
                }
224
74
            }
225
226
148
            len = std::min(n, len);
227
148
            FunctionMask::mask(&chars[offset], len, FunctionMask::DEFAULT_UPPER_MASK,
228
148
                               FunctionMask::DEFAULT_LOWER_MASK, FunctionMask::DEFAULT_NUMBER_MASK,
229
148
                               &res[offset]);
230
148
        }
231
232
128
        return Status::OK();
233
138
    }
_ZN5doris19FunctionMaskPartialILb1EE6vectorILb1EEENS_6StatusERKNS_9ColumnStrIjEERKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERS5_
Line
Count
Source
190
32
    static Status vector(const ColumnString& src, const ColumnInt32& col_n, ColumnString& result) {
191
32
        const auto num_rows = src.size();
192
32
        const auto* chars = src.get_chars().data();
193
32
        const auto* offsets = src.get_offsets().data();
194
32
        result.get_chars().resize(src.get_chars().size());
195
32
        result.get_offsets().resize(src.get_offsets().size());
196
32
        memcpy_small_allow_read_write_overflow15(
197
32
                result.get_offsets().data(), src.get_offsets().data(),
198
32
                src.get_offsets().size() * sizeof(ColumnString::Offset));
199
32
        auto* res = result.get_chars().data();
200
201
32
        const auto& col_n_data = col_n.get_data();
202
203
72
        for (ssize_t i = 0; i != num_rows; ++i) {
204
40
            auto offset = offsets[i - 1];
205
40
            int len = offsets[i] - offset;
206
40
            const int n = col_n_data[index_check_const<is_const>(i)];
207
208
40
            if (n < 0) [[unlikely]] {
209
0
                return Status::InvalidArgument(
210
0
                        "function {} only accept non-negative input for 2nd argument but got {}",
211
0
                        name, n);
212
0
            }
213
214
40
            if constexpr (Reverse) {
215
40
                auto start = std::max(len - n, 0);
216
40
                if (start > 0) {
217
20
                    memcpy(&res[offset], &chars[offset], start);
218
20
                }
219
40
                offset += start;
220
            } else {
221
                if (n < len) {
222
                    memcpy(&res[offset + n], &chars[offset + n], len - n);
223
                }
224
            }
225
226
40
            len = std::min(n, len);
227
40
            FunctionMask::mask(&chars[offset], len, FunctionMask::DEFAULT_UPPER_MASK,
228
40
                               FunctionMask::DEFAULT_LOWER_MASK, FunctionMask::DEFAULT_NUMBER_MASK,
229
40
                               &res[offset]);
230
40
        }
231
232
32
        return Status::OK();
233
32
    }
_ZN5doris19FunctionMaskPartialILb1EE6vectorILb0EEENS_6StatusERKNS_9ColumnStrIjEERKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERS5_
Line
Count
Source
190
37
    static Status vector(const ColumnString& src, const ColumnInt32& col_n, ColumnString& result) {
191
37
        const auto num_rows = src.size();
192
37
        const auto* chars = src.get_chars().data();
193
37
        const auto* offsets = src.get_offsets().data();
194
37
        result.get_chars().resize(src.get_chars().size());
195
37
        result.get_offsets().resize(src.get_offsets().size());
196
37
        memcpy_small_allow_read_write_overflow15(
197
37
                result.get_offsets().data(), src.get_offsets().data(),
198
37
                src.get_offsets().size() * sizeof(ColumnString::Offset));
199
37
        auto* res = result.get_chars().data();
200
201
37
        const auto& col_n_data = col_n.get_data();
202
203
71
        for (ssize_t i = 0; i != num_rows; ++i) {
204
39
            auto offset = offsets[i - 1];
205
39
            int len = offsets[i] - offset;
206
39
            const int n = col_n_data[index_check_const<is_const>(i)];
207
208
39
            if (n < 0) [[unlikely]] {
209
5
                return Status::InvalidArgument(
210
5
                        "function {} only accept non-negative input for 2nd argument but got {}",
211
5
                        name, n);
212
5
            }
213
214
34
            if constexpr (Reverse) {
215
34
                auto start = std::max(len - n, 0);
216
34
                if (start > 0) {
217
28
                    memcpy(&res[offset], &chars[offset], start);
218
28
                }
219
34
                offset += start;
220
            } else {
221
                if (n < len) {
222
                    memcpy(&res[offset + n], &chars[offset + n], len - n);
223
                }
224
            }
225
226
34
            len = std::min(n, len);
227
34
            FunctionMask::mask(&chars[offset], len, FunctionMask::DEFAULT_UPPER_MASK,
228
34
                               FunctionMask::DEFAULT_LOWER_MASK, FunctionMask::DEFAULT_NUMBER_MASK,
229
34
                               &res[offset]);
230
34
        }
231
232
32
        return Status::OK();
233
37
    }
_ZN5doris19FunctionMaskPartialILb0EE6vectorILb1EEENS_6StatusERKNS_9ColumnStrIjEERKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERS5_
Line
Count
Source
190
32
    static Status vector(const ColumnString& src, const ColumnInt32& col_n, ColumnString& result) {
191
32
        const auto num_rows = src.size();
192
32
        const auto* chars = src.get_chars().data();
193
32
        const auto* offsets = src.get_offsets().data();
194
32
        result.get_chars().resize(src.get_chars().size());
195
32
        result.get_offsets().resize(src.get_offsets().size());
196
32
        memcpy_small_allow_read_write_overflow15(
197
32
                result.get_offsets().data(), src.get_offsets().data(),
198
32
                src.get_offsets().size() * sizeof(ColumnString::Offset));
199
32
        auto* res = result.get_chars().data();
200
201
32
        const auto& col_n_data = col_n.get_data();
202
203
72
        for (ssize_t i = 0; i != num_rows; ++i) {
204
40
            auto offset = offsets[i - 1];
205
40
            int len = offsets[i] - offset;
206
40
            const int n = col_n_data[index_check_const<is_const>(i)];
207
208
40
            if (n < 0) [[unlikely]] {
209
0
                return Status::InvalidArgument(
210
0
                        "function {} only accept non-negative input for 2nd argument but got {}",
211
0
                        name, n);
212
0
            }
213
214
            if constexpr (Reverse) {
215
                auto start = std::max(len - n, 0);
216
                if (start > 0) {
217
                    memcpy(&res[offset], &chars[offset], start);
218
                }
219
                offset += start;
220
40
            } else {
221
40
                if (n < len) {
222
20
                    memcpy(&res[offset + n], &chars[offset + n], len - n);
223
20
                }
224
40
            }
225
226
40
            len = std::min(n, len);
227
40
            FunctionMask::mask(&chars[offset], len, FunctionMask::DEFAULT_UPPER_MASK,
228
40
                               FunctionMask::DEFAULT_LOWER_MASK, FunctionMask::DEFAULT_NUMBER_MASK,
229
40
                               &res[offset]);
230
40
        }
231
232
32
        return Status::OK();
233
32
    }
_ZN5doris19FunctionMaskPartialILb0EE6vectorILb0EEENS_6StatusERKNS_9ColumnStrIjEERKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERS5_
Line
Count
Source
190
37
    static Status vector(const ColumnString& src, const ColumnInt32& col_n, ColumnString& result) {
191
37
        const auto num_rows = src.size();
192
37
        const auto* chars = src.get_chars().data();
193
37
        const auto* offsets = src.get_offsets().data();
194
37
        result.get_chars().resize(src.get_chars().size());
195
37
        result.get_offsets().resize(src.get_offsets().size());
196
37
        memcpy_small_allow_read_write_overflow15(
197
37
                result.get_offsets().data(), src.get_offsets().data(),
198
37
                src.get_offsets().size() * sizeof(ColumnString::Offset));
199
37
        auto* res = result.get_chars().data();
200
201
37
        const auto& col_n_data = col_n.get_data();
202
203
71
        for (ssize_t i = 0; i != num_rows; ++i) {
204
39
            auto offset = offsets[i - 1];
205
39
            int len = offsets[i] - offset;
206
39
            const int n = col_n_data[index_check_const<is_const>(i)];
207
208
39
            if (n < 0) [[unlikely]] {
209
5
                return Status::InvalidArgument(
210
5
                        "function {} only accept non-negative input for 2nd argument but got {}",
211
5
                        name, n);
212
5
            }
213
214
            if constexpr (Reverse) {
215
                auto start = std::max(len - n, 0);
216
                if (start > 0) {
217
                    memcpy(&res[offset], &chars[offset], start);
218
                }
219
                offset += start;
220
34
            } else {
221
34
                if (n < len) {
222
28
                    memcpy(&res[offset + n], &chars[offset + n], len - n);
223
28
                }
224
34
            }
225
226
34
            len = std::min(n, len);
227
34
            FunctionMask::mask(&chars[offset], len, FunctionMask::DEFAULT_UPPER_MASK,
228
34
                               FunctionMask::DEFAULT_LOWER_MASK, FunctionMask::DEFAULT_NUMBER_MASK,
229
34
                               &res[offset]);
230
34
        }
231
232
32
        return Status::OK();
233
37
    }
234
};
235
236
8
void register_function_string_mask(SimpleFunctionFactory& factory) {
237
8
    factory.register_function<FunctionMask>();
238
8
    factory.register_function<FunctionMaskPartial<true>>();
239
8
    factory.register_function<FunctionMaskPartial<false>>();
240
8
}
241
242
#include "common/compile_check_avoid_end.h"
243
} // namespace doris