Coverage Report

Created: 2026-05-29 10:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_hamming_distance.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <algorithm>
19
#include <vector>
20
21
#include "common/status.h"
22
#include "core/column/column_nullable.h"
23
#include "core/column/column_string.h"
24
#include "core/data_type/data_type_number.h"
25
#include "core/string_ref.h"
26
#include "exprs/function/simple_function_factory.h"
27
#include "util/simd/vstring_function.h"
28
29
namespace doris {
30
31
class FunctionHammingDistance : public IFunction {
32
public:
33
    using ResultDataType = DataTypeInt64;
34
    using ResultPaddedPODArray = PaddedPODArray<Int64>;
35
    using ResultColumnType = ColumnVector<ResultDataType::PType>;
36
37
    static constexpr auto name = "hamming_distance";
38
39
46
    static FunctionPtr create() { return std::make_shared<FunctionHammingDistance>(); }
40
41
1
    String get_name() const override { return name; }
42
37
    size_t get_number_of_arguments() const override { return 2; }
43
44
37
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
45
37
        const bool has_nullable = std::ranges::any_of(
46
65
                arguments, [](const DataTypePtr& type) { return type->is_nullable(); });
47
37
        if (has_nullable) {
48
14
            return make_nullable(std::make_shared<ResultDataType>());
49
14
        }
50
23
        return std::make_shared<ResultDataType>();
51
37
    }
52
53
74
    bool use_default_implementation_for_nulls() const override { return false; }
54
55
    Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments,
56
37
                        uint32_t result, size_t input_rows_count) const override {
57
37
        const auto& [left_col, left_const] =
58
37
                unpack_if_const(block.get_by_position(arguments[0]).column);
59
37
        const auto& [right_col, right_const] =
60
37
                unpack_if_const(block.get_by_position(arguments[1]).column);
61
62
37
        const auto* left_nullable = check_and_get_column<ColumnNullable>(left_col.get());
63
37
        const auto* right_nullable = check_and_get_column<ColumnNullable>(right_col.get());
64
65
37
        const IColumn* left_nested =
66
37
                left_nullable ? &left_nullable->get_nested_column() : left_col.get();
67
37
        const IColumn* right_nested =
68
37
                right_nullable ? &right_nullable->get_nested_column() : right_col.get();
69
70
37
        const auto* left_str_col = assert_cast<const ColumnString*>(left_nested);
71
37
        const auto* right_str_col = assert_cast<const ColumnString*>(right_nested);
72
73
37
        auto res_col = ResultColumnType::create(input_rows_count);
74
37
        auto& res_data = res_col->get_data();
75
76
37
        const NullMap* left_null_map =
77
37
                left_nullable ? &left_nullable->get_null_map_data() : nullptr;
78
37
        const NullMap* right_null_map =
79
37
                right_nullable ? &right_nullable->get_null_map_data() : nullptr;
80
37
        const bool has_nullable = left_null_map != nullptr || right_null_map != nullptr;
81
82
37
        if (!has_nullable) {
83
23
            if (left_const) {
84
3
                RETURN_IF_ERROR(scalar_vector(left_str_col->get_data_at(0).trim_tail_padding_zero(),
85
3
                                              *right_str_col, res_data));
86
20
            } else if (right_const) {
87
4
                RETURN_IF_ERROR(vector_scalar(
88
4
                        *left_str_col, right_str_col->get_data_at(0).trim_tail_padding_zero(),
89
4
                        res_data));
90
16
            } else {
91
16
                RETURN_IF_ERROR(vector_vector(*left_str_col, *right_str_col, res_data));
92
16
            }
93
15
            block.replace_by_position(result, std::move(res_col));
94
15
            return Status::OK();
95
23
        }
96
97
14
        auto null_col = ColumnUInt8::create(input_rows_count, 0);
98
14
        auto& null_map = null_col->get_data();
99
14
        if (left_const) {
100
3
            if (left_null_map && (*left_null_map)[0]) {
101
0
                std::fill(null_map.begin(), null_map.end(), 1);
102
0
                block.replace_by_position(
103
0
                        result, ColumnNullable::create(std::move(res_col), std::move(null_col)));
104
0
                return Status::OK();
105
0
            }
106
107
3
            const auto left = left_str_col->get_data_at(0).trim_tail_padding_zero();
108
3
            RETURN_IF_ERROR(scalar_vector_nullable(left, *right_str_col, right_null_map, res_data,
109
3
                                                   null_map));
110
11
        } else if (right_const) {
111
4
            if (right_null_map && (*right_null_map)[0]) {
112
0
                std::fill(null_map.begin(), null_map.end(), 1);
113
0
                block.replace_by_position(
114
0
                        result, ColumnNullable::create(std::move(res_col), std::move(null_col)));
115
0
                return Status::OK();
116
0
            }
117
118
4
            RETURN_IF_ERROR(vector_scalar_nullable(
119
4
                    *left_str_col, right_str_col->get_data_at(0).trim_tail_padding_zero(),
120
4
                    left_null_map, res_data, null_map));
121
7
        } else {
122
20
            for (size_t i = 0; i < input_rows_count; ++i) {
123
13
                const bool left_is_null = left_null_map && (*left_null_map)[i];
124
13
                const bool right_is_null = right_null_map && (*right_null_map)[i];
125
13
                if (left_is_null || right_is_null) {
126
7
                    null_map[i] = 1;
127
7
                    res_data[i] = 0;
128
7
                    continue;
129
7
                }
130
131
6
                RETURN_IF_ERROR(hamming_distance(
132
6
                        left_str_col->get_data_at(i).trim_tail_padding_zero(),
133
6
                        right_str_col->get_data_at(i).trim_tail_padding_zero(), res_data[i], i));
134
6
            }
135
7
        }
136
137
11
        block.replace_by_position(result,
138
11
                                  ColumnNullable::create(std::move(res_col), std::move(null_col)));
139
11
        return Status::OK();
140
14
    }
141
142
private:
143
    static Status vector_vector(const ColumnString& lcol, const ColumnString& rcol,
144
16
                                ResultPaddedPODArray& res) {
145
16
        DCHECK_EQ(lcol.size(), rcol.size());
146
147
16
        const size_t size = lcol.size();
148
16
        res.resize(size);
149
16
        std::vector<size_t> left_offsets;
150
16
        std::vector<size_t> right_offsets;
151
30
        for (size_t i = 0; i < size; ++i) {
152
19
            const auto left = lcol.get_data_at(i).trim_tail_padding_zero();
153
19
            const auto right = rcol.get_data_at(i).trim_tail_padding_zero();
154
19
            RETURN_IF_ERROR(hamming_distance_with_offsets(
155
19
                    left, left_offsets, false, simd::VStringFunctions::is_ascii(left), right,
156
19
                    right_offsets, false, simd::VStringFunctions::is_ascii(right), res[i], i));
157
19
        }
158
11
        return Status::OK();
159
16
    }
160
161
    static Status vector_scalar(const ColumnString& lcol, const StringRef& rdata,
162
4
                                ResultPaddedPODArray& res) {
163
4
        const size_t size = lcol.size();
164
4
        res.resize(size);
165
4
        const bool right_ascii = simd::VStringFunctions::is_ascii(rdata);
166
4
        std::vector<size_t> right_offsets;
167
4
        simd::VStringFunctions::get_utf8_char_offsets(rdata, right_offsets);
168
4
        std::vector<size_t> left_offsets;
169
6
        for (size_t i = 0; i < size; ++i) {
170
4
            const auto left = lcol.get_data_at(i).trim_tail_padding_zero();
171
4
            RETURN_IF_ERROR(hamming_distance_with_offsets(
172
4
                    left, left_offsets, false, simd::VStringFunctions::is_ascii(left), rdata,
173
4
                    right_offsets, true, right_ascii, res[i], i));
174
4
        }
175
2
        return Status::OK();
176
4
    }
177
178
    static Status scalar_vector(const StringRef& ldata, const ColumnString& rcol,
179
3
                                ResultPaddedPODArray& res) {
180
3
        const size_t size = rcol.size();
181
3
        res.resize(size);
182
3
        const bool left_ascii = simd::VStringFunctions::is_ascii(ldata);
183
3
        std::vector<size_t> left_offsets;
184
3
        simd::VStringFunctions::get_utf8_char_offsets(ldata, left_offsets);
185
3
        std::vector<size_t> right_offsets;
186
5
        for (size_t i = 0; i < size; ++i) {
187
3
            const auto right = rcol.get_data_at(i).trim_tail_padding_zero();
188
3
            RETURN_IF_ERROR(hamming_distance_with_offsets(
189
3
                    ldata, left_offsets, true, left_ascii, right, right_offsets, false,
190
3
                    simd::VStringFunctions::is_ascii(right), res[i], i));
191
3
        }
192
2
        return Status::OK();
193
3
    }
194
195
    static Status vector_scalar_nullable(const ColumnString& lcol, const StringRef& rdata,
196
                                         const NullMap* left_null_map, ResultPaddedPODArray& res,
197
4
                                         NullMap& null_map) {
198
4
        const size_t size = lcol.size();
199
4
        res.resize(size);
200
4
        const bool right_ascii = simd::VStringFunctions::is_ascii(rdata);
201
4
        std::vector<size_t> right_offsets;
202
4
        simd::VStringFunctions::get_utf8_char_offsets(rdata, right_offsets);
203
4
        std::vector<size_t> left_offsets;
204
8
        for (size_t i = 0; i < size; ++i) {
205
6
            if (left_null_map && (*left_null_map)[i]) {
206
1
                null_map[i] = 1;
207
1
                res[i] = 0;
208
1
                continue;
209
1
            }
210
211
5
            const auto left = lcol.get_data_at(i).trim_tail_padding_zero();
212
5
            RETURN_IF_ERROR(hamming_distance_with_offsets(
213
5
                    left, left_offsets, false, simd::VStringFunctions::is_ascii(left), rdata,
214
5
                    right_offsets, true, right_ascii, res[i], i));
215
5
        }
216
2
        return Status::OK();
217
4
    }
218
219
    static Status scalar_vector_nullable(const StringRef& ldata, const ColumnString& rcol,
220
                                         const NullMap* right_null_map, ResultPaddedPODArray& res,
221
3
                                         NullMap& null_map) {
222
3
        const size_t size = rcol.size();
223
3
        res.resize(size);
224
3
        const bool left_ascii = simd::VStringFunctions::is_ascii(ldata);
225
3
        std::vector<size_t> left_offsets;
226
3
        simd::VStringFunctions::get_utf8_char_offsets(ldata, left_offsets);
227
3
        std::vector<size_t> right_offsets;
228
7
        for (size_t i = 0; i < size; ++i) {
229
5
            if (right_null_map && (*right_null_map)[i]) {
230
1
                null_map[i] = 1;
231
1
                res[i] = 0;
232
1
                continue;
233
1
            }
234
235
4
            const auto right = rcol.get_data_at(i).trim_tail_padding_zero();
236
4
            RETURN_IF_ERROR(hamming_distance_with_offsets(
237
4
                    ldata, left_offsets, true, left_ascii, right, right_offsets, false,
238
4
                    simd::VStringFunctions::is_ascii(right), res[i], i));
239
4
        }
240
2
        return Status::OK();
241
3
    }
242
243
    static Status hamming_distance_ascii(const StringRef& left, const StringRef& right,
244
24
                                         Int64& result, size_t row) {
245
24
        if (left.size != right.size) {
246
7
            return Status::InvalidArgument(
247
7
                    "hamming_distance requires strings of the same length at row {}", row);
248
7
        }
249
250
17
        Int64 distance = 0;
251
69
        for (size_t i = 0; i < left.size; ++i) {
252
52
            distance += static_cast<Int64>(left.data[i] != right.data[i]);
253
52
        }
254
17
        result = distance;
255
17
        return Status::OK();
256
24
    }
257
258
    static Status hamming_distance_utf8(const StringRef& left,
259
                                        const std::vector<size_t>& left_offsets,
260
                                        const StringRef& right,
261
                                        const std::vector<size_t>& right_offsets, Int64& result,
262
17
                                        size_t row) {
263
17
        if (left_offsets.size() != right_offsets.size()) {
264
4
            return Status::InvalidArgument(
265
4
                    "hamming_distance requires strings of the same length at row {}", row);
266
4
        }
267
268
13
        Int64 distance = 0;
269
13
        const size_t len = left_offsets.size();
270
31
        for (size_t i = 0; i + 1 < len; ++i) {
271
18
            const size_t left_off = left_offsets[i];
272
18
            const size_t left_next = left_offsets[i + 1];
273
18
            const size_t right_off = right_offsets[i];
274
18
            const size_t right_next = right_offsets[i + 1];
275
18
            distance += static_cast<Int64>(!simd::VStringFunctions::utf8_char_equal(
276
18
                    left, left_off, left_next, right, right_off, right_next));
277
18
        }
278
13
        if (len > 0) {
279
13
            const size_t left_off = left_offsets[len - 1];
280
13
            const size_t right_off = right_offsets[len - 1];
281
13
            distance += static_cast<Int64>(!simd::VStringFunctions::utf8_char_equal(
282
13
                    left, left_off, left.size, right, right_off, right.size));
283
13
        }
284
285
13
        result = distance;
286
13
        return Status::OK();
287
17
    }
288
289
    static Status hamming_distance_with_offsets(
290
            const StringRef& left, std::vector<size_t>& left_offsets, bool left_offsets_ready,
291
            bool left_ascii, const StringRef& right, std::vector<size_t>& right_offsets,
292
41
            bool right_offsets_ready, bool right_ascii, Int64& result, size_t row) {
293
41
        if (left_ascii && right_ascii) {
294
24
            return hamming_distance_ascii(left, right, result, row);
295
24
        }
296
297
17
        if (!left_offsets_ready) {
298
15
            simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
299
15
        }
300
17
        if (!right_offsets_ready) {
301
13
            simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
302
13
        }
303
17
        return hamming_distance_utf8(left, left_offsets, right, right_offsets, result, row);
304
41
    }
305
306
    static Status hamming_distance(const StringRef& left, const StringRef& right, Int64& result,
307
6
                                   size_t row) {
308
6
        std::vector<size_t> left_offsets;
309
6
        std::vector<size_t> right_offsets;
310
6
        return hamming_distance_with_offsets(
311
6
                left, left_offsets, false, simd::VStringFunctions::is_ascii(left), right,
312
6
                right_offsets, false, simd::VStringFunctions::is_ascii(right), result, row);
313
6
    }
314
};
315
316
8
void register_function_hamming_distance(SimpleFunctionFactory& factory) {
317
8
    factory.register_function<FunctionHammingDistance>();
318
8
}
319
320
} // namespace doris