Coverage Report

Created: 2026-05-29 17:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_hamming_distance.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <algorithm>
19
#include <vector>
20
21
#include "common/status.h"
22
#include "core/column/column_nullable.h"
23
#include "core/column/column_string.h"
24
#include "core/data_type/data_type_number.h"
25
#include "core/string_ref.h"
26
#include "exprs/function/simple_function_factory.h"
27
#include "util/simd/vstring_function.h"
28
29
namespace doris {
30
31
class FunctionHammingDistance : public IFunction {
32
public:
33
    using ResultDataType = DataTypeInt64;
34
    using ResultPaddedPODArray = PaddedPODArray<Int64>;
35
    using ResultColumnType = ColumnVector<ResultDataType::PType>;
36
37
    static constexpr auto name = "hamming_distance";
38
39
2
    static FunctionPtr create() { return std::make_shared<FunctionHammingDistance>(); }
40
41
1
    String get_name() const override { return name; }
42
0
    size_t get_number_of_arguments() const override { return 2; }
43
44
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
45
0
        const bool has_nullable = std::ranges::any_of(
46
0
                arguments, [](const DataTypePtr& type) { return type->is_nullable(); });
47
0
        if (has_nullable) {
48
0
            return make_nullable(std::make_shared<ResultDataType>());
49
0
        }
50
0
        return std::make_shared<ResultDataType>();
51
0
    }
52
53
0
    bool use_default_implementation_for_nulls() const override { return false; }
54
55
    Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments,
56
0
                        uint32_t result, size_t input_rows_count) const override {
57
0
        const auto& [left_col, left_const] =
58
0
                unpack_if_const(block.get_by_position(arguments[0]).column);
59
0
        const auto& [right_col, right_const] =
60
0
                unpack_if_const(block.get_by_position(arguments[1]).column);
61
62
0
        const auto* left_nullable = check_and_get_column<ColumnNullable>(left_col.get());
63
0
        const auto* right_nullable = check_and_get_column<ColumnNullable>(right_col.get());
64
65
0
        const IColumn* left_nested =
66
0
                left_nullable ? &left_nullable->get_nested_column() : left_col.get();
67
0
        const IColumn* right_nested =
68
0
                right_nullable ? &right_nullable->get_nested_column() : right_col.get();
69
70
0
        const auto* left_str_col = assert_cast<const ColumnString*>(left_nested);
71
0
        const auto* right_str_col = assert_cast<const ColumnString*>(right_nested);
72
73
0
        auto res_col = ResultColumnType::create(input_rows_count);
74
0
        auto& res_data = res_col->get_data();
75
76
0
        const NullMap* left_null_map =
77
0
                left_nullable ? &left_nullable->get_null_map_data() : nullptr;
78
0
        const NullMap* right_null_map =
79
0
                right_nullable ? &right_nullable->get_null_map_data() : nullptr;
80
0
        const bool has_nullable = left_null_map != nullptr || right_null_map != nullptr;
81
82
0
        if (!has_nullable) {
83
0
            if (left_const) {
84
0
                RETURN_IF_ERROR(scalar_vector(left_str_col->get_data_at(0).trim_tail_padding_zero(),
85
0
                                              *right_str_col, res_data));
86
0
            } else if (right_const) {
87
0
                RETURN_IF_ERROR(vector_scalar(
88
0
                        *left_str_col, right_str_col->get_data_at(0).trim_tail_padding_zero(),
89
0
                        res_data));
90
0
            } else {
91
0
                RETURN_IF_ERROR(vector_vector(*left_str_col, *right_str_col, res_data));
92
0
            }
93
0
            block.replace_by_position(result, std::move(res_col));
94
0
            return Status::OK();
95
0
        }
96
97
0
        auto null_col = ColumnUInt8::create(input_rows_count, 0);
98
0
        auto& null_map = null_col->get_data();
99
0
        if (left_const) {
100
0
            if (left_null_map && (*left_null_map)[0]) {
101
0
                std::fill(null_map.begin(), null_map.end(), 1);
102
0
                block.replace_by_position(
103
0
                        result, ColumnNullable::create(std::move(res_col), std::move(null_col)));
104
0
                return Status::OK();
105
0
            }
106
107
0
            const auto left = left_str_col->get_data_at(0).trim_tail_padding_zero();
108
0
            RETURN_IF_ERROR(scalar_vector_nullable(left, *right_str_col, right_null_map, res_data,
109
0
                                                   null_map));
110
0
        } else if (right_const) {
111
0
            if (right_null_map && (*right_null_map)[0]) {
112
0
                std::fill(null_map.begin(), null_map.end(), 1);
113
0
                block.replace_by_position(
114
0
                        result, ColumnNullable::create(std::move(res_col), std::move(null_col)));
115
0
                return Status::OK();
116
0
            }
117
118
0
            RETURN_IF_ERROR(vector_scalar_nullable(
119
0
                    *left_str_col, right_str_col->get_data_at(0).trim_tail_padding_zero(),
120
0
                    left_null_map, res_data, null_map));
121
0
        } else {
122
0
            for (size_t i = 0; i < input_rows_count; ++i) {
123
0
                const bool left_is_null = left_null_map && (*left_null_map)[i];
124
0
                const bool right_is_null = right_null_map && (*right_null_map)[i];
125
0
                if (left_is_null || right_is_null) {
126
0
                    null_map[i] = 1;
127
0
                    res_data[i] = 0;
128
0
                    continue;
129
0
                }
130
131
0
                RETURN_IF_ERROR(hamming_distance(
132
0
                        left_str_col->get_data_at(i).trim_tail_padding_zero(),
133
0
                        right_str_col->get_data_at(i).trim_tail_padding_zero(), res_data[i], i));
134
0
            }
135
0
        }
136
137
0
        block.replace_by_position(result,
138
0
                                  ColumnNullable::create(std::move(res_col), std::move(null_col)));
139
0
        return Status::OK();
140
0
    }
141
142
private:
143
    static Status vector_vector(const ColumnString& lcol, const ColumnString& rcol,
144
0
                                ResultPaddedPODArray& res) {
145
0
        DCHECK_EQ(lcol.size(), rcol.size());
146
147
0
        const size_t size = lcol.size();
148
0
        res.resize(size);
149
0
        std::vector<size_t> left_offsets;
150
0
        std::vector<size_t> right_offsets;
151
0
        for (size_t i = 0; i < size; ++i) {
152
0
            const auto left = lcol.get_data_at(i).trim_tail_padding_zero();
153
0
            const auto right = rcol.get_data_at(i).trim_tail_padding_zero();
154
0
            RETURN_IF_ERROR(hamming_distance_with_offsets(
155
0
                    left, left_offsets, false, simd::VStringFunctions::is_ascii(left), right,
156
0
                    right_offsets, false, simd::VStringFunctions::is_ascii(right), res[i], i));
157
0
        }
158
0
        return Status::OK();
159
0
    }
160
161
    static Status vector_scalar(const ColumnString& lcol, const StringRef& rdata,
162
0
                                ResultPaddedPODArray& res) {
163
0
        const size_t size = lcol.size();
164
0
        res.resize(size);
165
0
        const bool right_ascii = simd::VStringFunctions::is_ascii(rdata);
166
0
        std::vector<size_t> right_offsets;
167
0
        simd::VStringFunctions::get_utf8_char_offsets(rdata, right_offsets);
168
0
        std::vector<size_t> left_offsets;
169
0
        for (size_t i = 0; i < size; ++i) {
170
0
            const auto left = lcol.get_data_at(i).trim_tail_padding_zero();
171
0
            RETURN_IF_ERROR(hamming_distance_with_offsets(
172
0
                    left, left_offsets, false, simd::VStringFunctions::is_ascii(left), rdata,
173
0
                    right_offsets, true, right_ascii, res[i], i));
174
0
        }
175
0
        return Status::OK();
176
0
    }
177
178
    static Status scalar_vector(const StringRef& ldata, const ColumnString& rcol,
179
0
                                ResultPaddedPODArray& res) {
180
0
        const size_t size = rcol.size();
181
0
        res.resize(size);
182
0
        const bool left_ascii = simd::VStringFunctions::is_ascii(ldata);
183
0
        std::vector<size_t> left_offsets;
184
0
        simd::VStringFunctions::get_utf8_char_offsets(ldata, left_offsets);
185
0
        std::vector<size_t> right_offsets;
186
0
        for (size_t i = 0; i < size; ++i) {
187
0
            const auto right = rcol.get_data_at(i).trim_tail_padding_zero();
188
0
            RETURN_IF_ERROR(hamming_distance_with_offsets(
189
0
                    ldata, left_offsets, true, left_ascii, right, right_offsets, false,
190
0
                    simd::VStringFunctions::is_ascii(right), res[i], i));
191
0
        }
192
0
        return Status::OK();
193
0
    }
194
195
    static Status vector_scalar_nullable(const ColumnString& lcol, const StringRef& rdata,
196
                                         const NullMap* left_null_map, ResultPaddedPODArray& res,
197
0
                                         NullMap& null_map) {
198
0
        const size_t size = lcol.size();
199
0
        res.resize(size);
200
0
        const bool right_ascii = simd::VStringFunctions::is_ascii(rdata);
201
0
        std::vector<size_t> right_offsets;
202
0
        simd::VStringFunctions::get_utf8_char_offsets(rdata, right_offsets);
203
0
        std::vector<size_t> left_offsets;
204
0
        for (size_t i = 0; i < size; ++i) {
205
0
            if (left_null_map && (*left_null_map)[i]) {
206
0
                null_map[i] = 1;
207
0
                res[i] = 0;
208
0
                continue;
209
0
            }
210
211
0
            const auto left = lcol.get_data_at(i).trim_tail_padding_zero();
212
0
            RETURN_IF_ERROR(hamming_distance_with_offsets(
213
0
                    left, left_offsets, false, simd::VStringFunctions::is_ascii(left), rdata,
214
0
                    right_offsets, true, right_ascii, res[i], i));
215
0
        }
216
0
        return Status::OK();
217
0
    }
218
219
    static Status scalar_vector_nullable(const StringRef& ldata, const ColumnString& rcol,
220
                                         const NullMap* right_null_map, ResultPaddedPODArray& res,
221
0
                                         NullMap& null_map) {
222
0
        const size_t size = rcol.size();
223
0
        res.resize(size);
224
0
        const bool left_ascii = simd::VStringFunctions::is_ascii(ldata);
225
0
        std::vector<size_t> left_offsets;
226
0
        simd::VStringFunctions::get_utf8_char_offsets(ldata, left_offsets);
227
0
        std::vector<size_t> right_offsets;
228
0
        for (size_t i = 0; i < size; ++i) {
229
0
            if (right_null_map && (*right_null_map)[i]) {
230
0
                null_map[i] = 1;
231
0
                res[i] = 0;
232
0
                continue;
233
0
            }
234
235
0
            const auto right = rcol.get_data_at(i).trim_tail_padding_zero();
236
0
            RETURN_IF_ERROR(hamming_distance_with_offsets(
237
0
                    ldata, left_offsets, true, left_ascii, right, right_offsets, false,
238
0
                    simd::VStringFunctions::is_ascii(right), res[i], i));
239
0
        }
240
0
        return Status::OK();
241
0
    }
242
243
    static Status hamming_distance_ascii(const StringRef& left, const StringRef& right,
244
0
                                         Int64& result, size_t row) {
245
0
        if (left.size != right.size) {
246
0
            return Status::InvalidArgument(
247
0
                    "hamming_distance requires strings of the same length at row {}", row);
248
0
        }
249
250
0
        Int64 distance = 0;
251
0
        for (size_t i = 0; i < left.size; ++i) {
252
0
            distance += static_cast<Int64>(left.data[i] != right.data[i]);
253
0
        }
254
0
        result = distance;
255
0
        return Status::OK();
256
0
    }
257
258
    static Status hamming_distance_utf8(const StringRef& left,
259
                                        const std::vector<size_t>& left_offsets,
260
                                        const StringRef& right,
261
                                        const std::vector<size_t>& right_offsets, Int64& result,
262
0
                                        size_t row) {
263
0
        if (left_offsets.size() != right_offsets.size()) {
264
0
            return Status::InvalidArgument(
265
0
                    "hamming_distance requires strings of the same length at row {}", row);
266
0
        }
267
268
0
        Int64 distance = 0;
269
0
        const size_t len = left_offsets.size();
270
0
        for (size_t i = 0; i + 1 < len; ++i) {
271
0
            const size_t left_off = left_offsets[i];
272
0
            const size_t left_next = left_offsets[i + 1];
273
0
            const size_t right_off = right_offsets[i];
274
0
            const size_t right_next = right_offsets[i + 1];
275
0
            distance += static_cast<Int64>(!simd::VStringFunctions::utf8_char_equal(
276
0
                    left, left_off, left_next, right, right_off, right_next));
277
0
        }
278
0
        if (len > 0) {
279
0
            const size_t left_off = left_offsets[len - 1];
280
0
            const size_t right_off = right_offsets[len - 1];
281
0
            distance += static_cast<Int64>(!simd::VStringFunctions::utf8_char_equal(
282
0
                    left, left_off, left.size, right, right_off, right.size));
283
0
        }
284
285
0
        result = distance;
286
0
        return Status::OK();
287
0
    }
288
289
    static Status hamming_distance_with_offsets(
290
            const StringRef& left, std::vector<size_t>& left_offsets, bool left_offsets_ready,
291
            bool left_ascii, const StringRef& right, std::vector<size_t>& right_offsets,
292
0
            bool right_offsets_ready, bool right_ascii, Int64& result, size_t row) {
293
0
        if (left_ascii && right_ascii) {
294
0
            return hamming_distance_ascii(left, right, result, row);
295
0
        }
296
297
0
        if (!left_offsets_ready) {
298
0
            simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
299
0
        }
300
0
        if (!right_offsets_ready) {
301
0
            simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
302
0
        }
303
0
        return hamming_distance_utf8(left, left_offsets, right, right_offsets, result, row);
304
0
    }
305
306
    static Status hamming_distance(const StringRef& left, const StringRef& right, Int64& result,
307
0
                                   size_t row) {
308
0
        std::vector<size_t> left_offsets;
309
0
        std::vector<size_t> right_offsets;
310
0
        return hamming_distance_with_offsets(
311
0
                left, left_offsets, false, simd::VStringFunctions::is_ascii(left), right,
312
0
                right_offsets, false, simd::VStringFunctions::is_ascii(right), result, row);
313
0
    }
314
};
315
316
1
void register_function_hamming_distance(SimpleFunctionFactory& factory) {
317
1
    factory.register_function<FunctionHammingDistance>();
318
1
}
319
320
} // namespace doris