Coverage Report

Created: 2026-04-09 05:17

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_hamming_distance.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <algorithm>
19
#include <cstring>
20
#include <vector>
21
22
#include "common/status.h"
23
#include "core/column/column_nullable.h"
24
#include "core/column/column_string.h"
25
#include "core/data_type/data_type_number.h"
26
#include "core/string_ref.h"
27
#include "exprs/function/simple_function_factory.h"
28
#include "util/simd/vstring_function.h"
29
30
namespace doris {
31
#include "common/compile_check_begin.h"
32
33
class FunctionHammingDistance : public IFunction {
34
public:
35
    using ResultDataType = DataTypeInt64;
36
    using ResultPaddedPODArray = PaddedPODArray<Int64>;
37
    using ResultColumnType = ColumnVector<ResultDataType::PType>;
38
39
    static constexpr auto name = "hamming_distance";
40
41
2
    static FunctionPtr create() { return std::make_shared<FunctionHammingDistance>(); }
42
43
1
    String get_name() const override { return name; }
44
0
    size_t get_number_of_arguments() const override { return 2; }
45
46
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
47
0
        const bool has_nullable = std::ranges::any_of(
48
0
                arguments, [](const DataTypePtr& type) { return type->is_nullable(); });
49
0
        if (has_nullable) {
50
0
            return make_nullable(std::make_shared<ResultDataType>());
51
0
        }
52
0
        return std::make_shared<ResultDataType>();
53
0
    }
54
55
0
    bool use_default_implementation_for_nulls() const override { return false; }
56
57
    Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments,
58
0
                        uint32_t result, size_t input_rows_count) const override {
59
0
        const auto& [left_col, left_const] =
60
0
                unpack_if_const(block.get_by_position(arguments[0]).column);
61
0
        const auto& [right_col, right_const] =
62
0
                unpack_if_const(block.get_by_position(arguments[1]).column);
63
64
0
        const auto* left_nullable = check_and_get_column<ColumnNullable>(left_col.get());
65
0
        const auto* right_nullable = check_and_get_column<ColumnNullable>(right_col.get());
66
67
0
        const IColumn* left_nested =
68
0
                left_nullable ? &left_nullable->get_nested_column() : left_col.get();
69
0
        const IColumn* right_nested =
70
0
                right_nullable ? &right_nullable->get_nested_column() : right_col.get();
71
72
0
        const auto* left_str_col = check_and_get_column<ColumnString>(left_nested);
73
0
        const auto* right_str_col = check_and_get_column<ColumnString>(right_nested);
74
0
        if (!left_str_col || !right_str_col) {
75
0
            return Status::NotSupported("Illegal columns {}, {} of argument of function {}",
76
0
                                        left_col->get_name(), right_col->get_name(), get_name());
77
0
        }
78
79
0
        auto res_col = ResultColumnType::create(input_rows_count);
80
0
        auto& res_data = res_col->get_data();
81
82
0
        const NullMap* left_null_map =
83
0
                left_nullable ? &left_nullable->get_null_map_data() : nullptr;
84
0
        const NullMap* right_null_map =
85
0
                right_nullable ? &right_nullable->get_null_map_data() : nullptr;
86
0
        const bool has_nullable = left_null_map != nullptr || right_null_map != nullptr;
87
88
0
        if (!has_nullable) {
89
0
            if (left_const) {
90
0
                RETURN_IF_ERROR(scalar_vector(left_str_col->get_data_at(0).trim_tail_padding_zero(),
91
0
                                              *right_str_col, res_data));
92
0
            } else if (right_const) {
93
0
                RETURN_IF_ERROR(vector_scalar(
94
0
                        *left_str_col, right_str_col->get_data_at(0).trim_tail_padding_zero(),
95
0
                        res_data));
96
0
            } else {
97
0
                RETURN_IF_ERROR(vector_vector(*left_str_col, *right_str_col, res_data));
98
0
            }
99
0
            block.replace_by_position(result, std::move(res_col));
100
0
            return Status::OK();
101
0
        }
102
103
0
        auto null_col = ColumnUInt8::create(input_rows_count, 0);
104
0
        auto& null_map = null_col->get_data();
105
0
        for (size_t i = 0; i < input_rows_count; ++i) {
106
0
            const size_t left_idx = left_const ? 0 : i;
107
0
            const size_t right_idx = right_const ? 0 : i;
108
109
0
            const bool left_is_null = left_null_map && (*left_null_map)[left_idx];
110
0
            const bool right_is_null = right_null_map && (*right_null_map)[right_idx];
111
0
            if (left_is_null || right_is_null) {
112
0
                null_map[i] = 1;
113
0
                res_data[i] = 0;
114
0
                continue;
115
0
            }
116
117
0
            RETURN_IF_ERROR(
118
0
                    hamming_distance(left_str_col->get_data_at(left_idx).trim_tail_padding_zero(),
119
0
                                     right_str_col->get_data_at(right_idx).trim_tail_padding_zero(),
120
0
                                     res_data[i], i));
121
0
        }
122
123
0
        block.replace_by_position(result,
124
0
                                  ColumnNullable::create(std::move(res_col), std::move(null_col)));
125
0
        return Status::OK();
126
0
    }
127
128
private:
129
    static Status vector_vector(const ColumnString& lcol, const ColumnString& rcol,
130
0
                                ResultPaddedPODArray& res) {
131
0
        DCHECK_EQ(lcol.size(), rcol.size());
132
133
0
        const size_t size = lcol.size();
134
0
        res.resize(size);
135
0
        std::vector<size_t> left_offsets;
136
0
        std::vector<size_t> right_offsets;
137
0
        for (size_t i = 0; i < size; ++i) {
138
0
            RETURN_IF_ERROR(hamming_distance(lcol.get_data_at(i).trim_tail_padding_zero(),
139
0
                                             rcol.get_data_at(i).trim_tail_padding_zero(),
140
0
                                             left_offsets, right_offsets, res[i], i));
141
0
        }
142
0
        return Status::OK();
143
0
    }
144
145
    static Status vector_scalar(const ColumnString& lcol, const StringRef& rdata,
146
0
                                ResultPaddedPODArray& res) {
147
0
        const size_t size = lcol.size();
148
0
        res.resize(size);
149
0
        const bool right_ascii = simd::VStringFunctions::is_ascii(rdata);
150
0
        std::vector<size_t> right_offsets;
151
0
        utf8_char_offsets(rdata, right_offsets);
152
0
        std::vector<size_t> left_offsets;
153
0
        for (size_t i = 0; i < size; ++i) {
154
0
            const auto left = lcol.get_data_at(i).trim_tail_padding_zero();
155
0
            RETURN_IF_ERROR(hamming_distance_with_offsets(
156
0
                    left, left_offsets, false, simd::VStringFunctions::is_ascii(left), rdata,
157
0
                    right_offsets, true, right_ascii, res[i], i));
158
0
        }
159
0
        return Status::OK();
160
0
    }
161
162
    static Status scalar_vector(const StringRef& ldata, const ColumnString& rcol,
163
0
                                ResultPaddedPODArray& res) {
164
0
        const size_t size = rcol.size();
165
0
        res.resize(size);
166
0
        const bool left_ascii = simd::VStringFunctions::is_ascii(ldata);
167
0
        std::vector<size_t> left_offsets;
168
0
        utf8_char_offsets(ldata, left_offsets);
169
0
        std::vector<size_t> right_offsets;
170
0
        for (size_t i = 0; i < size; ++i) {
171
0
            const auto right = rcol.get_data_at(i).trim_tail_padding_zero();
172
0
            RETURN_IF_ERROR(hamming_distance_with_offsets(
173
0
                    ldata, left_offsets, true, left_ascii, right, right_offsets, false,
174
0
                    simd::VStringFunctions::is_ascii(right), res[i], i));
175
0
        }
176
0
        return Status::OK();
177
0
    }
178
179
0
    static void utf8_char_offsets(const StringRef& ref, std::vector<size_t>& offsets) {
180
0
        offsets.clear();
181
0
        offsets.reserve(ref.size);
182
0
        simd::VStringFunctions::get_char_len(ref.data, ref.size, offsets);
183
0
    }
184
185
    static bool utf8_char_equal(const StringRef& left, size_t left_off, size_t left_next,
186
0
                                const StringRef& right, size_t right_off, size_t right_next) {
187
0
        const size_t left_len = left_next - left_off;
188
0
        const size_t right_len = right_next - right_off;
189
0
        return left_len == right_len &&
190
0
               std::memcmp(left.data + left_off, right.data + right_off, left_len) == 0;
191
0
    }
192
193
    static Status hamming_distance_ascii(const StringRef& left, const StringRef& right,
194
0
                                         Int64& result, size_t row) {
195
0
        if (left.size != right.size) {
196
0
            return Status::InvalidArgument(
197
0
                    "hamming_distance requires strings of the same length at row {}", row);
198
0
        }
199
200
0
        Int64 distance = 0;
201
0
        for (size_t i = 0; i < left.size; ++i) {
202
0
            distance += static_cast<Int64>(left.data[i] != right.data[i]);
203
0
        }
204
0
        result = distance;
205
0
        return Status::OK();
206
0
    }
207
208
    static Status hamming_distance_utf8(const StringRef& left,
209
                                        const std::vector<size_t>& left_offsets,
210
                                        const StringRef& right,
211
                                        const std::vector<size_t>& right_offsets, Int64& result,
212
0
                                        size_t row) {
213
0
        if (left_offsets.size() != right_offsets.size()) {
214
0
            return Status::InvalidArgument(
215
0
                    "hamming_distance requires strings of the same length at row {}", row);
216
0
        }
217
218
0
        Int64 distance = 0;
219
0
        const size_t len = left_offsets.size();
220
0
        for (size_t i = 0; i + 1 < len; ++i) {
221
0
            const size_t left_off = left_offsets[i];
222
0
            const size_t left_next = left_offsets[i + 1];
223
0
            const size_t right_off = right_offsets[i];
224
0
            const size_t right_next = right_offsets[i + 1];
225
0
            distance += static_cast<Int64>(
226
0
                    !utf8_char_equal(left, left_off, left_next, right, right_off, right_next));
227
0
        }
228
0
        if (len > 0) {
229
0
            const size_t left_off = left_offsets[len - 1];
230
0
            const size_t right_off = right_offsets[len - 1];
231
0
            distance += static_cast<Int64>(
232
0
                    !utf8_char_equal(left, left_off, left.size, right, right_off, right.size));
233
0
        }
234
235
0
        result = distance;
236
0
        return Status::OK();
237
0
    }
238
239
    static Status hamming_distance(const StringRef& left, const StringRef& right,
240
                                   std::vector<size_t>& left_offsets,
241
0
                                   std::vector<size_t>& right_offsets, Int64& result, size_t row) {
242
0
        const bool left_ascii = simd::VStringFunctions::is_ascii(left);
243
0
        const bool right_ascii = simd::VStringFunctions::is_ascii(right);
244
0
        return hamming_distance_with_offsets(left, left_offsets, false, left_ascii, right,
245
0
                                             right_offsets, false, right_ascii, result, row);
246
0
    }
247
248
    static Status hamming_distance_with_offsets(
249
            const StringRef& left, std::vector<size_t>& left_offsets, bool left_offsets_ready,
250
            bool left_ascii, const StringRef& right, std::vector<size_t>& right_offsets,
251
0
            bool right_offsets_ready, bool right_ascii, Int64& result, size_t row) {
252
0
        if (left_ascii && right_ascii) {
253
0
            return hamming_distance_ascii(left, right, result, row);
254
0
        }
255
256
0
        if (!left_offsets_ready) {
257
0
            utf8_char_offsets(left, left_offsets);
258
0
        }
259
0
        if (!right_offsets_ready) {
260
0
            utf8_char_offsets(right, right_offsets);
261
0
        }
262
0
        return hamming_distance_utf8(left, left_offsets, right, right_offsets, result, row);
263
0
    }
264
265
    static Status hamming_distance(const StringRef& left, const StringRef& right, Int64& result,
266
0
                                   size_t row) {
267
0
        std::vector<size_t> left_offsets;
268
0
        std::vector<size_t> right_offsets;
269
0
        return hamming_distance(left, right, left_offsets, right_offsets, result, row);
270
0
    }
271
};
272
273
1
void register_function_hamming_distance(SimpleFunctionFactory& factory) {
274
1
    factory.register_function<FunctionHammingDistance>();
275
1
}
276
277
#include "common/compile_check_end.h"
278
} // namespace doris