Coverage Report

Created: 2026-05-30 03:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_levenshtein.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <algorithm>
19
#include <vector>
20
21
#include "common/status.h"
22
#include "core/data_type/data_type_number.h"
23
#include "core/string_ref.h"
24
#include "exprs/function/function_totype.h"
25
#include "exprs/function/simple_function_factory.h"
26
#include "util/simd/vstring_function.h"
27
28
namespace doris {
29
30
struct NameLevenshtein {
31
    static constexpr auto name = "levenshtein";
32
};
33
34
template <typename LeftDataType, typename RightDataType>
35
struct LevenshteinImpl {
36
    using ResultDataType = DataTypeInt32;
37
    using ResultPaddedPODArray = PaddedPODArray<Int32>;
38
39
    static Status vector_vector(const ColumnString::Chars& ldata,
40
                                const ColumnString::Offsets& loffsets,
41
                                const ColumnString::Chars& rdata,
42
0
                                const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) {
43
0
        DCHECK_EQ(loffsets.size(), roffsets.size());
44
45
0
        const size_t size = loffsets.size();
46
0
        res.resize(size);
47
0
        std::vector<size_t> left_offsets;
48
0
        std::vector<size_t> right_offsets;
49
0
        for (size_t i = 0; i < size; ++i) {
50
0
            res[i] = levenshtein_distance(string_ref_at(ldata, loffsets, i),
51
0
                                          string_ref_at(rdata, roffsets, i), left_offsets,
52
0
                                          right_offsets);
53
0
        }
54
0
        return Status::OK();
55
0
    }
56
57
    static Status vector_scalar(const ColumnString::Chars& ldata,
58
                                const ColumnString::Offsets& loffsets, const StringRef& rdata,
59
0
                                ResultPaddedPODArray& res) {
60
0
        const size_t size = loffsets.size();
61
0
        res.resize(size);
62
0
        const auto right = rdata.trim_tail_padding_zero();
63
0
        const bool right_ascii = simd::VStringFunctions::is_ascii(right);
64
0
        std::vector<size_t> right_offsets;
65
0
        simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
66
0
        std::vector<size_t> left_offsets;
67
0
        for (size_t i = 0; i < size; ++i) {
68
0
            res[i] = levenshtein_distance_with_right_offsets(string_ref_at(ldata, loffsets, i),
69
0
                                                             left_offsets, right, right_offsets,
70
0
                                                             right_ascii);
71
0
        }
72
0
        return Status::OK();
73
0
    }
74
75
    static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata,
76
0
                                const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) {
77
0
        const size_t size = roffsets.size();
78
0
        res.resize(size);
79
0
        const auto left = ldata.trim_tail_padding_zero();
80
0
        const bool left_ascii = simd::VStringFunctions::is_ascii(left);
81
0
        std::vector<size_t> left_offsets;
82
0
        simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
83
0
        std::vector<size_t> right_offsets;
84
0
        for (size_t i = 0; i < size; ++i) {
85
0
            res[i] = levenshtein_distance_with_left_offsets(left, left_offsets, left_ascii,
86
0
                                                            string_ref_at(rdata, roffsets, i),
87
0
                                                            right_offsets);
88
0
        }
89
0
        return Status::OK();
90
0
    }
91
92
private:
93
    static StringRef string_ref_at(const ColumnString::Chars& data,
94
0
                                   const ColumnString::Offsets& offsets, size_t i) {
95
0
        DCHECK_LT(i, offsets.size());
96
0
        const auto idx = static_cast<ssize_t>(i);
97
0
        return StringRef(data.data() + offsets[idx - 1], offsets[idx] - offsets[idx - 1])
98
0
                .trim_tail_padding_zero();
99
0
    }
100
101
    static Int32 levenshtein_distance_utf8(const StringRef& left,
102
                                           const std::vector<size_t>& left_offsets,
103
                                           const StringRef& right,
104
0
                                           const std::vector<size_t>& right_offsets) {
105
0
        const StringRef* left_ref = &left;
106
0
        const StringRef* right_ref = &right;
107
0
        const std::vector<size_t>* left_offsets_ref = &left_offsets;
108
0
        const std::vector<size_t>* right_offsets_ref = &right_offsets;
109
0
        if (right_offsets_ref->size() > left_offsets_ref->size()) {
110
0
            std::swap(left_offsets_ref, right_offsets_ref);
111
0
            std::swap(left_ref, right_ref);
112
0
        }
113
114
0
        const size_t m = left_offsets_ref->size();
115
0
        const size_t n = right_offsets_ref->size();
116
117
0
        std::vector<Int32> prev(n + 1);
118
0
        std::vector<Int32> curr(n + 1);
119
0
        for (size_t j = 0; j <= n; ++j) {
120
0
            prev[j] = static_cast<Int32>(j);
121
0
        }
122
123
0
        for (size_t i = 1; i <= m; ++i) {
124
0
            curr[0] = static_cast<Int32>(i);
125
0
            const size_t left_off = (*left_offsets_ref)[i - 1];
126
0
            const size_t left_next = i < m ? (*left_offsets_ref)[i] : left_ref->size;
127
128
0
            for (size_t j = 1; j <= n; ++j) {
129
0
                const size_t right_off = (*right_offsets_ref)[j - 1];
130
0
                const size_t right_next = j < n ? (*right_offsets_ref)[j] : right_ref->size;
131
132
0
                const Int32 cost =
133
0
                        simd::VStringFunctions::utf8_char_equal(*left_ref, left_off, left_next,
134
0
                                                                *right_ref, right_off, right_next)
135
0
                                ? 0
136
0
                                : 1;
137
138
0
                const Int32 insert_cost = curr[j - 1] + 1;
139
0
                const Int32 delete_cost = prev[j] + 1;
140
0
                const Int32 replace_cost = prev[j - 1] + cost;
141
0
                curr[j] = std::min(std::min(insert_cost, delete_cost), replace_cost);
142
0
            }
143
0
            std::swap(prev, curr);
144
0
        }
145
146
0
        return prev[n];
147
0
    }
148
149
0
    static Int32 levenshtein_distance_ascii(const StringRef& left, const StringRef& right) {
150
0
        const StringRef* left_ref = &left;
151
0
        const StringRef* right_ref = &right;
152
0
        size_t m = left.size;
153
0
        size_t n = right.size;
154
155
0
        if (n > m) {
156
0
            std::swap(left_ref, right_ref);
157
0
            std::swap(m, n);
158
0
        }
159
160
0
        std::vector<Int32> prev(n + 1);
161
0
        std::vector<Int32> curr(n + 1);
162
0
        for (size_t j = 0; j <= n; ++j) {
163
0
            prev[j] = static_cast<Int32>(j);
164
0
        }
165
166
0
        for (size_t i = 1; i <= m; ++i) {
167
0
            curr[0] = static_cast<Int32>(i);
168
0
            const char left_char = left_ref->data[i - 1];
169
170
0
            for (size_t j = 1; j <= n; ++j) {
171
0
                const Int32 cost = left_char == right_ref->data[j - 1] ? 0 : 1;
172
0
                const Int32 insert_cost = curr[j - 1] + 1;
173
0
                const Int32 delete_cost = prev[j] + 1;
174
0
                const Int32 replace_cost = prev[j - 1] + cost;
175
0
                curr[j] = std::min(std::min(insert_cost, delete_cost), replace_cost);
176
0
            }
177
0
            std::swap(prev, curr);
178
0
        }
179
180
0
        return prev[n];
181
0
    }
182
183
    static Int32 levenshtein_distance(const StringRef& left, const StringRef& right,
184
                                      std::vector<size_t>& left_offsets,
185
0
                                      std::vector<size_t>& right_offsets) {
186
0
        const bool left_ascii = simd::VStringFunctions::is_ascii(left);
187
0
        const bool right_ascii = simd::VStringFunctions::is_ascii(right);
188
0
        if (left_ascii && right_ascii) {
189
0
            return levenshtein_distance_ascii(left, right);
190
0
        }
191
192
0
        if (left.size == 0) {
193
0
            return static_cast<Int32>(simd::VStringFunctions::get_char_len(right.data, right.size));
194
0
        }
195
0
        if (right.size == 0) {
196
0
            return static_cast<Int32>(simd::VStringFunctions::get_char_len(left.data, left.size));
197
0
        }
198
199
0
        simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
200
0
        simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
201
0
        return levenshtein_distance_utf8(left, left_offsets, right, right_offsets);
202
0
    }
203
204
    static Int32 levenshtein_distance_with_right_offsets(const StringRef& left,
205
                                                         std::vector<size_t>& left_offsets,
206
                                                         const StringRef& right,
207
                                                         const std::vector<size_t>& right_offsets,
208
0
                                                         bool right_ascii) {
209
0
        const bool left_ascii = simd::VStringFunctions::is_ascii(left);
210
0
        if (left_ascii && right_ascii) {
211
0
            return levenshtein_distance_ascii(left, right);
212
0
        }
213
214
0
        if (left.size == 0) {
215
0
            return static_cast<Int32>(right_offsets.size());
216
0
        }
217
0
        if (right.size == 0) {
218
0
            return left_ascii ? static_cast<Int32>(left.size)
219
0
                              : static_cast<Int32>(
220
0
                                        simd::VStringFunctions::get_char_len(left.data, left.size));
221
0
        }
222
223
0
        simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
224
0
        return levenshtein_distance_utf8(left, left_offsets, right, right_offsets);
225
0
    }
226
227
    static Int32 levenshtein_distance_with_left_offsets(const StringRef& left,
228
                                                        const std::vector<size_t>& left_offsets,
229
                                                        bool left_ascii, const StringRef& right,
230
0
                                                        std::vector<size_t>& right_offsets) {
231
0
        const bool right_ascii = simd::VStringFunctions::is_ascii(right);
232
0
        if (left_ascii && right_ascii) {
233
0
            return levenshtein_distance_ascii(left, right);
234
0
        }
235
236
0
        if (left.size == 0) {
237
0
            return static_cast<Int32>(
238
0
                    right_ascii ? right.size
239
0
                                : simd::VStringFunctions::get_char_len(right.data, right.size));
240
0
        }
241
0
        if (right.size == 0) {
242
0
            return static_cast<Int32>(left_offsets.size());
243
0
        }
244
245
0
        simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
246
0
        return levenshtein_distance_utf8(left, left_offsets, right, right_offsets);
247
0
    }
248
249
    static Int32 levenshtein_distance(const StringRef& left, const StringRef& right) {
250
        std::vector<size_t> left_offsets;
251
        std::vector<size_t> right_offsets;
252
        return levenshtein_distance(left, right, left_offsets, right_offsets);
253
    }
254
};
255
256
using FunctionLevenshtein =
257
        FunctionBinaryToType<DataTypeString, DataTypeString, LevenshteinImpl, NameLevenshtein>;
258
259
1
void register_function_levenshtein(SimpleFunctionFactory& factory) {
260
1
    factory.register_function<FunctionLevenshtein>();
261
1
}
262
263
} // namespace doris