Coverage Report

Created: 2026-05-29 11:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_levenshtein.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <algorithm>
19
#include <vector>
20
21
#include "common/status.h"
22
#include "core/data_type/data_type_number.h"
23
#include "core/string_ref.h"
24
#include "exprs/function/function_totype.h"
25
#include "exprs/function/simple_function_factory.h"
26
#include "util/simd/vstring_function.h"
27
28
namespace doris {
29
30
struct NameLevenshtein {
31
    static constexpr auto name = "levenshtein";
32
};
33
34
template <typename LeftDataType, typename RightDataType>
35
struct LevenshteinImpl {
36
    using ResultDataType = DataTypeInt32;
37
    using ResultPaddedPODArray = PaddedPODArray<Int32>;
38
39
    static Status vector_vector(const ColumnString::Chars& ldata,
40
                                const ColumnString::Offsets& loffsets,
41
                                const ColumnString::Chars& rdata,
42
18
                                const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) {
43
18
        DCHECK_EQ(loffsets.size(), roffsets.size());
44
45
18
        const size_t size = loffsets.size();
46
18
        res.resize(size);
47
18
        std::vector<size_t> left_offsets;
48
18
        std::vector<size_t> right_offsets;
49
54
        for (size_t i = 0; i < size; ++i) {
50
36
            res[i] = levenshtein_distance(string_ref_at(ldata, loffsets, i),
51
36
                                          string_ref_at(rdata, roffsets, i), left_offsets,
52
36
                                          right_offsets);
53
36
        }
54
18
        return Status::OK();
55
18
    }
56
57
    static Status vector_scalar(const ColumnString::Chars& ldata,
58
                                const ColumnString::Offsets& loffsets, const StringRef& rdata,
59
8
                                ResultPaddedPODArray& res) {
60
8
        const size_t size = loffsets.size();
61
8
        res.resize(size);
62
8
        const auto right = rdata.trim_tail_padding_zero();
63
8
        const bool right_ascii = simd::VStringFunctions::is_ascii(right);
64
8
        std::vector<size_t> right_offsets;
65
8
        simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
66
8
        std::vector<size_t> left_offsets;
67
32
        for (size_t i = 0; i < size; ++i) {
68
24
            res[i] = levenshtein_distance_with_right_offsets(string_ref_at(ldata, loffsets, i),
69
24
                                                             left_offsets, right, right_offsets,
70
24
                                                             right_ascii);
71
24
        }
72
8
        return Status::OK();
73
8
    }
74
75
    static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata,
76
8
                                const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) {
77
8
        const size_t size = roffsets.size();
78
8
        res.resize(size);
79
8
        const auto left = ldata.trim_tail_padding_zero();
80
8
        const bool left_ascii = simd::VStringFunctions::is_ascii(left);
81
8
        std::vector<size_t> left_offsets;
82
8
        simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
83
8
        std::vector<size_t> right_offsets;
84
32
        for (size_t i = 0; i < size; ++i) {
85
24
            res[i] = levenshtein_distance_with_left_offsets(left, left_offsets, left_ascii,
86
24
                                                            string_ref_at(rdata, roffsets, i),
87
24
                                                            right_offsets);
88
24
        }
89
8
        return Status::OK();
90
8
    }
91
92
private:
93
    static StringRef string_ref_at(const ColumnString::Chars& data,
94
120
                                   const ColumnString::Offsets& offsets, size_t i) {
95
120
        DCHECK_LT(i, offsets.size());
96
120
        const auto idx = static_cast<ssize_t>(i);
97
120
        return StringRef(data.data() + offsets[idx - 1], offsets[idx] - offsets[idx - 1])
98
120
                .trim_tail_padding_zero();
99
120
    }
100
101
    static Int32 levenshtein_distance_utf8(const StringRef& left,
102
                                           const std::vector<size_t>& left_offsets,
103
                                           const StringRef& right,
104
23
                                           const std::vector<size_t>& right_offsets) {
105
23
        const StringRef* left_ref = &left;
106
23
        const StringRef* right_ref = &right;
107
23
        const std::vector<size_t>* left_offsets_ref = &left_offsets;
108
23
        const std::vector<size_t>* right_offsets_ref = &right_offsets;
109
23
        if (right_offsets_ref->size() > left_offsets_ref->size()) {
110
4
            std::swap(left_offsets_ref, right_offsets_ref);
111
4
            std::swap(left_ref, right_ref);
112
4
        }
113
114
23
        const size_t m = left_offsets_ref->size();
115
23
        const size_t n = right_offsets_ref->size();
116
117
23
        std::vector<Int32> prev(n + 1);
118
23
        std::vector<Int32> curr(n + 1);
119
102
        for (size_t j = 0; j <= n; ++j) {
120
79
            prev[j] = static_cast<Int32>(j);
121
79
        }
122
123
90
        for (size_t i = 1; i <= m; ++i) {
124
67
            curr[0] = static_cast<Int32>(i);
125
67
            const size_t left_off = (*left_offsets_ref)[i - 1];
126
67
            const size_t left_next = i < m ? (*left_offsets_ref)[i] : left_ref->size;
127
128
235
            for (size_t j = 1; j <= n; ++j) {
129
168
                const size_t right_off = (*right_offsets_ref)[j - 1];
130
168
                const size_t right_next = j < n ? (*right_offsets_ref)[j] : right_ref->size;
131
132
168
                const Int32 cost =
133
168
                        simd::VStringFunctions::utf8_char_equal(*left_ref, left_off, left_next,
134
168
                                                                *right_ref, right_off, right_next)
135
168
                                ? 0
136
168
                                : 1;
137
138
168
                const Int32 insert_cost = curr[j - 1] + 1;
139
168
                const Int32 delete_cost = prev[j] + 1;
140
168
                const Int32 replace_cost = prev[j - 1] + cost;
141
168
                curr[j] = std::min(std::min(insert_cost, delete_cost), replace_cost);
142
168
            }
143
67
            std::swap(prev, curr);
144
67
        }
145
146
23
        return prev[n];
147
23
    }
148
149
42
    static Int32 levenshtein_distance_ascii(const StringRef& left, const StringRef& right) {
150
42
        const StringRef* left_ref = &left;
151
42
        const StringRef* right_ref = &right;
152
42
        size_t m = left.size;
153
42
        size_t n = right.size;
154
155
42
        if (n > m) {
156
14
            std::swap(left_ref, right_ref);
157
14
            std::swap(m, n);
158
14
        }
159
160
42
        std::vector<Int32> prev(n + 1);
161
42
        std::vector<Int32> curr(n + 1);
162
169
        for (size_t j = 0; j <= n; ++j) {
163
127
            prev[j] = static_cast<Int32>(j);
164
127
        }
165
166
205
        for (size_t i = 1; i <= m; ++i) {
167
163
            curr[0] = static_cast<Int32>(i);
168
163
            const char left_char = left_ref->data[i - 1];
169
170
596
            for (size_t j = 1; j <= n; ++j) {
171
433
                const Int32 cost = left_char == right_ref->data[j - 1] ? 0 : 1;
172
433
                const Int32 insert_cost = curr[j - 1] + 1;
173
433
                const Int32 delete_cost = prev[j] + 1;
174
433
                const Int32 replace_cost = prev[j - 1] + cost;
175
433
                curr[j] = std::min(std::min(insert_cost, delete_cost), replace_cost);
176
433
            }
177
163
            std::swap(prev, curr);
178
163
        }
179
180
42
        return prev[n];
181
42
    }
182
183
    static Int32 levenshtein_distance(const StringRef& left, const StringRef& right,
184
                                      std::vector<size_t>& left_offsets,
185
36
                                      std::vector<size_t>& right_offsets) {
186
36
        const bool left_ascii = simd::VStringFunctions::is_ascii(left);
187
36
        const bool right_ascii = simd::VStringFunctions::is_ascii(right);
188
36
        if (left_ascii && right_ascii) {
189
18
            return levenshtein_distance_ascii(left, right);
190
18
        }
191
192
18
        if (left.size == 0) {
193
3
            return static_cast<Int32>(simd::VStringFunctions::get_char_len(right.data, right.size));
194
3
        }
195
15
        if (right.size == 0) {
196
2
            return static_cast<Int32>(simd::VStringFunctions::get_char_len(left.data, left.size));
197
2
        }
198
199
13
        simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
200
13
        simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
201
13
        return levenshtein_distance_utf8(left, left_offsets, right, right_offsets);
202
15
    }
203
204
    static Int32 levenshtein_distance_with_right_offsets(const StringRef& left,
205
                                                         std::vector<size_t>& left_offsets,
206
                                                         const StringRef& right,
207
                                                         const std::vector<size_t>& right_offsets,
208
24
                                                         bool right_ascii) {
209
24
        const bool left_ascii = simd::VStringFunctions::is_ascii(left);
210
24
        if (left_ascii && right_ascii) {
211
12
            return levenshtein_distance_ascii(left, right);
212
12
        }
213
214
12
        if (left.size == 0) {
215
2
            return static_cast<Int32>(right_offsets.size());
216
2
        }
217
10
        if (right.size == 0) {
218
5
            return left_ascii ? static_cast<Int32>(left.size)
219
5
                              : static_cast<Int32>(
220
5
                                        simd::VStringFunctions::get_char_len(left.data, left.size));
221
5
        }
222
223
5
        simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
224
5
        return levenshtein_distance_utf8(left, left_offsets, right, right_offsets);
225
10
    }
226
227
    static Int32 levenshtein_distance_with_left_offsets(const StringRef& left,
228
                                                        const std::vector<size_t>& left_offsets,
229
                                                        bool left_ascii, const StringRef& right,
230
24
                                                        std::vector<size_t>& right_offsets) {
231
24
        const bool right_ascii = simd::VStringFunctions::is_ascii(right);
232
24
        if (left_ascii && right_ascii) {
233
12
            return levenshtein_distance_ascii(left, right);
234
12
        }
235
236
12
        if (left.size == 0) {
237
5
            return static_cast<Int32>(
238
5
                    right_ascii ? right.size
239
5
                                : simd::VStringFunctions::get_char_len(right.data, right.size));
240
5
        }
241
7
        if (right.size == 0) {
242
2
            return static_cast<Int32>(left_offsets.size());
243
2
        }
244
245
5
        simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
246
5
        return levenshtein_distance_utf8(left, left_offsets, right, right_offsets);
247
7
    }
248
249
    static Int32 levenshtein_distance(const StringRef& left, const StringRef& right) {
250
        std::vector<size_t> left_offsets;
251
        std::vector<size_t> right_offsets;
252
        return levenshtein_distance(left, right, left_offsets, right_offsets);
253
    }
254
};
255
256
using FunctionLevenshtein =
257
        FunctionBinaryToType<DataTypeString, DataTypeString, LevenshteinImpl, NameLevenshtein>;
258
259
8
void register_function_levenshtein(SimpleFunctionFactory& factory) {
260
8
    factory.register_function<FunctionLevenshtein>();
261
8
}
262
263
} // namespace doris