Coverage Report

Created: 2026-03-13 09:58

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_soundex.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <cctype>
19
20
#include "common/status.h"
21
#include "core/column/column_string.h"
22
#include "core/data_type/data_type_string.h"
23
#include "exprs/function/function.h"
24
#include "exprs/function/simple_function_factory.h"
25
26
namespace doris {
27
#include "common/compile_check_begin.h"
28
29
class FunctionSoundex : public IFunction {
30
public:
31
    static constexpr auto name = "soundex";
32
33
11
    static FunctionPtr create() { return std::make_shared<FunctionSoundex>(); }
34
35
1
    String get_name() const override { return name; }
36
37
2
    size_t get_number_of_arguments() const override { return 1; }
38
39
2
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
40
2
        return std::make_shared<DataTypeString>();
41
2
    }
42
43
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
44
2
                        uint32_t result, size_t input_rows_count) const override {
45
2
        const ColumnPtr col_ptr = block.get_by_position(arguments[0]).column;
46
47
2
        auto res_column = ColumnString::create();
48
2
        res_column->reserve(input_rows_count);
49
2
        auto& res_data = res_column->get_chars();
50
2
        auto& res_offsets = res_column->get_offsets();
51
2
        res_data.reserve(input_rows_count * CODE_SIZE);
52
2
        res_offsets.resize(input_rows_count);
53
43
        for (size_t i = 0; i < input_rows_count; ++i) {
54
42
            StringRef ref = col_ptr->get_data_at(i);
55
42
            RETURN_IF_ERROR(calculate_soundex_and_insert(ref, res_data, res_offsets, i));
56
42
        }
57
58
1
        block.replace_by_position(result, std::move(res_column));
59
1
        return Status::OK();
60
2
    }
61
62
private:
63
    Status calculate_soundex_and_insert(const StringRef& ref, ColumnString::Chars& chars,
64
42
                                        ColumnString::Offsets& offsets, const size_t row) const {
65
42
        uint32_t row_start = (row == 0) ? 0 : offsets[row - 1];
66
42
        uint32_t expect_end = row_start + CODE_SIZE;
67
68
42
        if (ref.size == 0) {
69
1
            offsets[row] = row_start;
70
1
            return Status::OK();
71
1
        }
72
73
41
        char pre_code = '\0';
74
246
        for (size_t i = 0; i < ref.size; ++i) {
75
227
            auto c = static_cast<unsigned char>(ref.data[i]);
76
77
227
            if (c > 0x7f) {
78
1
                return Status::InvalidArgument("soundex only supports ASCII, but got: {}",
79
1
                                               ref.data[i]);
80
1
            }
81
226
            if (!std::isalpha(c)) {
82
21
                continue;
83
21
            }
84
85
205
            c = static_cast<char>(std::toupper(c));
86
205
            if (chars.size() == row_start) {
87
37
                chars.push_back(c);
88
37
                pre_code = (SOUNDEX_TABLE[c - 'A'] == 'N') ? '\0' : SOUNDEX_TABLE[c - 'A'];
89
168
            } else if (char code = SOUNDEX_TABLE[c - 'A']; code != 'N') {
90
158
                if (code != 'V' && code != pre_code) {
91
84
                    chars.push_back(code);
92
84
                    if (chars.size() == expect_end) {
93
21
                        offsets[row] = static_cast<ColumnString::Offset>(chars.size());
94
21
                        return Status::OK();
95
21
                    }
96
84
                }
97
98
137
                pre_code = code;
99
137
            }
100
205
        }
101
102
46
        while (chars.size() != row_start && chars.size() < expect_end) {
103
27
            chars.push_back('0');
104
27
        }
105
19
        offsets[row] = static_cast<ColumnString::Offset>(chars.size());
106
107
19
        return Status::OK();
108
41
    }
109
110
    /** 1. If a vowel (A, E, I, O, U) separates two consonants that have the same soundex code
111
     *  the consonant to the right of the vowel is coded. Here we use 'V' to represent vowels.
112
     *  eg : **Tymczak** is coded as T-522 (T, 5 for the M, 2 for the C, Z ignored , 2 for the K). 
113
     *  Since the vowel "A" separates the Z and K, the K is coded.
114
     *
115
     *  2. If "H" or "W" separate two consonants that have the same soundex code, the consonant to the right of the vowel is NOT coded.
116
     *  Here we use 'N' to represent these two characters.
117
     *  eg : **Ashcraft** is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1 for the F). It is not coded A-226.
118
     */
119
    static constexpr char SOUNDEX_TABLE[26] = {'V', '1', '2', '3', 'V', '1', '2', 'N', 'V',
120
                                               '2', '2', '4', '5', '5', 'V', '1', '2', '6',
121
                                               '2', '3', 'V', '1', 'N', '2', 'V', '2'};
122
123
    static constexpr uint8_t CODE_SIZE = 4;
124
};
125
126
8
/// Registers FunctionSoundex with the given function factory.
void register_function_soundex(SimpleFunctionFactory& factory) {
    factory.register_function<FunctionSoundex>();
}
129
130
#include "common/compile_check_end.h"
131
} // namespace doris