Coverage Report

Created: 2026-04-10 04:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_soundex.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <cctype>
19
20
#include "common/status.h"
21
#include "core/column/column_string.h"
22
#include "core/data_type/data_type_string.h"
23
#include "exprs/function/function.h"
24
#include "exprs/function/simple_function_factory.h"
25
26
namespace doris {
27
28
class FunctionSoundex : public IFunction {
29
public:
30
    static constexpr auto name = "soundex";
31
32
61
    static FunctionPtr create() { return std::make_shared<FunctionSoundex>(); }
33
34
1
    String get_name() const override { return name; }
35
36
52
    size_t get_number_of_arguments() const override { return 1; }
37
38
52
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
39
52
        return std::make_shared<DataTypeString>();
40
52
    }
41
42
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
43
50
                        uint32_t result, size_t input_rows_count) const override {
44
50
        const ColumnPtr col_ptr = block.get_by_position(arguments[0]).column;
45
46
50
        auto res_column = ColumnString::create();
47
50
        res_column->reserve(input_rows_count);
48
50
        auto& res_data = res_column->get_chars();
49
50
        auto& res_offsets = res_column->get_offsets();
50
50
        res_data.reserve(input_rows_count * CODE_SIZE);
51
50
        res_offsets.resize(input_rows_count);
52
139
        for (size_t i = 0; i < input_rows_count; ++i) {
53
90
            StringRef ref = col_ptr->get_data_at(i);
54
90
            RETURN_IF_ERROR(calculate_soundex_and_insert(ref, res_data, res_offsets, i));
55
90
        }
56
57
49
        block.replace_by_position(result, std::move(res_column));
58
49
        return Status::OK();
59
50
    }
60
61
private:
62
    Status calculate_soundex_and_insert(const StringRef& ref, ColumnString::Chars& chars,
63
90
                                        ColumnString::Offsets& offsets, const size_t row) const {
64
90
        uint32_t row_start = (row == 0) ? 0 : offsets[row - 1];
65
90
        uint32_t expect_end = row_start + CODE_SIZE;
66
67
90
        if (ref.size == 0) {
68
4
            offsets[row] = row_start;
69
4
            return Status::OK();
70
4
        }
71
72
86
        char pre_code = '\0';
73
532
        for (size_t i = 0; i < ref.size; ++i) {
74
490
            auto c = static_cast<unsigned char>(ref.data[i]);
75
76
490
            if (c > 0x7f) {
77
1
                return Status::InvalidArgument("soundex only supports ASCII, but got: {}",
78
1
                                               ref.data[i]);
79
1
            }
80
489
            if (!std::isalpha(c)) {
81
60
                continue;
82
60
            }
83
84
429
            c = static_cast<char>(std::toupper(c));
85
429
            if (chars.size() == row_start) {
86
78
                chars.push_back(c);
87
78
                pre_code = (SOUNDEX_TABLE[c - 'A'] == 'N') ? '\0' : SOUNDEX_TABLE[c - 'A'];
88
351
            } else if (char code = SOUNDEX_TABLE[c - 'A']; code != 'N') {
89
330
                if (code != 'V' && code != pre_code) {
90
177
                    chars.push_back(code);
91
177
                    if (chars.size() == expect_end) {
92
43
                        offsets[row] = static_cast<ColumnString::Offset>(chars.size());
93
43
                        return Status::OK();
94
43
                    }
95
177
                }
96
97
287
                pre_code = code;
98
287
            }
99
429
        }
100
101
99
        while (chars.size() != row_start && chars.size() < expect_end) {
102
57
            chars.push_back('0');
103
57
        }
104
42
        offsets[row] = static_cast<ColumnString::Offset>(chars.size());
105
106
42
        return Status::OK();
107
86
    }
108
109
    /** 1. If a vowel (A, E, I, O, U) separates two consonants that have the same soundex code
110
     *  the consonant to the right of the vowel is coded. Here we use 'V' to represent vowels
     *  (SOUNDEX_TABLE also marks 'Y' as 'V', so it acts as a separator in the same way).
111
     *  e.g.: **Tymczak** is coded as T-522 (T, 5 for the M, 2 for the C, Z ignored, 2 for the K).
112
     *  Since the vowel "A" separates the Z and K, the K is coded.
113
     *
114
     *  2. If "H" or "W" separates two consonants that have the same soundex code, the consonant to the right of the "H" or "W" is NOT coded.
115
     *  Here we use 'N' to represent these two characters.
116
     *  e.g.: **Ashcraft** is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1 for the F). It is not coded A-226.
117
     */
118
    static constexpr char SOUNDEX_TABLE[26] = {'V', '1', '2', '3', 'V', '1', '2', 'N', 'V',
119
                                               '2', '2', '4', '5', '5', 'V', '1', '2', '6',
120
                                               '2', '3', 'V', '1', 'N', '2', 'V', '2'};
121
122
    static constexpr uint8_t CODE_SIZE = 4;
123
};
124
125
8
void register_function_soundex(SimpleFunctionFactory& factory) {
126
8
    factory.register_function<FunctionSoundex>();
127
8
}
128
129
} // namespace doris