be/src/exprs/function/function_soundex.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
#include <cctype>
#include <string_view>

#include "common/status.h"
#include "core/column/column_string.h"
#include "core/data_type/data_type_string.h"
#include "exprs/function/function.h"
#include "exprs/function/simple_function_factory.h"
25 | | |
26 | | namespace doris { |
27 | | #include "common/compile_check_begin.h" |
28 | | |
29 | | class FunctionSoundex : public IFunction { |
30 | | public: |
31 | | static constexpr auto name = "soundex"; |
32 | | |
33 | 11 | static FunctionPtr create() { return std::make_shared<FunctionSoundex>(); } |
34 | | |
35 | 1 | String get_name() const override { return name; } |
36 | | |
37 | 2 | size_t get_number_of_arguments() const override { return 1; } |
38 | | |
39 | 2 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
40 | 2 | return std::make_shared<DataTypeString>(); |
41 | 2 | } |
42 | | |
43 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
44 | 2 | uint32_t result, size_t input_rows_count) const override { |
45 | 2 | const ColumnPtr col_ptr = block.get_by_position(arguments[0]).column; |
46 | | |
47 | 2 | auto res_column = ColumnString::create(); |
48 | 2 | res_column->reserve(input_rows_count); |
49 | 2 | auto& res_data = res_column->get_chars(); |
50 | 2 | auto& res_offsets = res_column->get_offsets(); |
51 | 2 | res_data.reserve(input_rows_count * CODE_SIZE); |
52 | 2 | res_offsets.resize(input_rows_count); |
53 | 43 | for (size_t i = 0; i < input_rows_count; ++i) { |
54 | 42 | StringRef ref = col_ptr->get_data_at(i); |
55 | 42 | RETURN_IF_ERROR(calculate_soundex_and_insert(ref, res_data, res_offsets, i)); |
56 | 42 | } |
57 | | |
58 | 1 | block.replace_by_position(result, std::move(res_column)); |
59 | 1 | return Status::OK(); |
60 | 2 | } |
61 | | |
62 | | private: |
63 | | Status calculate_soundex_and_insert(const StringRef& ref, ColumnString::Chars& chars, |
64 | 42 | ColumnString::Offsets& offsets, const size_t row) const { |
65 | 42 | uint32_t row_start = (row == 0) ? 0 : offsets[row - 1]; |
66 | 42 | uint32_t expect_end = row_start + CODE_SIZE; |
67 | | |
68 | 42 | if (ref.size == 0) { |
69 | 1 | offsets[row] = row_start; |
70 | 1 | return Status::OK(); |
71 | 1 | } |
72 | | |
73 | 41 | char pre_code = '\0'; |
74 | 246 | for (size_t i = 0; i < ref.size; ++i) { |
75 | 227 | auto c = static_cast<unsigned char>(ref.data[i]); |
76 | | |
77 | 227 | if (c > 0x7f) { |
78 | 1 | return Status::InvalidArgument("soundex only supports ASCII, but got: {}", |
79 | 1 | ref.data[i]); |
80 | 1 | } |
81 | 226 | if (!std::isalpha(c)) { |
82 | 21 | continue; |
83 | 21 | } |
84 | | |
85 | 205 | c = static_cast<char>(std::toupper(c)); |
86 | 205 | if (chars.size() == row_start) { |
87 | 37 | chars.push_back(c); |
88 | 37 | pre_code = (SOUNDEX_TABLE[c - 'A'] == 'N') ? '\0' : SOUNDEX_TABLE[c - 'A']; |
89 | 168 | } else if (char code = SOUNDEX_TABLE[c - 'A']; code != 'N') { |
90 | 158 | if (code != 'V' && code != pre_code) { |
91 | 84 | chars.push_back(code); |
92 | 84 | if (chars.size() == expect_end) { |
93 | 21 | offsets[row] = static_cast<ColumnString::Offset>(chars.size()); |
94 | 21 | return Status::OK(); |
95 | 21 | } |
96 | 84 | } |
97 | | |
98 | 137 | pre_code = code; |
99 | 137 | } |
100 | 205 | } |
101 | | |
102 | 46 | while (chars.size() != row_start && chars.size() < expect_end) { |
103 | 27 | chars.push_back('0'); |
104 | 27 | } |
105 | 19 | offsets[row] = static_cast<ColumnString::Offset>(chars.size()); |
106 | | |
107 | 19 | return Status::OK(); |
108 | 41 | } |
109 | | |
110 | | /** 1. If a vowel (A, E, I, O, U) separates two consonants that have the same soundex code |
111 | | * the consonant to the right of the vowel is coded. Here we use 'V' to represent vowels. |
112 | | * eg : **Tymczak** is coded as T-522 (T, 5 for the M, 2 for the C, Z ignored , 2 for the K). |
113 | | * Since the vowel "A" separates the Z and K, the K is coded. |
114 | | * |
115 | | * 2. If "H" or "W" separate two consonants that have the same soundex code, the consonant to the right of the vowel is NOT coded. |
116 | | * Here we use 'N' to represent these two characters. |
117 | | * eg : **Ashcraft** is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1 for the F). It is not coded A-226. |
118 | | */ |
119 | | static constexpr char SOUNDEX_TABLE[26] = {'V', '1', '2', '3', 'V', '1', '2', 'N', 'V', |
120 | | '2', '2', '4', '5', '5', 'V', '1', '2', '6', |
121 | | '2', '3', 'V', '1', 'N', '2', 'V', '2'}; |
122 | | |
123 | | static constexpr uint8_t CODE_SIZE = 4; |
124 | | }; |
125 | | |
126 | 8 | void register_function_soundex(SimpleFunctionFactory& factory) { |
127 | 8 | factory.register_function<FunctionSoundex>(); |
128 | 8 | } |
129 | | |
130 | | #include "common/compile_check_end.h" |
131 | | } // namespace doris |