be/src/exprs/function/function_tokenize.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "exprs/function/function_tokenize.h" |
19 | | |
20 | | #include <glog/logging.h> |
21 | | #include <rapidjson/prettywriter.h> |
22 | | |
23 | | #include <algorithm> |
24 | | #include <boost/regex.hpp> |
25 | | #include <memory> |
26 | | #include <utility> |
27 | | |
28 | | #include "CLucene/StdHeader.h" |
29 | | #include "CLucene/config/repl_wchar.h" |
30 | | #include "core/block/block.h" |
31 | | #include "core/block/column_with_type_and_name.h" |
32 | | #include "core/column/column.h" |
33 | | #include "core/data_type/data_type_nullable.h" |
34 | | #include "core/data_type/data_type_number.h" |
35 | | #include "core/string_ref.h" |
36 | | #include "storage/index/inverted/analyzer/analyzer.h" |
37 | | #include "storage/index/inverted/inverted_index_parser.h" |
38 | | #include "storage/index/inverted/inverted_index_reader.h" |
39 | | |
40 | | namespace doris { |
41 | | #include "common/compile_check_begin.h" |
42 | | using namespace doris::segment_v2::inverted_index; |
43 | | |
44 | 150 | Status parse(const std::string& str, std::map<std::string, std::string>& result) { |
45 | 150 | boost::regex pattern( |
46 | 150 | R"delimiter((?:'([^']*)'|"([^"]*)"|([^, ]*))\s*=\s*(?:'([^']*)'|"([^"]*)"|([^, ]*)))delimiter"); |
47 | 150 | boost::smatch matches; |
48 | | |
49 | 150 | std::string::const_iterator searchStart(str.cbegin()); |
50 | 290 | while (boost::regex_search(searchStart, str.cend(), matches, pattern)) { |
51 | 140 | std::string key = matches[1].length() |
52 | 140 | ? matches[1].str() |
53 | 140 | : (matches[2].length() ? matches[2].str() : matches[3].str()); |
54 | 140 | std::string value = matches[4].length() |
55 | 140 | ? matches[4].str() |
56 | 140 | : (matches[5].length() ? matches[5].str() : matches[6].str()); |
57 | | |
58 | 140 | result[key] = value; |
59 | | |
60 | 140 | searchStart = matches.suffix().first; |
61 | 140 | } |
62 | | |
63 | 150 | return Status::OK(); |
64 | 150 | } |
65 | | |
66 | | void FunctionTokenize::_do_tokenize_none(const ColumnString& src_column_string, |
67 | 40 | const MutableColumnPtr& dest_column_ptr) const { |
68 | 40 | ColumnArray::Offset64 src_offsets_size = src_column_string.get_offsets().size(); |
69 | 201 | for (size_t i = 0; i < src_offsets_size; i++) { |
70 | 161 | const StringRef tokenize_str = src_column_string.get_data_at(i); |
71 | | |
72 | 161 | rapidjson::Document doc; |
73 | 161 | doc.SetArray(); |
74 | 161 | rapidjson::Document::AllocatorType& allocator = doc.GetAllocator(); |
75 | | |
76 | 161 | rapidjson::Value obj(rapidjson::kObjectType); |
77 | 161 | obj.AddMember( |
78 | 161 | "token", |
79 | 161 | rapidjson::Value(tokenize_str.data, |
80 | 161 | static_cast<rapidjson::SizeType>(tokenize_str.size), allocator) |
81 | 161 | .Move(), |
82 | 161 | allocator); |
83 | 161 | doc.PushBack(obj, allocator); |
84 | | |
85 | 161 | rapidjson::StringBuffer buffer; |
86 | 161 | rapidjson::PrettyWriter<rapidjson::StringBuffer> writer(buffer); |
87 | 161 | writer.SetFormatOptions(rapidjson::kFormatSingleLineArray); |
88 | 161 | doc.Accept(writer); |
89 | 161 | const std::string json_array_str = buffer.GetString(); |
90 | | |
91 | 161 | dest_column_ptr->insert_data(json_array_str.data(), json_array_str.size()); |
92 | 161 | } |
93 | 40 | } |
94 | | |
95 | | void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string, |
96 | | const InvertedIndexAnalyzerCtx& analyzer_ctx, |
97 | | bool support_phrase, |
98 | 108 | const MutableColumnPtr& dest_column_ptr) const { |
99 | 108 | ColumnArray::Offset64 src_offsets_size = src_column_string.get_offsets().size(); |
100 | 227 | for (size_t i = 0; i < src_offsets_size; i++) { |
101 | 119 | const StringRef tokenize_str = src_column_string.get_data_at(i); |
102 | 119 | if (tokenize_str.size == 0) { |
103 | 1 | dest_column_ptr->insert_data("", 0); |
104 | 1 | continue; |
105 | 1 | } |
106 | | |
107 | 118 | auto reader = InvertedIndexAnalyzer::create_reader(analyzer_ctx.char_filter_map); |
108 | 118 | reader->init(tokenize_str.data, (int)tokenize_str.size, true); |
109 | 118 | auto analyzer_tokens = |
110 | 118 | InvertedIndexAnalyzer::get_analyse_result(reader, analyzer_ctx.analyzer.get()); |
111 | | |
112 | 118 | rapidjson::Document doc; |
113 | 118 | doc.SetArray(); |
114 | 118 | rapidjson::Document::AllocatorType& allocator = doc.GetAllocator(); |
115 | 595 | for (const auto& analyzer_token : analyzer_tokens) { |
116 | 595 | rapidjson::Value obj(rapidjson::kObjectType); |
117 | 595 | obj.AddMember( |
118 | 595 | "token", |
119 | 595 | rapidjson::Value(analyzer_token.get_single_term().c_str(), allocator).Move(), |
120 | 595 | allocator); |
121 | 595 | if (support_phrase) { |
122 | 0 | obj.AddMember("position", analyzer_token.position, allocator); |
123 | 0 | } |
124 | 595 | doc.PushBack(obj, allocator); |
125 | 595 | } |
126 | 118 | rapidjson::StringBuffer buffer; |
127 | 118 | rapidjson::PrettyWriter<rapidjson::StringBuffer> writer(buffer); |
128 | 118 | writer.SetFormatOptions(rapidjson::kFormatSingleLineArray); |
129 | 118 | doc.Accept(writer); |
130 | 118 | const std::string json_array_str = buffer.GetString(); |
131 | | |
132 | 118 | dest_column_ptr->insert_data(json_array_str.data(), json_array_str.size()); |
133 | 118 | } |
134 | 108 | } |
135 | | |
// Entry point for tokenize(src_text, properties_string).
// Builds an analyzer from the properties in the second argument, tokenizes
// every row of the first argument, and stores the JSON result strings into
// `block` at position `result`.
//
// Returns:
//   - INDEX_INVALID_PARAMETERS if the properties name an unknown parser type.
//   - INVERTED_INDEX_ANALYZER_ERROR if analyzer construction throws.
//   - RuntimeError if either argument is not a string column.
//   - OK otherwise.
Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block,
                                      const ColumnNumbers& arguments, uint32_t result,
                                      size_t /*input_rows_count*/) const {
    DCHECK_EQ(arguments.size(), 2);
    // Strip any const-column wrapper so the raw string columns are reachable.
    const auto& [src_column, left_const] =
            unpack_if_const(block.get_by_position(arguments[0]).column);
    const auto& [right_column, right_const] =
            unpack_if_const(block.get_by_position(arguments[1]).column);

    // Output is a plain (non-nullable) string column of JSON arrays.
    auto dest_column_type = std::make_shared<DataTypeString>();
    auto dest_column_ptr = dest_column_type->create_column();

    if (const auto* col_left = check_and_get_column<ColumnString>(src_column.get())) {
        if (const auto* col_right = check_and_get_column<ColumnString>(right_column.get())) {
            // NOTE(review): only row 0 of the properties column is consulted —
            // this assumes the second argument is a constant expression;
            // confirm `right_const` is guaranteed upstream.
            std::map<std::string, std::string> properties;
            auto st = parse(col_right->get_data_at(0).to_string(), properties);
            if (!st.ok()) {
                return st;
            }
            InvertedIndexAnalyzerConfig config;
            config.analyzer_name = get_analyzer_name_from_properties(properties);
            config.parser_type = get_inverted_index_parser_type_from_string(
                    get_parser_string_from_properties(properties));
            if (config.parser_type == InvertedIndexParserType::PARSER_UNKNOWN) {
                return Status::Error<doris::ErrorCode::INDEX_INVALID_PARAMETERS>(
                        "unsupported parser type. currently, only 'english', 'chinese', "
                        "'unicode', 'icu', 'basic' and 'ik' analyzers are supported.");
            }

            // Special handling for PARSER_NONE: return original string as single token
            if (config.analyzer_name.empty() &&
                config.parser_type == InvertedIndexParserType::PARSER_NONE) {
                _do_tokenize_none(*col_left, dest_column_ptr);
                block.replace_by_position(result, std::move(dest_column_ptr));
                return Status::OK();
            }

            // Remaining analyzer knobs only matter when real tokenization runs.
            config.parser_mode = get_parser_mode_string_from_properties(properties);
            config.char_filter_map = get_parser_char_filter_map_from_properties(properties);
            config.lower_case = get_parser_lowercase_from_properties(properties);
            config.stop_words = get_parser_stopwords_from_properties(properties);
            bool support_phrase = get_parser_phrase_support_string_from_properties(properties) ==
                                  INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES;

            // CLucene reports failures via exceptions; translate both its own
            // error type and the project Exception into a Status.
            std::shared_ptr<lucene::analysis::Analyzer> analyzer_holder;
            try {
                analyzer_holder =
                        doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_analyzer(
                                &config);
            } catch (CLuceneError& e) {
                return Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
                        "inverted index create analyzer failed: {}", e.what());
            } catch (Exception& e) {
                return Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
                        "inverted index create analyzer failed: {}", e.what());
            }

            // Bundle the analyzer with its configuration for the per-row loop.
            InvertedIndexAnalyzerCtx analyzer_ctx;
            analyzer_ctx.analyzer_name = config.analyzer_name;
            analyzer_ctx.parser_type = config.parser_type;
            analyzer_ctx.char_filter_map = config.char_filter_map;
            analyzer_ctx.analyzer = analyzer_holder;
            _do_tokenize(*col_left, analyzer_ctx, support_phrase, dest_column_ptr);

            block.replace_by_position(result, std::move(dest_column_ptr));
            return Status::OK();
        }
    }
    // Reached only when an argument was not a ColumnString.
    return Status::RuntimeError("unimplemented function {}", get_name());
}
206 | | |
// Registers the tokenize() scalar function with the simple function factory.
void register_function_tokenize(SimpleFunctionFactory& factory) {
    factory.register_function<FunctionTokenize>();
}
210 | | |
211 | | #include "common/compile_check_end.h" |
212 | | } // namespace doris |