AnalyzerIdentityBuilder.java
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.analysis.invertedindex;
import org.apache.doris.catalog.Env;
import org.apache.doris.indexpolicy.IndexPolicy;
import org.apache.doris.indexpolicy.IndexPolicyTypeEnum;
import com.google.common.base.Strings;
import org.apache.logging.log4j.Logger;
import java.util.Map;
import java.util.TreeMap;
public final class AnalyzerIdentityBuilder {
private AnalyzerIdentityBuilder() {
}
public static String buildAnalyzerIdentity(
Map<String, String> properties,
String preferredAnalyzer,
String parser,
String defaultAnalyzerKey,
String parserNone,
Logger log) {
if (properties == null || properties.isEmpty()) {
return defaultAnalyzerKey;
}
if (!Strings.isNullOrEmpty(preferredAnalyzer)) {
// For custom analyzer/normalizer, resolve to underlying config to build identity
return resolveAnalyzerIdentity(preferredAnalyzer, defaultAnalyzerKey, log);
}
if (Strings.isNullOrEmpty(parser) || parserNone.equalsIgnoreCase(parser)) {
return defaultAnalyzerKey;
}
return parser;
}
/**
* Resolve analyzer/normalizer name to its underlying configuration identity.
* Two analyzers with same underlying config (tokenizer + token_filter + char_filter)
* will have the same identity, even if they have different names.
*/
private static String resolveAnalyzerIdentity(String analyzerName, String defaultAnalyzerKey, Logger log) {
if (Strings.isNullOrEmpty(analyzerName)) {
return defaultAnalyzerKey;
}
// Check if it's a built-in analyzer
if (IndexPolicy.BUILTIN_ANALYZERS.contains(analyzerName)) {
return analyzerName;
}
// Check if it's a built-in normalizer
if (IndexPolicy.BUILTIN_NORMALIZERS.contains(analyzerName)) {
return "normalizer:" + analyzerName;
}
// For custom analyzer/normalizer, get underlying config from IndexPolicyMgr
try {
Env env = Env.getCurrentEnv();
if (env == null || env.getIndexPolicyMgr() == null) {
// Env not initialized - this can happen during early startup or tests
if (log != null) {
log.debug("Env or IndexPolicyMgr not available, using name '{}' as identity", analyzerName);
}
return analyzerName;
}
IndexPolicy policy = env.getIndexPolicyMgr().getPolicyByName(analyzerName);
if (policy == null) {
// Policy not found - this is expected for custom analyzers not yet registered
if (log != null) {
log.debug("Analyzer/normalizer policy not found for '{}', using name as identity", analyzerName);
}
return analyzerName;
}
Map<String, String> policyProps = policy.getProperties();
if (policyProps == null || policyProps.isEmpty()) {
if (log != null) {
log.debug("Policy '{}' has no properties, using name as identity", analyzerName);
}
return analyzerName;
}
// Build identity from underlying config using sorted keys for consistent ordering
return buildIdentityFromPolicyProperties(policy.getType(), policyProps);
} catch (RuntimeException e) {
// Catch RuntimeException specifically rather than generic Exception
if (log != null) {
log.warn("Failed to resolve analyzer identity for '{}', using name as identity. "
+ "This may cause incorrect duplicate detection. Error: {}",
analyzerName, e.getMessage());
}
return analyzerName;
}
}
/**
* Build identity string from policy properties.
* Uses TreeMap to ensure consistent key ordering.
*/
private static String buildIdentityFromPolicyProperties(IndexPolicyTypeEnum type,
Map<String, String> properties) {
// Use TreeMap to sort keys for consistent identity
TreeMap<String, String> sortedProps = new TreeMap<>(properties);
StringBuilder sb = new StringBuilder();
sb.append(type.name()).append(":");
for (Map.Entry<String, String> entry : sortedProps.entrySet()) {
String key = entry.getKey();
String value = entry.getValue();
// For tokenizer, token_filter, char_filter - resolve recursively if needed
if (IndexPolicy.PROP_TOKENIZER.equals(key)) {
sb.append("tokenizer=").append(resolveComponentIdentity(value, IndexPolicyTypeEnum.TOKENIZER));
} else if (IndexPolicy.PROP_TOKEN_FILTER.equals(key)) {
sb.append("token_filter=").append(resolveTokenFilterIdentity(value));
} else if (IndexPolicy.PROP_CHAR_FILTER.equals(key)) {
sb.append("char_filter=").append(resolveCharFilterIdentity(value));
}
sb.append(";");
}
return sb.toString();
}
/**
* Resolve a component (tokenizer) to its identity.
*/
private static String resolveComponentIdentity(String name, IndexPolicyTypeEnum expectedType) {
if (Strings.isNullOrEmpty(name)) {
return "";
}
// Check if it's a built-in component
if (expectedType == IndexPolicyTypeEnum.TOKENIZER
&& IndexPolicy.BUILTIN_TOKENIZERS.contains(name)) {
return name;
}
// For custom component, get its properties
try {
Env env = Env.getCurrentEnv();
if (env == null || env.getIndexPolicyMgr() == null) {
return name;
}
IndexPolicy policy = env.getIndexPolicyMgr().getPolicyByName(name);
if (policy == null || policy.getType() != expectedType) {
return name;
}
Map<String, String> props = policy.getProperties();
if (props == null || props.isEmpty()) {
return name;
}
// Build identity from sorted properties
TreeMap<String, String> sortedProps = new TreeMap<>(props);
return sortedProps.toString();
} catch (RuntimeException e) {
return name;
}
}
/**
* Resolve token filter list to identity string.
* IMPORTANT: Order is preserved because filter order is semantically significant.
*/
private static String resolveTokenFilterIdentity(String filterList) {
if (Strings.isNullOrEmpty(filterList)) {
return "";
}
StringBuilder sb = new StringBuilder();
String[] filters = filterList.split(",\\s*");
// DO NOT sort - filter order is semantically significant
for (int i = 0; i < filters.length; i++) {
String filter = filters[i].trim();
if (i > 0) {
sb.append(",");
}
if (IndexPolicy.BUILTIN_TOKEN_FILTERS.contains(filter)) {
sb.append(filter);
} else {
sb.append(resolveComponentIdentity(filter, IndexPolicyTypeEnum.TOKEN_FILTER));
}
}
return sb.toString();
}
/**
* Resolve char filter list to identity string.
* IMPORTANT: Order is preserved because filter order is semantically significant.
*/
private static String resolveCharFilterIdentity(String filterList) {
if (Strings.isNullOrEmpty(filterList)) {
return "";
}
StringBuilder sb = new StringBuilder();
String[] filters = filterList.split(",\\s*");
// DO NOT sort - filter order is semantically significant
for (int i = 0; i < filters.length; i++) {
String filter = filters[i].trim();
if (i > 0) {
sb.append(",");
}
if (IndexPolicy.BUILTIN_CHAR_FILTERS.contains(filter)) {
sb.append(filter);
} else {
sb.append(resolveComponentIdentity(filter, IndexPolicyTypeEnum.CHAR_FILTER));
}
}
return sb.toString();
}
}