// AnalyzerSelector.java
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.analysis;
import org.apache.doris.catalog.Index;

import com.google.common.base.Strings;

import java.util.Collections;
import java.util.Locale;
import java.util.Map;
/**
* Helper for selecting the analyzer that should be attached to a MATCH predicate.
*
* <p>Design principles:
* <ul>
* <li>FE transmits user intent clearly to BE</li>
* <li>Empty string = user did not specify analyzer (BE chooses)</li>
* <li>Non-empty string = user explicitly specified analyzer (BE matches it exactly)</li>
* </ul>
*
* <p>This simple protocol removes the ambiguity between "no preference" and an explicit request for the "none" analyzer.
*/
public final class AnalyzerSelector {

    private AnalyzerSelector() {
        // Static utility holder; never instantiated.
    }

    /**
     * Selects the analyzer configuration for a MATCH predicate from an index and the user's request.
     *
     * @param index the inverted index; {@code null} selects the "no index" path
     * @param requestedAnalyzer analyzer named in the USING ANALYZER clause (may be {@code null})
     * @return selection carrying the analyzer and parser information
     */
    public static Selection select(Index index, String requestedAnalyzer) {
        Map<String, String> properties = index == null ? Collections.emptyMap() : index.getProperties();
        return select(properties, requestedAnalyzer, index == null);
    }

    /**
     * Selects the analyzer configuration from a raw index properties map.
     * Used when the {@link Index} object itself is not available.
     *
     * @param properties index properties; {@code null} is treated as "no index"
     * @param requestedAnalyzer analyzer named in the USING ANALYZER clause (may be {@code null})
     * @return selection carrying the analyzer and parser information
     */
    public static Selection select(Map<String, String> properties, String requestedAnalyzer) {
        return select(properties, requestedAnalyzer, properties == null);
    }

    /**
     * Shared implementation behind both public {@code select} overloads.
     *
     * @param properties index properties (a {@code null} map is replaced by an empty one)
     * @param requestedAnalyzer raw analyzer name supplied by the user (may be {@code null})
     * @param noIndex {@code true} when the caller had no index / no properties at all
     */
    private static Selection select(Map<String, String> properties, String requestedAnalyzer, boolean noIndex) {
        Map<String, String> props = properties == null ? Collections.emptyMap() : properties;

        // 1. Normalize the user request; the result is never null, empty means "not specified".
        String userAnalyzer = normalize(requestedAnalyzer);

        // 2. Read the index configuration.
        // NOTE(review): the exact defaults returned by these helpers are defined in
        // InvertedIndexUtil and are not visible from this class.
        String indexParser = InvertedIndexUtil.getInvertedIndexParser(props);
        String indexCustomAnalyzer = InvertedIndexUtil.getPreferredAnalyzer(props);

        // 3. Pick the parser used for slow-path tokenization.
        String parser = determineParser(noIndex, indexParser, indexCustomAnalyzer);

        // 4. Build the selection:
        //    - userAnalyzer: what the user explicitly specified (empty if not specified)
        //    - indexAnalyzer: what the index is configured with (slow-path fallback)
        String indexAnalyzer = determineIndexAnalyzer(indexCustomAnalyzer, indexParser);
        return new Selection(userAnalyzer, indexAnalyzer, parser, !userAnalyzer.isEmpty());
    }

    /**
     * Determines the parser for slow-path tokenization.
     *
     * <ul>
     *   <li>No index: english (more aggressive tokenization)</li>
     *   <li>Index with an explicit parser other than NONE: that parser</li>
     *   <li>Index with a custom analyzer but no explicit parser: english
     *       (a custom analyzer implies fulltext tokenization)</li>
     *   <li>Index with neither: NONE (keyword matching)</li>
     * </ul>
     */
    private static String determineParser(boolean noIndex, String indexParser, String indexCustomAnalyzer) {
        if (noIndex) {
            return InvertedIndexUtil.INVERTED_INDEX_PARSER_ENGLISH;
        }
        if (isConfiguredParser(indexParser)) {
            return indexParser;
        }
        if (!Strings.isNullOrEmpty(indexCustomAnalyzer)) {
            return InvertedIndexUtil.INVERTED_INDEX_PARSER_ENGLISH;
        }
        return InvertedIndexUtil.INVERTED_INDEX_PARSER_NONE;
    }

    /**
     * Determines the analyzer configured on the index.
     * Priority: custom analyzer &gt; builtin parser (unless it is the NONE parser).
     *
     * @return the normalized analyzer name, or an empty string for a keyword index
     *         (parser NONE) or when nothing is configured
     */
    private static String determineIndexAnalyzer(String customAnalyzer, String parser) {
        if (!Strings.isNullOrEmpty(customAnalyzer)) {
            return canonicalize(customAnalyzer);
        }
        if (isConfiguredParser(parser)) {
            return canonicalize(parser);
        }
        return "";
    }

    /** Returns true when {@code parser} is non-empty and is not the NONE parser (case-insensitive). */
    private static boolean isConfiguredParser(String parser) {
        return !Strings.isNullOrEmpty(parser)
                && !InvertedIndexUtil.INVERTED_INDEX_PARSER_NONE.equalsIgnoreCase(parser);
    }

    /** Normalizes a user-supplied analyzer name; {@code null} becomes the empty string. */
    private static String normalize(String analyzer) {
        return analyzer == null ? "" : canonicalize(analyzer);
    }

    /**
     * Trims and lower-cases a non-null name.
     *
     * <p>Lower-casing uses {@link Locale#ROOT} so the result does not depend on the JVM default
     * locale. With a Turkish default locale, {@code "ENGLISH".toLowerCase()} yields a dotless
     * {@code ı}, and the name would no longer equal {@code "english"}.
     */
    private static String canonicalize(String name) {
        return name.trim().toLowerCase(Locale.ROOT);
    }

    /**
     * Result of analyzer selection.
     *
     * <p>Key semantics:
     * <ul>
     *   <li>{@link #effectiveAnalyzerName(boolean, String)}: what to send to BE for index selection</li>
     *   <li>{@link #parser()}: parser type for slow path tokenization</li>
     * </ul>
     */
    public static final class Selection {
        // User explicitly specified analyzer (empty if not specified)
        private final String userAnalyzer;
        // Analyzer configured on index (empty for keyword index)
        private final String indexAnalyzer;
        // Parser type for slow path
        private final String parser;
        // Whether user explicitly specified an analyzer
        private final boolean explicit;

        private Selection(String userAnalyzer, String indexAnalyzer, String parser, boolean explicit) {
            this.userAnalyzer = userAnalyzer;
            this.indexAnalyzer = indexAnalyzer;
            // A missing parser falls back to the NONE parser; otherwise store the canonical form.
            this.parser = Strings.isNullOrEmpty(parser)
                    ? InvertedIndexUtil.INVERTED_INDEX_PARSER_NONE
                    : canonicalize(parser);
            this.explicit = explicit;
        }

        /**
         * Returns the analyzer to use. For backwards compatibility this is the
         * user-specified analyzer when present, otherwise the index-configured analyzer.
         */
        public String analyzer() {
            return !Strings.isNullOrEmpty(userAnalyzer) ? userAnalyzer : indexAnalyzer;
        }

        /**
         * Returns the parser type for slow path tokenization.
         */
        public String parser() {
            return parser;
        }

        /**
         * Returns whether the user explicitly specified an analyzer via the USING ANALYZER clause.
         */
        public boolean explicit() {
            return explicit;
        }

        /**
         * Computes the analyzer name to send to BE.
         *
         * <p>Protocol:
         * <ul>
         *   <li>User specified analyzer → send that analyzer name (BE will exact match)</li>
         *   <li>User did not specify → send empty string (BE will auto-select)</li>
         * </ul>
         *
         * <p>This simple protocol clearly distinguishes:
         * <ul>
         *   <li>"MATCH(col, 'query')" → empty string → BE chooses best index</li>
         *   <li>"MATCH(col, 'query') USING ANALYZER none" → "none" → BE uses keyword index</li>
         *   <li>"MATCH(col, 'query') USING ANALYZER chinese" → "chinese" → BE uses chinese index</li>
         * </ul>
         *
         * @param hasIndex unused; kept for API compatibility
         * @param fallbackParser unused; kept for API compatibility
         * @return the user-specified analyzer name, or an empty string if none was specified
         */
        public String effectiveAnalyzerName(boolean hasIndex, String fallbackParser) {
            // Both parameters are intentionally ignored: only the user's explicit choice is sent.
            return userAnalyzer;
        }
    }
}