// PinyinTokenizerValidator.java
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.indexpolicy;
import org.apache.doris.common.DdlException;

import com.google.common.collect.ImmutableSet;

import java.util.Locale;
import java.util.Map;
import java.util.Set;
/**
 * Validates the properties supplied for a "pinyin tokenizer" index policy.
 *
 * <p>{@link #validateSpecific(Map)} checks, in order:
 * <ol>
 *   <li>every boolean-valued property holds {@code true} or {@code false} (case-insensitive);</li>
 *   <li>{@code limit_first_letter_length}, when present, is a non-negative integer;</li>
 *   <li>the combination of options is coherent (see {@link #validateConfigurationLogic(Map)}).</li>
 * </ol>
 *
 * <p>The set of recognised property names is handed to the {@link BasePolicyValidator}
 * constructor; presumably the base class uses it to reject unknown keys (not visible here).
 */
public class PinyinTokenizerValidator extends BasePolicyValidator {
    private static final String LIMIT_FIRST_LETTER_LENGTH = "limit_first_letter_length";

    private static final String LIMIT_FIRST_LETTER_LENGTH_ERROR =
            "limit_first_letter_length must be a non-negative integer (default: 16)";

    private static final Set<String> ALLOWED_PROPS = ImmutableSet.of(
            "type",
            "lowercase",
            "trim_whitespace",
            "keep_none_chinese",
            "keep_none_chinese_in_first_letter",
            "keep_none_chinese_in_joined_full_pinyin",
            "keep_original",
            "keep_first_letter",
            "keep_separate_first_letter",
            "keep_none_chinese_together",
            "none_chinese_pinyin_tokenize",
            "limit_first_letter_length",
            "keep_full_pinyin",
            "keep_joined_full_pinyin",
            "remove_duplicated_term",
            "fixed_pinyin_offset",
            "ignore_pinyin_offset",
            "keep_separate_chinese"
    );

    /**
     * Boolean-valued properties. The order is significant: it decides which
     * property is reported first when several of them hold invalid values.
     */
    private static final String[] BOOLEAN_PROPS = {
        "lowercase",
        "trim_whitespace",
        "keep_none_chinese",
        "keep_none_chinese_in_first_letter",
        "keep_none_chinese_in_joined_full_pinyin",
        "keep_original",
        "keep_first_letter",
        "keep_separate_first_letter",
        "keep_none_chinese_together",
        "none_chinese_pinyin_tokenize",
        "keep_full_pinyin",
        "keep_joined_full_pinyin",
        "remove_duplicated_term",
        "fixed_pinyin_offset",
        "ignore_pinyin_offset",
        "keep_separate_chinese"
    };

    public PinyinTokenizerValidator() {
        super(ALLOWED_PROPS);
    }

    @Override
    protected String getTypeName() {
        return "pinyin tokenizer";
    }

    /**
     * Runs the pinyin-specific checks: per-property type checks first, then
     * the cross-property consistency rules.
     *
     * @param props the user-supplied policy properties
     * @throws DdlException if any property value or combination is invalid
     */
    @Override
    protected void validateSpecific(Map<String, String> props) throws DdlException {
        for (String booleanProp : BOOLEAN_PROPS) {
            validateBooleanParameter(props, booleanProp);
        }
        validateLimitFirstLetterLength(props);
        validateConfigurationLogic(props);
    }

    /**
     * Checks that {@code limit_first_letter_length}, when present, parses as
     * an {@code int} that is zero or greater.
     *
     * @throws DdlException if the value is missing, not an integer, or negative
     */
    private void validateLimitFirstLetterLength(Map<String, String> props) throws DdlException {
        if (!props.containsKey(LIMIT_FIRST_LETTER_LENGTH)) {
            return;
        }
        int limitLength;
        try {
            // Integer.parseInt(null) also raises NumberFormatException, so a
            // null value is reported through the same message.
            limitLength = Integer.parseInt(props.get(LIMIT_FIRST_LETTER_LENGTH));
        } catch (NumberFormatException ignored) {
            // The user-facing message is self-contained; the parse failure
            // itself carries no extra information worth surfacing.
            throw new DdlException(LIMIT_FIRST_LETTER_LENGTH_ERROR);
        }
        if (limitLength < 0) {
            throw new DdlException(LIMIT_FIRST_LETTER_LENGTH_ERROR);
        }
    }

    /**
     * Checks that {@code paramName}, when present, is {@code true} or
     * {@code false} (case-insensitive). A present key with a {@code null}
     * value is rejected like any other invalid value.
     *
     * @throws DdlException if the value is not a boolean literal
     */
    private void validateBooleanParameter(Map<String, String> props, String paramName) throws DdlException {
        if (!props.containsKey(paramName)) {
            return;
        }
        String normalized = normalize(props.get(paramName));
        if (!"true".equals(normalized) && !"false".equals(normalized)) {
            throw new DdlException(paramName + " must be 'true' or 'false'");
        }
    }

    /**
     * Enforces the relationships between options, using each option's default
     * when the user did not set it.
     *
     * @throws DdlException if no output format is enabled or a dependent
     *         option is used without the option it depends on
     */
    private void validateConfigurationLogic(Map<String, String> props) throws DdlException {
        boolean keepOriginal = getBooleanValue(props, "keep_original", false);
        boolean keepFirstLetter = getBooleanValue(props, "keep_first_letter", true);
        boolean keepFullPinyin = getBooleanValue(props, "keep_full_pinyin", true);
        boolean keepJoinedFullPinyin = getBooleanValue(props, "keep_joined_full_pinyin", false);
        boolean keepSeparateFirstLetter = getBooleanValue(props, "keep_separate_first_letter", false);
        boolean keepSeparateChinese = getBooleanValue(props, "keep_separate_chinese", false);

        // With every output format switched off the tokenizer could emit nothing.
        boolean anyOutputEnabled = keepOriginal || keepFirstLetter || keepFullPinyin
                || keepJoinedFullPinyin || keepSeparateFirstLetter || keepSeparateChinese;
        if (!anyOutputEnabled) {
            throw new DdlException("At least one output format must be enabled: "
                    + "keep_original, keep_first_letter, keep_full_pinyin, keep_joined_full_pinyin, "
                    + "keep_separate_first_letter, or keep_separate_chinese");
        }

        if (keepSeparateFirstLetter && !keepFirstLetter) {
            throw new DdlException("keep_separate_first_letter requires keep_first_letter to be enabled");
        }

        // This option defaults to true, so disabling keep_first_letter also
        // requires explicitly setting keep_none_chinese_in_first_letter=false.
        // NOTE(review): confirm that rejecting the implicit default is intended.
        boolean keepNoneChineseInFirstLetter = getBooleanValue(props, "keep_none_chinese_in_first_letter", true);
        if (keepNoneChineseInFirstLetter && !keepFirstLetter) {
            throw new DdlException("keep_none_chinese_in_first_letter requires keep_first_letter to be enabled");
        }

        boolean keepNoneChineseInJoinedFullPinyin = getBooleanValue(props,
                "keep_none_chinese_in_joined_full_pinyin", false);
        if (keepNoneChineseInJoinedFullPinyin && !keepJoinedFullPinyin) {
            throw new DdlException("keep_none_chinese_in_joined_full_pinyin "
                    + "requires keep_joined_full_pinyin to be enabled");
        }

        // Only the presence of the key matters here, not its numeric value.
        if (props.containsKey(LIMIT_FIRST_LETTER_LENGTH) && !keepFirstLetter) {
            throw new DdlException("limit_first_letter_length is only valid when keep_first_letter is enabled");
        }
    }

    /**
     * Reads a boolean property, falling back to {@code defaultValue} when the
     * key is absent. Any present value other than a case-insensitive
     * {@code true} (including {@code null}) reads as {@code false}.
     */
    private boolean getBooleanValue(Map<String, String> props, String key, boolean defaultValue) {
        if (!props.containsKey(key)) {
            return defaultValue;
        }
        return "true".equals(normalize(props.get(key)));
    }

    /**
     * Lower-cases a raw property value independently of the JVM default
     * locale; {@code null} is passed through unchanged.
     */
    private static String normalize(String raw) {
        return raw == null ? null : raw.toLowerCase(Locale.ROOT);
    }
}