FilePartitionUtils.java
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.datasource;
import org.apache.doris.common.UserException;
import org.apache.doris.datasource.hive.HiveExternalMetaCache;

import com.google.common.collect.Lists;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
/**
 * Utility methods for parsing partition column values from Hive-style file paths.
 *
 * <p>Hive partitioned tables encode partition key-value pairs in the directory path
 * (e.g. {@code hdfs://nn/tbl/year=2024/month=01/file.parquet}). These helpers extract
 * the column values in the order given by {@code columnsFromPath}.
 */
public final class FilePartitionUtils {
    private static final String PARSE_FAILURE_PREFIX = "Fail to parse columnsFromPath, expected: ";

    private FilePartitionUtils() {}

    /**
     * Holder for parsed partition values together with a parallel list of null flags.
     *
     * <p>Entry {@code i} of {@link #getIsNull()} tells whether entry {@code i} of
     * {@link #getValues()} stands for a null partition value; in that case the value
     * itself is stored as an empty string.
     */
    public static final class ParsedColumnsFromPath {
        private final List<String> values;
        private final List<Boolean> isNull;

        private ParsedColumnsFromPath(List<String> values, List<Boolean> isNull) {
            this.values = values;
            this.isNull = isNull;
        }

        /** Returns the partition values; null partitions are represented by {@code ""}. */
        public List<String> getValues() {
            return values;
        }

        /** Returns the null flags, parallel to {@link #getValues()}. */
        public List<Boolean> getIsNull() {
            return isNull;
        }
    }

    /**
     * Parses partition column values from a Hive-style file path using case-sensitive matching
     * and the non-ACID path layout.
     *
     * @param filePath absolute file path that contains partition segments
     * @param columnsFromPath ordered list of partition column names to extract
     * @return ordered list of partition values (parallel to {@code columnsFromPath})
     * @throws UserException if the path does not contain the expected partition segments
     */
    public static List<String> parseColumnsFromPath(String filePath, List<String> columnsFromPath)
            throws UserException {
        return parseColumnsFromPathWithNullInfo(filePath, columnsFromPath, true, false).getValues();
    }

    /**
     * Parses partition column values from a Hive-style file path.
     *
     * @param filePath absolute file path that contains partition segments
     * @param columnsFromPath ordered list of partition column names to extract
     * @param caseSensitive whether column name matching is case-sensitive
     * @param isACID whether the path follows the ACID layout
     *     ({@code table/par=val/delta_xxx/file}), which adds one extra path level
     * @return ordered list of partition values (parallel to {@code columnsFromPath})
     * @throws UserException if the path does not contain the expected partition segments
     */
    public static List<String> parseColumnsFromPath(
            String filePath,
            List<String> columnsFromPath,
            boolean caseSensitive,
            boolean isACID)
            throws UserException {
        return parseColumnsFromPathWithNullInfo(filePath, columnsFromPath, caseSensitive, isACID)
                .getValues();
    }

    /**
     * Parses partition column values from a Hive-style file path and additionally reports,
     * per column, whether the value was the Hive default-partition marker
     * ({@code HiveExternalMetaCache.HIVE_DEFAULT_PARTITION}).
     *
     * <p>The path is scanned from the directory nearest the file towards the root. Segments
     * whose key is not in {@code columnsFromPath} are ignored. If the same requested key occurs
     * in several segments, the one closest to the file is used and the others are ignored.
     * In the non-ACID layout a single leading segment without {@code =} is tolerated; any
     * other non-empty segment without {@code =} encountered before all columns are resolved
     * is an error.
     *
     * @param filePath absolute file path that contains partition segments
     * @param columnsFromPath ordered list of partition column names to extract; when
     *     {@code null} or empty an empty result is returned without inspecting {@code filePath}
     * @param caseSensitive whether column name matching is case-sensitive; when {@code false}
     *     both sides are lower-cased with {@link Locale#ROOT} before comparison
     * @param isACID whether the path follows the ACID layout
     *     ({@code table/par=val/delta_xxx/file}), which adds one extra path level
     * @return values and null flags, both parallel to {@code columnsFromPath}; a default-partition
     *     value is returned as {@code ""} with its null flag set
     * @throws UserException if {@code filePath} is {@code null} or does not contain every
     *     expected partition segment
     */
    public static ParsedColumnsFromPath parseColumnsFromPathWithNullInfo(
            String filePath,
            List<String> columnsFromPath,
            boolean caseSensitive,
            boolean isACID)
            throws UserException {
        if (columnsFromPath == null || columnsFromPath.isEmpty()) {
            return new ParsedColumnsFromPath(Collections.emptyList(), Collections.emptyList());
        }

        List<String> expectedColumns = caseSensitive ? columnsFromPath : lowerCaseAll(columnsFromPath);
        if (filePath == null) {
            // Report a missing path through the documented exception instead of an NPE.
            throw parseFailure(expectedColumns, null);
        }

        String[] segments = filePath.split("/");
        if (segments.length < 2) {
            throw parseFailure(expectedColumns, filePath);
        }

        // ACID paths have one extra level: table/par=val/delta_xxx/file -> pathCount = 3
        int pathCount = isACID ? 3 : 2;
        int expectedCount = expectedColumns.size();
        String[] values = new String[expectedCount];
        // A non-null entry doubles as the "column already resolved" marker.
        Boolean[] valueIsNull = new Boolean[expectedCount];
        int resolved = 0;
        boolean skipOnce = true;

        for (int i = segments.length - pathCount; i >= 0 && resolved < expectedCount; i--) {
            String segment = segments[i];
            if (segment.isEmpty()) {
                continue;
            }
            int eq = segment.indexOf('=');
            if (eq < 0) {
                // Non-ACID layout: tolerate exactly one leading directory that is not key=value.
                if (!isACID && skipOnce) {
                    skipOnce = false;
                    continue;
                }
                throw parseFailure(expectedColumns, filePath);
            }
            skipOnce = false;

            String rawName = segment.substring(0, eq);
            String rawValue = segment.substring(eq + 1);
            String columnName = caseSensitive ? rawName : rawName.toLowerCase(Locale.ROOT);
            int index = expectedColumns.indexOf(columnName);
            if (index == -1 || valueIsNull[index] != null) {
                // Either not a requested column, or already resolved from a segment closer to
                // the file. Counting a repeated key again would end the scan early and leave
                // another column unset (null) in the result.
                continue;
            }

            boolean isNull = HiveExternalMetaCache.HIVE_DEFAULT_PARTITION.equals(rawValue);
            values[index] = isNull ? "" : rawValue;
            valueIsNull[index] = isNull;
            resolved++;
        }

        if (resolved != expectedCount) {
            throw parseFailure(expectedColumns, filePath);
        }
        return new ParsedColumnsFromPath(Lists.newArrayList(values), Lists.newArrayList(valueIsNull));
    }

    /**
     * Converts a list of already-extracted partition values into the value/null-flag form.
     *
     * <p>Each element that is {@code null} or equal to
     * {@code HiveExternalMetaCache.HIVE_DEFAULT_PARTITION} becomes {@code ""} with its null
     * flag set; every other element is kept unchanged with its null flag cleared.
     *
     * @param columnsFromPath partition values to normalize; may be {@code null} or empty
     * @return values and null flags, both parallel to {@code columnsFromPath}; empty when the
     *     input is {@code null} or empty
     */
    public static ParsedColumnsFromPath normalizeColumnsFromPath(List<String> columnsFromPath) {
        if (columnsFromPath == null || columnsFromPath.isEmpty()) {
            return new ParsedColumnsFromPath(Collections.emptyList(), Collections.emptyList());
        }
        List<String> values = new ArrayList<>(columnsFromPath.size());
        List<Boolean> isNull = new ArrayList<>(columnsFromPath.size());
        for (String value : columnsFromPath) {
            boolean nullValue = value == null || HiveExternalMetaCache.HIVE_DEFAULT_PARTITION.equals(value);
            values.add(nullValue ? "" : value);
            isNull.add(nullValue);
        }
        return new ParsedColumnsFromPath(values, isNull);
    }

    /** Returns a new list holding the locale-independent lower-case form of each name. */
    private static List<String> lowerCaseAll(List<String> names) {
        List<String> lowered = new ArrayList<>(names.size());
        for (String name : names) {
            lowered.add(name.toLowerCase(Locale.ROOT));
        }
        return lowered;
    }

    /** Builds the exception used for every parse failure so the message stays uniform. */
    private static UserException parseFailure(List<String> expectedColumns, String filePath) {
        return new UserException(PARSE_FAILURE_PREFIX + expectedColumns + ", filePath: " + filePath);
    }
}