FilePartitionUtils.java

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.datasource;

import org.apache.doris.common.FeConstants;
import org.apache.doris.common.UserException;
import org.apache.doris.datasource.hive.HiveExternalMetaCache;

import com.google.common.collect.Lists;

import java.util.Collections;
import java.util.List;

/**
 * Utility methods for parsing partition column values from Hive-style file paths.
 *
 * <p>Hive partitioned tables encode partition key-value pairs in the directory path
 * (e.g. {@code hdfs://nn/tbl/year=2024/month=01/file.parquet}).  These helpers extract
 * the column values in the order given by {@code columnsFromPath}.
 */
public final class FilePartitionUtils {

    private FilePartitionUtils() {}

    /**
     * Parses partition column values from a Hive-style file path using case-sensitive matching.
     *
     * @param filePath        absolute file path that contains partition segments
     * @param columnsFromPath ordered list of partition column names to extract
     * @return ordered list of partition values (parallel to {@code columnsFromPath})
     * @throws UserException if the path does not contain the expected partition segments
     */
    public static List<String> parseColumnsFromPath(String filePath, List<String> columnsFromPath)
            throws UserException {
        return parseColumnsFromPath(filePath, columnsFromPath, true, false);
    }

    /**
     * Parses partition column values from a Hive-style file path.
     *
     * @param filePath        absolute file path that contains partition segments
     * @param columnsFromPath ordered list of partition column names to extract
     * @param caseSensitive   whether column name matching is case-sensitive
     * @param isACID          whether the path follows the ACID layout
     *                        ({@code table/par=val/delta_xxx/file}), which adds one extra path level
     * @return ordered list of partition values (parallel to {@code columnsFromPath})
     * @throws UserException if the path does not contain the expected partition segments
     */
    public static List<String> parseColumnsFromPath(
            String filePath,
            List<String> columnsFromPath,
            boolean caseSensitive,
            boolean isACID)
            throws UserException {
        if (columnsFromPath == null || columnsFromPath.isEmpty()) {
            return Collections.emptyList();
        }
        // ACID paths have one extra level: table/par=val/delta_xxx/file → pathCount = 3
        int pathCount = isACID ? 3 : 2;
        if (!caseSensitive) {
            for (int i = 0; i < columnsFromPath.size(); i++) {
                String path = columnsFromPath.remove(i);
                columnsFromPath.add(i, path.toLowerCase());
            }
        }
        String[] strings = filePath.split("/");
        if (strings.length < 2) {
            throw new UserException("Fail to parse columnsFromPath, expected: "
                    + columnsFromPath + ", filePath: " + filePath);
        }
        String[] columns = new String[columnsFromPath.size()];
        int size = 0;
        boolean skipOnce = true;
        for (int i = strings.length - pathCount; i >= 0; i--) {
            String str = strings[i];
            if (str != null && str.isEmpty()) {
                continue;
            }
            if (str == null || !str.contains("=")) {
                if (!isACID && skipOnce) {
                    skipOnce = false;
                    continue;
                }
                throw new UserException("Fail to parse columnsFromPath, expected: "
                        + columnsFromPath + ", filePath: " + filePath);
            }
            skipOnce = false;
            String[] pair = str.split("=", 2);
            if (pair.length != 2) {
                throw new UserException("Fail to parse columnsFromPath, expected: "
                        + columnsFromPath + ", filePath: " + filePath);
            }
            String parsedColumnName = caseSensitive ? pair[0] : pair[0].toLowerCase();
            int index = columnsFromPath.indexOf(parsedColumnName);
            if (index == -1) {
                continue;
            }
            columns[index] = HiveExternalMetaCache.HIVE_DEFAULT_PARTITION.equals(pair[1])
                ? FeConstants.null_string : pair[1];
            size++;
            if (size >= columnsFromPath.size()) {
                break;
            }
        }
        if (size != columnsFromPath.size()) {
            throw new UserException("Fail to parse columnsFromPath, expected: "
                    + columnsFromPath + ", filePath: " + filePath);
        }
        return Lists.newArrayList(columns);
    }
}