S3URI.java

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.common.util;

import org.apache.doris.common.UserException;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import org.apache.commons.lang3.StringUtils;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * This class represents a fully qualified location in S3 for input/output
 * operations expressed as as URI.
 * <p>
 * For AWS S3, uri common styles should be:
 * 1. AWS Client Style(Hadoop S3 Style): s3://my-bucket/path/to/file?versionId=abc123&partNumber=77&partNumber=88
 * or
 * 2. Virtual Host Style: https://my-bucket.s3.us-west-1.amazonaws.com/resources/doc.txt?versionId=abc123&partNumber=77&partNumber=88
 * or
 * 3. Path Style: https://s3.us-west-1.amazonaws.com/my-bucket/resources/doc.txt?versionId=abc123&partNumber=77&partNumber=88
 *
 * Regarding the above-mentioned common styles, we can use <code>isPathStyle</code> to control whether to use path style
 * or virtual host style.
 * "Virtual host style" is the currently mainstream and recommended approach to use, so the default value of
 * <code>isPathStyle</code> is false.
 *
 * Other Styles:
 * 1. Virtual Host AWS Client (Hadoop S3) Mixed Style:
 * s3://my-bucket.s3.us-west-1.amazonaws.com/resources/doc.txt?versionId=abc123&partNumber=77&partNumber=88
 * or
 * 2. Path AWS Client (Hadoop S3) Mixed Style:
 * s3://s3.us-west-1.amazonaws.com/my-bucket/resources/doc.txt?versionId=abc123&partNumber=77&partNumber=88
 *
 * For these two styles, we can use <code>isPathStyle</code> and <code>forceParsingByStandardUri</code>
 * to control whether to use.
 * Virtual Host AWS Client (Hadoop S3) Mixed Style: <code>isPathStyle = false && forceParsingByStandardUri = true</code>
 * Path AWS Client (Hadoop S3) Mixed Style: <code>isPathStyle = true && forceParsingByStandardUri = true</code>
 *
 */

public class S3URI {

    /**
     * Splits a location into generic URI components (same shape as the regular expression in
     * RFC 3986, Appendix B). Capture groups read by {@link #parseURILocation(String)}:
     * 2 = scheme, 4 = authority, 5 = path, 7 = query, 9 = fragment.
     */
    private static final Pattern URI_PATTERN =
            Pattern.compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?");
    public static final String SCHEME_DELIM = "://";
    public static final String PATH_DELIM = "/";

    /** Every scheme accepted by {@link #validateUri()}; compared against the lower-cased scheme. */
    private static final Set<String> VALID_SCHEMES = ImmutableSet.of("http", "https", "s3", "s3a", "s3n",
            "bos", "oss", "cos", "cosn", "obs", "azure");

    /**
     * Object-storage schemes. Unless standard parsing is forced, a location with one of these
     * schemes is read as {@code scheme://bucket/key}.
     */
    private static final Set<String> OS_SCHEMES = ImmutableSet.of("s3", "s3a", "s3n",
            "bos", "oss", "cos", "cosn", "obs", "azure");

    /** The normalized URI built from the location string. */
    private URI uri;

    private String bucket;
    private String key;

    /** Endpoint taken from the authority; stays null for {@code scheme://bucket/key} locations. */
    private String endpoint;

    /** Region guessed from the endpoint labels; null when it cannot be derived. */
    private String region;

    /** True when the location was parsed as a standard (path or virtual-host style) URL. */
    private boolean isStandardURL;
    private boolean isPathStyle;

    /** Query parameters grouped by name; null when the location has no query component. */
    private Map<String, List<String>> queryParams;

    /**
     * Creates a new S3URI based on the bucket and key parsed from the location as defined in:
     * https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingBucket.html#access-bucket-intro
     * <p>
     * Supported access styles are Virtual Hosted addresses and s3://... URIs with additional
     * 's3n' and 's3a' schemes supported for backwards compatibility.
     *
     * @param location fully qualified URI
     * @return the parsed location
     * @throws UserException if the location is null/empty, cannot be parsed, uses an unsupported
     *         scheme, or lacks a bucket or key
     */
    public static S3URI create(String location) throws UserException {
        return create(location, false, false);
    }

    /**
     * Creates a new S3URI without forcing standard-URI parsing.
     *
     * @param location fully qualified URI
     * @param isPathStyle whether a standard URL carries the bucket as the first path segment
     *        (true) or as the first host label (false)
     * @return the parsed location
     * @throws UserException if the location is null/empty, cannot be parsed, uses an unsupported
     *         scheme, or lacks a bucket or key
     */
    public static S3URI create(String location, boolean isPathStyle) throws UserException {
        return create(location, isPathStyle, false);
    }

    /**
     * Creates a new S3URI.
     *
     * @param location fully qualified URI
     * @param isPathStyle whether a standard URL carries the bucket as the first path segment
     *        (true) or as the first host label (false)
     * @param forceParsingByStandardUri when true, object-storage schemes (s3, oss, ...) are parsed
     *        like http(s) URLs instead of as {@code scheme://bucket/key}
     * @return the parsed location
     * @throws UserException if the location is null/empty, cannot be parsed, uses an unsupported
     *         scheme, or lacks a bucket or key
     */
    public static S3URI create(String location, boolean isPathStyle, boolean forceParsingByStandardUri)
            throws UserException {
        return new S3URI(location, isPathStyle, forceParsingByStandardUri);
    }

    private S3URI(String location, boolean isPathStyle, boolean forceParsingByStandardUri) throws UserException {
        if (Strings.isNullOrEmpty(location)) {
            throw new UserException("s3 location can not be null");
        }
        this.isPathStyle = isPathStyle;
        parseUri(location, forceParsingByStandardUri);
    }

    /**
     * Parses the location, validates its scheme, extracts bucket/key/query parameters with the
     * parser matching the scheme, and finally derives endpoint and region.
     */
    private void parseUri(String location, boolean forceParsingStandardUri) throws UserException {
        parseURILocation(location);
        validateUri();

        // validateUri() has already rejected a null scheme.
        if (!forceParsingStandardUri && OS_SCHEMES.contains(uri.getScheme().toLowerCase())) {
            parseAwsCliStyleUri();
        } else {
            parseStandardUri();
        }
        parseEndpointAndRegion();
    }

    /**
     * Splits the location with {@link #URI_PATTERN} and rebuilds it as a normalized {@link URI}.
     *
     * @param location the raw location string
     * @throws UserException if the location does not match the pattern or the components do not
     *         form a valid URI
     */
    private void parseURILocation(String location) throws UserException {
        Matcher matcher = URI_PATTERN.matcher(location);
        if (!matcher.matches()) {
            throw new UserException("Failed to parse uri: " + location);
        }
        String scheme = matcher.group(2);
        String authority = matcher.group(4);
        String path = matcher.group(5);
        String query = matcher.group(7);
        String fragment = matcher.group(9);
        try {
            uri = new URI(scheme, authority, path, query, fragment).normalize();
        } catch (URISyntaxException e) {
            throw new UserException(e);
        }
    }

    /**
     * Rejects URIs whose scheme is missing or not listed in {@link #VALID_SCHEMES}.
     */
    private void validateUri() throws UserException {
        if (uri.getScheme() == null || !VALID_SCHEMES.contains(uri.getScheme().toLowerCase())) {
            throw new UserException("Invalid scheme: " + this.uri);
        }
    }

    /**
     * Parses {@code scheme://bucket/key[?query]}: the authority is the bucket and the path without
     * its leading slash is the key. Path style is switched off for such locations.
     */
    private void parseAwsCliStyleUri() throws UserException {
        bucket = uri.getAuthority();
        if (bucket == null) {
            throw new UserException("missing bucket: " + uri);
        }
        String path = uri.getPath();
        if (path.length() > 1) {
            key = path.substring(1);
        } else {
            throw new UserException("missing key: " + uri);
        }

        addQueryParamsIfNeeded();

        isStandardURL = false;
        this.isPathStyle = false;
    }

    /**
     * Parses a URL whose host is a service endpoint, either path style or virtual-host style
     * depending on {@code isPathStyle}.
     */
    private void parseStandardUri() throws UserException {
        if (uri.getHost() == null) {
            throw new UserException("Invalid S3 URI: no hostname: " + uri);
        }

        addQueryParamsIfNeeded();

        if (isPathStyle) {
            parsePathStyleUri();
        } else {
            parseVirtualHostedStyleUri();
        }
        isStandardURL = true;
    }

    /**
     * Groups the query parameters by name, keeping every value of a repeated name in order of
     * appearance. Does nothing when the URI has no query component.
     */
    private void addQueryParamsIfNeeded() {
        String query = uri.getQuery();
        if (query == null) {
            return;
        }
        queryParams = splitQueryString(query).stream()
                .map(S3URI::splitQueryParam)
                .collect(Collectors.groupingBy((pair) -> pair[0],
                        Collectors.mapping((pair) -> pair[1], Collectors.toList())));
    }

    /**
     * Splits one {@code name=value} parameter at the FIRST '=' only.
     * <p>
     * {@code String.split("=")} must not be used here: it drops trailing empty strings, so a
     * parameter such as "=" produced an empty array (and an ArrayIndexOutOfBoundsException), and
     * values containing '=' (for example base64 padding in "versionId=abc==") were truncated.
     *
     * @return a two-element array {name, value}; value is null when there is no '=' or when the
     *         text after it is empty
     */
    private static String[] splitQueryParam(String param) {
        int delimiter = param.indexOf('=');
        if (delimiter < 0) {
            return new String[] {param, null};
        }
        String value = param.substring(delimiter + 1);
        // An empty value keeps mapping to null, exactly like a parameter without '='.
        return new String[] {param.substring(0, delimiter), value.isEmpty() ? null : value};
    }

    /**
     * Splits a query string at every '&'. Empty segments are preserved, so the result always
     * contains at least one element (a single empty string for an empty query).
     */
    private static List<String> splitQueryString(String queryString) {
        List<String> results = new ArrayList<>();
        StringBuilder current = new StringBuilder();

        for (int i = 0; i < queryString.length(); ++i) {
            char character = queryString.charAt(i);
            if (character == '&') {
                results.add(current.toString());
                current.setLength(0);
            } else {
                current.append(character);
            }
        }

        // Whatever follows the last '&' (possibly nothing) is the final parameter.
        results.add(current.toString());
        return results;
    }

    /**
     * Path style: {@code /bucket/key...}. The first path segment is the bucket and the remainder
     * is the key.
     */
    private void parsePathStyleUri() throws UserException {
        String path = uri.getPath();
        if (StringUtils.isEmpty(path) || "/".equals(path)) {
            throw new UserException("missing bucket: " + this.uri);
        }

        int index = path.indexOf('/', 1);
        // Either no slash after the bucket ("/bucket") or nothing after it ("/bucket/").
        if (index == -1 || index == path.length() - 1) {
            throw new UserException("missing key: " + uri);
        }
        bucket = path.substring(1, index);
        key = path.substring(index + 1);
    }

    /**
     * Virtual-host style: the first label of the host is the bucket and the path without its
     * leading slash is the key.
     */
    private void parseVirtualHostedStyleUri() throws UserException {
        bucket = uri.getHost().split("\\.")[0];

        String path = uri.getPath();
        if (!StringUtils.isEmpty(path) && !"/".equals(path)) {
            key = path.substring(1);
        } else {
            throw new UserException("missing key: " + this.uri);
        }
    }

    /**
     * Derives the endpoint from the authority and then guesses the region from the endpoint's
     * dot-separated labels. Both stay null when they cannot be determined.
     */
    private void parseEndpointAndRegion() {
        // parse endpoint
        if (!isStandardURL) {
            // scheme://bucket/key locations carry no endpoint information.
            endpoint = null;
            return;
        }
        String authority = uri.getAuthority();
        if (isPathStyle) {
            endpoint = authority;
        } else { // virtual_host_style: drop the leading bucket label
            if (authority == null) {
                endpoint = null;
                return;
            }
            String[] splits = authority.split("\\.", 2);
            endpoint = splits.length < 2 ? null : splits[1];
        }
        if (endpoint == null) {
            return;
        }

        // parse region
        String[] endpointSplits = endpoint.split("\\.");
        if (endpointSplits.length < 2) {
            return;
        }
        if (endpointSplits[0].contains("oss-")) {
            // compatible with the endpoint: oss-cn-beijing.aliyuncs.com
            region = endpointSplits[0];
            return;
        }
        // e.g. s3.us-west-1.amazonaws.com -> us-west-1
        region = endpointSplits[1];
    }

    /**
     * @return S3 bucket
     */
    public String getBucket() {
        return bucket;
    }

    /**
     * @return S3 key
     */
    public String getKey() {
        return key;
    }

    /**
     * @return query parameters grouped by name, or empty if the location has no query component;
     *         a parameter without a value is recorded as a null list element
     */
    public Optional<Map<String, List<String>>> getQueryParams() {
        return Optional.ofNullable(queryParams);
    }

    /**
     * @return the endpoint, or empty if none could be derived from the location
     */
    public Optional<String> getEndpoint() {
        return Optional.ofNullable(endpoint);
    }

    /**
     * @return the region, or empty if none could be derived from the endpoint
     */
    public Optional<String> getRegion() {
        return Optional.ofNullable(region);
    }

    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder("S3URI{");
        sb.append("uri=").append(uri);
        sb.append(", bucket='").append(bucket).append('\'');
        sb.append(", key='").append(key).append('\'');
        sb.append(", endpoint='").append(endpoint).append('\'');
        sb.append(", region='").append(region).append('\'');
        sb.append(", isStandardURL=").append(isStandardURL);
        sb.append(", isPathStyle=").append(isPathStyle);
        sb.append(", queryParams=").append(queryParams);
        sb.append('}');
        return sb.toString();
    }
}