S3SourceOffsetProvider.java

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.job.offset.s3;

import org.apache.doris.datasource.property.storage.StorageProperties;
import org.apache.doris.fs.FileSystemFactory;
import org.apache.doris.fs.GlobListResult;
import org.apache.doris.fs.remote.RemoteFile;
import org.apache.doris.fs.remote.RemoteFileSystem;
import org.apache.doris.job.extensions.insert.streaming.StreamingJobProperties;
import org.apache.doris.job.offset.Offset;
import org.apache.doris.job.offset.SourceOffsetProvider;
import org.apache.doris.nereids.analyzer.UnboundTVFRelation;
import org.apache.doris.nereids.trees.expressions.Properties;
import org.apache.doris.nereids.trees.plans.Plan;
import org.apache.doris.nereids.trees.plans.commands.insert.InsertIntoTableCommand;
import org.apache.doris.nereids.trees.plans.logical.LogicalPlan;
import org.apache.doris.persist.gson.GsonUtils;

import com.google.common.collect.Maps;
import com.google.gson.Gson;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;

/**
 * {@link SourceOffsetProvider} implementation whose source type is {@code "s3"}.
 *
 * <p>The provider lists remote files through {@link RemoteFileSystem#globListWithLimit}, starting
 * after the {@code endFile} of the last committed {@link S3Offset}, and turns every listing into a
 * new {@link S3Offset} whose {@code fileLists} is a URI that is later injected into the TVF of the
 * insert command.
 *
 * <p>NOTE(review): the class keeps mutable state ({@link #currentOffset}, {@link #maxEndFile})
 * without any synchronization; presumably callers serialize access - confirm against the scheduler.
 */
@Log4j2
public class S3SourceOffsetProvider implements SourceOffsetProvider {
    /** Last offset handed to {@link #updateOffset(Offset)}; {@code null} until that happens. */
    S3Offset currentOffset;
    /** Max file reported by the most recent remote listing; {@code null} before any listing. */
    String maxEndFile;

    @Override
    public String getSourceType() {
        return "s3";
    }

    /**
     * Lists the next batch of remote files and builds the offset describing it.
     *
     * <p>The listing starts after the {@code endFile} of {@link #currentOffset} (or from the
     * beginning when there is no current offset) and is bounded by the job's S3 batch bytes and
     * batch files settings. As a side effect {@link #maxEndFile} is refreshed from the listing.
     *
     * @param jobProps streaming job properties providing the batch limits
     * @param properties TVF properties; copied into a case-insensitive map before use
     * @return the offset for the next batch; never {@code null}
     * @throws RuntimeException if listing fails (original exception is the cause) or if the
     *         listing returns no files
     */
    @Override
    public S3Offset getNextOffset(StreamingJobProperties jobProps, Map<String, String> properties) {
        Map<String, String> copiedProps = copyCaseInsensitive(properties);
        List<RemoteFile> rfiles = new ArrayList<>();
        String startFile = currentOffset == null ? null : currentOffset.endFile;
        String filePath = null;
        GlobListResult globListResult;
        StorageProperties storageProperties = StorageProperties.createPrimary(copiedProps);
        try (RemoteFileSystem fileSystem = FileSystemFactory.get(storageProperties)) {
            String uri = storageProperties.validateAndGetUri(copiedProps);
            filePath = storageProperties.validateAndNormalizeUri(uri);
            globListResult = fileSystem.globListWithLimit(filePath, rfiles, startFile,
                    jobProps.getS3BatchBytes(), jobProps.getS3BatchFiles());
        } catch (Exception e) {
            log.warn("list path exception, path={}", filePath, e);
            throw new RuntimeException(e);
        }

        // Checked outside the try block so that this condition is neither logged as a listing
        // failure nor wrapped into a second RuntimeException.
        if (globListResult == null || rfiles.isEmpty()) {
            throw new RuntimeException("No new files found in path: " + filePath);
        }

        S3Offset offset = buildOffset(rfiles, globListResult);
        maxEndFile = globListResult.getMaxFile();
        return offset;
    }

    /**
     * Returns the serialized JSON of the current offset, or {@code null} when no offset has been
     * recorded yet.
     */
    @Override
    public String getShowCurrentOffset() {
        if (currentOffset != null) {
            return currentOffset.toSerializedJson();
        }
        return null;
    }

    /**
     * Returns a JSON object with a single {@code endFile} entry holding {@link #maxEndFile}.
     */
    @Override
    public String getShowMaxOffset() {
        Map<String, String> res = new HashMap<>();
        res.put("endFile", maxEndFile);
        return new Gson().toJson(res);
    }

    /**
     * Rewrites every {@link UnboundTVFRelation} of the parsed plan so that its {@code uri}
     * property points at the file list of {@code runningOffset}.
     *
     * @param originCommand the original insert command; its parsed plan must be present
     * @param runningOffset the offset to run; must be an {@link S3Offset}
     * @return a new insert command built on the rewritten plan
     */
    @Override
    public InsertIntoTableCommand rewriteTvfParams(InsertIntoTableCommand originCommand, Offset runningOffset) {
        S3Offset offset = (S3Offset) runningOffset;
        // rewrite plan
        Plan rewritePlan = originCommand.getParsedPlan().get().rewriteUp(plan -> {
            if (plan instanceof UnboundTVFRelation) {
                UnboundTVFRelation originTvfRel = (UnboundTVFRelation) plan;
                // A fresh map per relation: properties of one TVF must not leak into another.
                Map<String, String> props = new HashMap<>(originTvfRel.getProperties().getMap());
                props.put("uri", offset.getFileLists());
                return new UnboundTVFRelation(
                        originTvfRel.getRelationId(), originTvfRel.getFunctionName(), new Properties(props));
            }
            return plan;
        });
        return new InsertIntoTableCommand((LogicalPlan) rewritePlan, Optional.empty(), Optional.empty(),
                Optional.empty(), true, Optional.empty());
    }

    /**
     * Records {@code offset} as the current offset and clears its file list, so that only
     * {@code endFile} is kept on the stored offset. Note that the passed object itself is mutated.
     *
     * @param offset the new offset; expected to be an {@link S3Offset}, {@code null} resets the state
     */
    @Override
    public void updateOffset(Offset offset) {
        this.currentOffset = (S3Offset) offset;
        if (this.currentOffset != null) {
            this.currentOffset.setFileLists(null);
        }
    }

    /**
     * Refreshes {@link #maxEndFile} by listing the remote path with limits of {@code 1}/{@code 1},
     * starting after the current offset's {@code endFile}.
     *
     * <p>If the listing yields nothing usable, {@link #maxEndFile} falls back to the start file,
     * i.e. the end file of the current offset (or {@code null} without a current offset).
     *
     * @param properties TVF properties; copied into a case-insensitive map before use
     * @throws Exception any failure raised while creating the file system or listing
     */
    @Override
    public void fetchRemoteMeta(Map<String, String> properties) throws Exception {
        Map<String, String> copiedProps = copyCaseInsensitive(properties);
        StorageProperties storageProperties = StorageProperties.createPrimary(copiedProps);
        String startFile = currentOffset == null ? null : currentOffset.endFile;
        try (RemoteFileSystem fileSystem = FileSystemFactory.get(storageProperties)) {
            String uri = storageProperties.validateAndGetUri(copiedProps);
            String filePath = storageProperties.validateAndNormalizeUri(uri);
            List<RemoteFile> objects = new ArrayList<>();
            GlobListResult globListResult = fileSystem.globListWithLimit(filePath, objects, startFile, 1, 1);
            if (globListResult != null && !objects.isEmpty() && StringUtils.isNotEmpty(globListResult.getMaxFile())) {
                maxEndFile = globListResult.getMaxFile();
            } else {
                maxEndFile = startFile;
            }
        }
    }

    /**
     * Tells whether there are remote files beyond the current offset.
     *
     * @return {@code true} when nothing has been consumed yet (no current offset or no end file),
     *         or when the current end file sorts lexicographically before {@link #maxEndFile};
     *         {@code false} otherwise, including when no remote listing has populated
     *         {@link #maxEndFile} yet
     */
    @Override
    public boolean hasMoreDataToConsume() {
        if (currentOffset == null || currentOffset.endFile == null) {
            return true;
        }
        if (maxEndFile == null) {
            // Remote state is unknown until a listing has run; do not claim there is more data.
            return false;
        }
        return currentOffset.endFile.compareTo(maxEndFile) < 0;
    }

    /**
     * Deserializes an {@link S3Offset} from its JSON representation.
     */
    @Override
    public Offset deserializeOffset(String offset) {
        return GsonUtils.GSON.fromJson(offset, S3Offset.class);
    }

    /** Copies {@code properties} into a new map whose keys are compared case-insensitively. */
    private static Map<String, String> copyCaseInsensitive(Map<String, String> properties) {
        Map<String, String> copied = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER);
        copied.putAll(properties);
        return copied;
    }

    /**
     * Builds the offset for a non-empty listing.
     *
     * <p>The file list is {@code s3://<bucket>/<dir>/{f1,f2,...}} where {@code <dir>/} is the
     * listing prefix up to and including its last {@code '/'}; when no file name remains after
     * removing that base, the base itself is used as the file list. The end file is the name of
     * the last listed file with the {@code s3://<bucket>/} part removed.
     *
     * <p>NOTE(review): assumes {@link RemoteFile#getName()} returns full {@code s3://bucket/key}
     * URIs for this listing - confirm against {@code globListWithLimit}.
     */
    private static S3Offset buildOffset(List<RemoteFile> rfiles, GlobListResult globListResult) {
        String bucket = globListResult.getBucket();
        String prefix = globListResult.getPrefix();

        String bucketBase = "s3://" + bucket + "/";
        // Get the path of the last directory
        int lastSlash = prefix.lastIndexOf('/');
        String basePrefix = (lastSlash >= 0) ? prefix.substring(0, lastSlash + 1) : "";
        String filePathBase = bucketBase + basePrefix;
        String joined = rfiles.stream()
                .map(RemoteFile::getName)
                // Compare the file name (not the RemoteFile object) with the base path.
                .filter(name -> !name.equals(filePathBase))
                .map(name -> name.replace(filePathBase, ""))
                .collect(Collectors.joining(","));

        S3Offset offset = new S3Offset();
        if (joined.isEmpty()) {
            // base is a single file
            offset.setFileLists(filePathBase);
        } else {
            // base is dir; filePathBase already ends with exactly one '/', so an empty directory
            // part does not produce a double slash after the bucket.
            offset.setFileLists(filePathBase + "{" + joined + "}");
        }
        String lastFile = rfiles.get(rfiles.size() - 1).getName().replace(bucketBase, "");
        offset.setEndFile(lastFile);
        return offset;
    }
}