SourceOffsetProvider.java
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.job.offset;
import org.apache.doris.job.exception.JobException;
import org.apache.doris.job.extensions.insert.streaming.StreamingInsertJob;
import org.apache.doris.job.extensions.insert.streaming.StreamingJobProperties;
import org.apache.doris.nereids.trees.plans.commands.insert.InsertIntoTableCommand;
import java.util.List;
import java.util.Map;
/**
* Interface for managing offsets and metadata of a data source.
*/
public interface SourceOffsetProvider {
/**
* Get source type, e.g. s3, kafka
*
* @return
*/
String getSourceType();
/**
* Initialize the offset provider with job ID and original TVF properties.
* Only sets in-memory fields; safe to call on both fresh start and FE restart.
* May perform remote calls (e.g. fetching snapshot splits), so throws JobException.
*/
default void ensureInitialized(Long jobId, Map<String, String> originTvfProps) throws JobException {}
/**
* Performs one-time initialization that must run only on fresh job creation, not on FE restart.
* For example, fetching and persisting snapshot splits to the meta table.
* Default: no-op (most providers need no extra setup).
*/
default void initOnCreate() throws JobException {}
/**
* Get next offset to consume
*
* @return
*/
Offset getNextOffset(StreamingJobProperties jobProps, Map<String, String> properties);
/**
* Get current offset to show
*
* @return
*/
String getShowCurrentOffset();
/**
* Get remote datasource max offset to show
*
* @return
*/
String getShowMaxOffset();
/**
* Rewrite the TVF parameters in the SQL based on the current offset.
* Only implemented by TVF-based providers (e.g. S3, cdc_stream).
*
* @param nextOffset
* @return rewritten InsertIntoTableCommand
*/
InsertIntoTableCommand rewriteTvfParams(InsertIntoTableCommand originCommand, Offset nextOffset, long taskId);
/**
* Update the offset of the source.
*
* @param offset
*/
void updateOffset(Offset offset);
/**
* Fetch remote meta information, such as listing files in S3 or getting latest offsets in Kafka.
*/
void fetchRemoteMeta(Map<String, String> properties) throws Exception;
/**
* Whether there is more data to consume
*
* @return
*/
boolean hasMoreDataToConsume();
/**
* Deserialize string offset to Offset
*
* @return
*/
Offset deserializeOffset(String offset);
/**
* Deserialize offset property to Offset
*
* @return
*/
Offset deserializeOffsetProperty(String offset);
/**
* Replaying OffsetProvider is currently only required by JDBC.
*
* @return
*/
default void replayIfNeed(StreamingInsertJob job) throws JobException {
}
default String getPersistInfo() {
return null;
}
/**
* Returns the serialized JSON offset to store in txn commit attachment.
* Default: serialize running offset directly (e.g. S3 path).
* CDC stream TVF overrides to pull actual end offset from BE after fetchRecordStream completes.
* scanBackendIds: IDs of the BEs that ran the TVF scan node, used to locate taskOffsetCache.
*/
default String getCommitOffsetJson(Offset runningOffset, long taskId, List<Long> scanBackendIds) {
return runningOffset.toSerializedJson();
}
/**
* Called after each task is committed. Providers that track data availability
* (e.g. JDBC binlog) can use this to update internal state such as hasMoreData.
* Default: no-op.
*/
default void onTaskCommitted(long scannedRows, long loadBytes) {}
/**
* Applies the end offset from a committed task back onto the running offset object
* in-place, so that showRange() can display the full [start, end] interval.
* Default: no-op (only meaningful for JDBC providers).
*/
default void applyEndOffsetToTask(Offset runningOffset, Offset endOffset) {}
/**
* Returns true if the provider has reached a natural completion point
* and the job should be marked as FINISHED.
* Default: false (most providers run indefinitely).
*/
default boolean hasReachedEnd() {
return false;
}
}