// AbstractUnassignedScanJob.java
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.nereids.trees.plans.distribute.worker.job;
import org.apache.doris.nereids.trees.plans.distribute.worker.DistributedPlanWorker;
import org.apache.doris.nereids.trees.plans.distribute.worker.DistributedPlanWorkerManager;
import org.apache.doris.planner.ExchangeNode;
import org.apache.doris.planner.OlapScanNode;
import org.apache.doris.planner.PlanFragment;
import org.apache.doris.planner.ScanNode;
import org.apache.doris.qe.ConnectContext;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Lists;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * AbstractUnassignedScanJob: base class for unassigned jobs whose plan fragment contains
 * scan nodes. Assignment runs in two phases:
 * <ol>
 *   <li>{@link #multipleMachinesParallelization}: decide which worker scans which data</li>
 *   <li>{@link #insideMachineParallelization}: decide how many instances on each worker
 *       process that worker's data</li>
 * </ol>
 * Subclasses may append extra instances afterwards via {@link #fillUpAssignedJobs}.
 */
public abstract class AbstractUnassignedScanJob extends AbstractUnassignedJob {
    // Generator for "share scan" ids: all local-shuffle instances created from the same
    // scan source receive the same id, so the backend knows they share one scan and will
    // not read the same data multiple times (see insideMachineParallelization).
    protected final AtomicInteger shareScanIdGenerator = new AtomicInteger();

    public AbstractUnassignedScanJob(PlanFragment fragment,
            List<ScanNode> scanNodes, ListMultimap<ExchangeNode, UnassignedJob> exchangeToChildJob) {
        super(fragment, scanNodes, exchangeToChildJob);
    }

    /**
     * Computes the concrete instances for this fragment: first spread the scan sources
     * across workers, then parallelize inside each worker, and finally let the subclass
     * hook {@link #fillUpAssignedJobs} adjust the result.
     */
    @Override
    public List<AssignedJob> computeAssignedJobs(DistributedPlanWorkerManager workerManager,
            ListMultimap<ExchangeNode, AssignedJob> inputJobs) {
        Map<DistributedPlanWorker, UninstancedScanSource> workerToScanSource = multipleMachinesParallelization(
                workerManager, inputJobs);
        List<AssignedJob> assignedJobs = insideMachineParallelization(workerToScanSource, inputJobs, workerManager);
        return fillUpAssignedJobs(assignedJobs, workerManager, inputJobs);
    }

    /**
     * Hook for subclasses to append or adjust instances after the normal assignment.
     * The default implementation returns the input list unchanged.
     */
    protected List<AssignedJob> fillUpAssignedJobs(
            List<AssignedJob> assignedJobs,
            DistributedPlanWorkerManager workerManager,
            ListMultimap<ExchangeNode, AssignedJob> inputJobs) {
        return assignedJobs;
    }

    /**
     * Decides which worker scans which part of the data. The returned map holds, per
     * worker, the scan source it should read — not yet split into instances.
     */
    protected abstract Map<DistributedPlanWorker, UninstancedScanSource> multipleMachinesParallelization(
            DistributedPlanWorkerManager workerManager, ListMultimap<ExchangeNode, AssignedJob> inputJobs);

    /**
     * Expands each worker's scan source into concrete instances.
     *
     * <p>Two modes:
     * <ul>
     *   <li>local shuffle ("share scan"): one scan source feeds several local instances
     *       that all carry the same share-scan id;</li>
     *   <li>normal: the scan source is partitioned, one partition per instance.</li>
     * </ul>
     * Instance indexes are assigned consecutively across all workers of the fragment.
     */
    protected List<AssignedJob> insideMachineParallelization(
            Map<DistributedPlanWorker, UninstancedScanSource> workerToScanRanges,
            ListMultimap<ExchangeNode, AssignedJob> inputJobs, DistributedPlanWorkerManager workerManager) {

        // NOTE(review): context is dereferenced below (context.nextInstanceId()) without a
        // null check, while other methods in this class do guard ConnectContext.get()
        // against null — confirm this path only runs with a connected session.
        ConnectContext context = ConnectContext.get();
        boolean useLocalShuffleToAddParallel = useLocalShuffleToAddParallel(workerToScanRanges);
        int instanceIndexInFragment = 0;
        List<AssignedJob> instances = Lists.newArrayList();
        for (Entry<DistributedPlanWorker, UninstancedScanSource> entry : workerToScanRanges.entrySet()) {
            DistributedPlanWorker worker = entry.getKey();

            // the scanRanges which this worker should scan,
            // for example:
            // {
            //   scan tbl1: [tablet_10001, tablet_10002, tablet_10003, tablet_10004] // no instances
            // }
            ScanSource scanSource = entry.getValue().scanSource;

            // usually, its tablets num, or buckets num
            int scanSourceMaxParallel = scanSource.maxParallel(scanNodes);

            // now we should compute how many instances to process the data,
            // for example: two instances
            int instanceNum = degreeOfParallelism(scanSourceMaxParallel);

            List<ScanSource> instanceToScanRanges;
            if (useLocalShuffleToAddParallel) {
                // only generate one instance to scan all data, in this step
                instanceToScanRanges = scanSource.parallelize(
                        scanNodes, 1
                );

                // Some tablets too big, we need add parallel to process these tablets after scan,
                // for example, use one OlapScanNode to scan data, and use some local instances
                // to process Aggregation parallel. We call it `share scan`. Backend will know this
                // instances share the same ScanSource, and will not scan same data multiple times.
                //
                // +-------------------------------- same fragment in one host -------------------------------------+
                // |          instance1      instance2      instance3      instance4                                |
                // |               \              \             /             /                                     |
                // |                                                                                                |
                // |                                  OlapScanNode                                                  |
                // |(share scan node, and local shuffle data to other local instances to parallel compute this data)|
                // +------------------------------------------------------------------------------------------------+
                ScanSource shareScanSource = instanceToScanRanges.get(0);

                // one scan range generate multiple instances,
                // different instances reference the same scan source
                int shareScanId = shareScanIdGenerator.getAndIncrement();
                for (int i = 0; i < instanceNum; i++) {
                    LocalShuffleAssignedJob instance = new LocalShuffleAssignedJob(
                            instanceIndexInFragment++, shareScanId, context.nextInstanceId(),
                            this, worker, shareScanSource);
                    instances.add(instance);
                }
            } else {
                // split the scanRanges to some partitions, one partition for one instance
                // for example:
                //  [
                //     scan tbl1: [tablet_10001, tablet_10003], // instance 1
                //     scan tbl1: [tablet_10002, tablet_10004]  // instance 2
                //  ]
                instanceToScanRanges = scanSource.parallelize(
                        scanNodes, instanceNum
                );

                for (ScanSource instanceToScanRange : instanceToScanRanges) {
                    instances.add(
                            assignWorkerAndDataSources(
                                    instanceIndexInFragment++, context.nextInstanceId(), worker, instanceToScanRange
                            )
                    );
                }
            }
        }
        return instances;
    }

    /**
     * Returns true when each worker should scan its data with one instance and fan the
     * rows out to several local instances ("share scan"). Forced on by the session
     * variable behind {@code isForceToLocalShuffle()}; otherwise enabled only when the
     * data's natural parallelism is too small (see {@link #parallelTooLittle}).
     */
    protected boolean useLocalShuffleToAddParallel(
            Map<DistributedPlanWorker, UninstancedScanSource> workerToScanRanges) {
        if (ConnectContext.get() != null && ConnectContext.get().getSessionVariable().isForceToLocalShuffle()) {
            return true;
        }
        return parallelTooLittle(workerToScanRanges);
    }

    /**
     * Whether the scan's natural parallelism is smaller than desired. Dispatches on the
     * concrete job type: bucketed olap jobs must additionally have too few buckets;
     * single-table jobs only consider the scan ranges; any other job returns false.
     */
    protected boolean parallelTooLittle(Map<DistributedPlanWorker, UninstancedScanSource> workerToScanRanges) {
        if (this instanceof UnassignedScanBucketOlapTableJob) {
            return scanRangesToLittle(workerToScanRanges) && bucketsTooLittle(workerToScanRanges);
        } else if (this instanceof UnassignedScanSingleOlapTableJob
                || this instanceof UnassignedScanSingleRemoteTableJob) {
            return scanRangesToLittle(workerToScanRanges);
        } else {
            return false;
        }
    }

    // NOTE(review): name presumably intends "scanRangesTooLittle"; kept as-is because
    // renaming a protected method would break subclasses/overriders.
    /**
     * Returns true only if every scan node reports that the storage data distribution can
     * be ignored for the given backend count, i.e. the scan ranges alone do not provide
     * enough parallelism.
     */
    protected boolean scanRangesToLittle(
            Map<DistributedPlanWorker, UninstancedScanSource> workerToScanRanges) {
        ConnectContext context = ConnectContext.get();
        int backendNum = workerToScanRanges.size();
        for (ScanNode scanNode : scanNodes) {
            if (!scanNode.ignoreStorageDataDistribution(context, backendNum)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Computes how many instances should process a scan source whose maximum useful
     * parallelism is {@code maxParallel} (usually the tablet or bucket count).
     *
     * @param maxParallel upper bound on useful parallelism; must be positive
     * @return 1 for unpartitioned fragments or single-instance olap scans, otherwise
     *         min(maxParallel, fragment's parallel exec num clamped to at least 1)
     */
    protected int degreeOfParallelism(int maxParallel) {
        Preconditions.checkArgument(maxParallel > 0, "maxParallel must be positive");
        // an unpartitioned fragment always runs as a single instance
        if (!fragment.getDataPartition().isPartitioned()) {
            return 1;
        }
        if (scanNodes.size() == 1 && scanNodes.get(0) instanceof OlapScanNode) {
            OlapScanNode olapScanNode = (OlapScanNode) scanNodes.get(0);
            // if the scan node have limit and no conjuncts, only need 1 instance to save cpu and mem resource,
            // e.g. select * from tbl limit 10
            ConnectContext connectContext = ConnectContext.get();
            if (connectContext != null && olapScanNode.shouldUseOneInstance(connectContext)) {
                return 1;
            }
        }

        // the scan instance num should not larger than the tablets num
        return Math.min(maxParallel, Math.max(fragment.getParallelExecNum(), 1));
    }

    /**
     * Returns true when every bucket scan source has fewer buckets than the fragment's
     * parallel exec num, i.e. buckets alone cannot saturate the desired parallelism.
     * Non-bucket scan sources are ignored by this check.
     */
    protected boolean bucketsTooLittle(Map<DistributedPlanWorker, UninstancedScanSource> workerToScanRanges) {
        int parallelExecNum = fragment.getParallelExecNum();
        for (UninstancedScanSource uninstancedScanSource : workerToScanRanges.values()) {
            ScanSource scanSource = uninstancedScanSource.scanSource;
            if (scanSource instanceof BucketScanSource) {
                BucketScanSource bucketScanSource = (BucketScanSource) scanSource;
                int bucketNum = bucketScanSource.bucketIndexToScanNodeToTablets.size();
                if (bucketNum >= parallelExecNum) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Builds a single instance (index 0) with an empty scan source on a random available
     * worker.
     */
    // NOTE(review): ConnectContext.get() is dereferenced here without a null check,
    // unlike the guarded usages above — confirm callers guarantee a session exists.
    protected List<AssignedJob> fillUpSingleEmptyInstance(DistributedPlanWorkerManager workerManager) {
        return ImmutableList.of(
                assignWorkerAndDataSources(0,
                        ConnectContext.get().nextInstanceId(),
                        workerManager.randomAvailableWorker(),
                        DefaultScanSource.empty())
        );
    }
}