// ColocationGroupProcDir.java

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.common.proc;

import org.apache.doris.catalog.ColocateTableIndex;
import org.apache.doris.catalog.ColocateTableIndex.GroupId;
import org.apache.doris.catalog.Database;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.MaterializedIndex;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.Partition;
import org.apache.doris.catalog.Replica;
import org.apache.doris.catalog.Table;
import org.apache.doris.catalog.Tablet;
import org.apache.doris.cloud.system.CloudSystemInfoService;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.Config;
import org.apache.doris.resource.Tag;
import org.apache.doris.system.Backend;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;

/*
 * show proc "/colocation_group";
 */
public class ColocationGroupProcDir implements ProcDirInterface {
    /** Column titles of the result returned by {@link #fetchResult()}, in display order. */
    public static final ImmutableList<String> TITLE_NAMES = ImmutableList.of(
            "GroupId", "GroupName", "TableIds", "BucketsNum",
            "ReplicaAllocation", "DistCols", "IsStable", "ErrorMsg");

    /**
     * Always returns {@code false}; neither argument is used and nothing is stored.
     *
     * @param name name of the node offered for registration (unused)
     * @param node node offered for registration (unused)
     * @return always {@code false}
     */
    @Override
    public boolean register(String name, ProcNodeInterface node) {
        return false;
    }

    /**
     * Builds the proc node that shows the per-bucket backend sequences of one colocation group.
     *
     * <p>The group is identified by {@code <dbId>.<grpId>}, where both parts must be parseable
     * as {@code long}. The backend sequences are read from the colocate index; in cloud mode,
     * when the index has none for the group, they are derived from the current tablets instead.
     *
     * @param groupIdStr group identifier in the form {@code <dbId>.<grpId>}
     * @return a {@link ColocationGroupBackendSeqsProcNode} holding one column of bucket
     *         sequences per resource tag (or per compute group on the cloud fallback path)
     * @throws AnalysisException if {@code groupIdStr} is null, does not consist of exactly two
     *         '.'-separated parts, or either part is not a valid long
     */
    @Override
    public ProcNodeInterface lookup(String groupIdStr) throws AnalysisException {
        if (groupIdStr == null) {
            throw new AnalysisException("Invalid group id: " + groupIdStr);
        }
        // A limit of -1 keeps trailing empty strings, so malformed input such as "1.2." is
        // rejected instead of being silently treated as "1.2".
        String[] parts = groupIdStr.split("\\.", -1);
        if (parts.length != 2) {
            throw new AnalysisException("Invalid group id: " + groupIdStr);
        }

        long dbId;
        long grpId;
        try {
            // parseLong yields primitives directly; no boxing round-trip.
            dbId = Long.parseLong(parts[0]);
            grpId = Long.parseLong(parts[1]);
        } catch (NumberFormatException e) {
            throw new AnalysisException("Invalid group id: " + groupIdStr);
        }

        GroupId groupId = new GroupId(dbId, grpId);
        ColocateTableIndex index = Env.getCurrentColocateIndex();
        Map<Tag, List<List<Long>>> beSeqs = index.getBackendsPerBucketSeq(groupId);
        Map<String, List<List<Long>>> columns;
        if ((beSeqs == null || beSeqs.isEmpty()) && Config.isCloudMode()) {
            // In cloud mode, legacy backend sequence metadata may be empty. Derive the
            // sequence from current tablets, one column per compute group. This path must
            // not resolve cloud backends in a way that auto-starts a compute group.
            columns = getCloudBackendSeqsFromTablets(groupId, index);
        } else {
            // Local mode: one column per resource tag, keyed by the tag's string form.
            columns = Maps.newLinkedHashMap();
            if (beSeqs != null) {
                for (Map.Entry<Tag, List<List<Long>>> entry : beSeqs.entrySet()) {
                    columns.put(entry.getKey().toString(), entry.getValue());
                }
            }
        }
        return new ColocationGroupBackendSeqsProcNode(columns);
    }

    /**
     * Produces the listing for this directory: the {@link #TITLE_NAMES} header plus the rows
     * reported by the current colocate index.
     *
     * @return the populated proc result
     * @throws AnalysisException declared by the interface contract
     */
    @Override
    public ProcResult fetchResult() throws AnalysisException {
        BaseProcResult procResult = new BaseProcResult();
        procResult.setNames(TITLE_NAMES);
        procResult.setRows(Env.getCurrentColocateIndex().getInfos());
        return procResult;
    }

    /**
     * Derives the backend sequences of a colocation group from its tables' tablets.
     *
     * <p>The group's tables are visited in the order reported by the index; the first table
     * that yields a non-empty result wins. Tables whose database or table object cannot be
     * found, or that are not {@link OlapTable}s, are skipped.
     *
     * @param groupId the colocation group to inspect
     * @param index   the colocate index used to list the group's tables and, when the group's
     *                own db id is 0, to look up each table's db id
     * @return column name -> per-bucket backend id lists; empty if no table produced any
     */
    private Map<String, List<List<Long>>> getCloudBackendSeqsFromTablets(GroupId groupId, ColocateTableIndex index) {
        for (Long tblId : index.getAllTableIds(groupId)) {
            long owningDbId = groupId.dbId;
            if (owningDbId == 0) {
                // The group id carries no db id of its own; ask the index for this table's db.
                Long resolvedDbId = index.getDbIdByTblIdNullable(groupId, tblId);
                if (resolvedDbId == null) {
                    continue;
                }
                owningDbId = resolvedDbId;
            }

            Database database = Env.getCurrentInternalCatalog().getDbNullable(owningDbId);
            Table candidate = database == null ? null : database.getTableNullable(tblId);
            if (candidate instanceof OlapTable) {
                Map<String, List<List<Long>>> seqs = getCloudBackendSeqsFromTable((OlapTable) candidate);
                if (!seqs.isEmpty()) {
                    return seqs;
                }
            }
        }
        // No table of the group produced a usable sequence.
        return Maps.newLinkedHashMap();
    }

    /**
     * Builds, for one table, a per-scope-key list of bucket backend sequences from the
     * replicas of the first partition's base index.
     *
     * <p>Only the replica snapshot is taken under the table read lock. Placement resolution
     * and column-name resolution run after the lock is released, because they call into
     * CloudSystemInfoService / the colocate index and must not nest inside the table lock.
     *
     * @param olapTable the table whose first partition is inspected
     * @return display column name -> one backend id list per tablet; scope keys for which no
     *         replica reports a non-negative backend id are omitted
     */
    private Map<String, List<List<Long>>> getCloudBackendSeqsFromTable(OlapTable olapTable) {
        // Step 1 (locked): copy the replica list of every tablet in the first partition's
        // base index, one entry per tablet in the index's iteration order.
        List<List<Replica>> replicasPerBucket = new ArrayList<>();
        olapTable.readLock();
        try {
            for (Partition partition : olapTable.getAllPartitions()) {
                for (Tablet tablet : partition.getBaseIndex().getTablets()) {
                    replicasPerBucket.add(new ArrayList<>(tablet.getReplicas()));
                }
                // Only the first partition is inspected.
                break;
            }
        } finally {
            olapTable.readUnlock();
        }
        if (replicasPerBucket.isEmpty()) {
            return Maps.newLinkedHashMap();
        }

        // Step 2 (unlocked): ask every replica for its scope key -> backend id mapping. In
        // cloud mode a replica maps to a different BE in each compute group, so sequences are
        // kept separate per scope key rather than merged. The cache map is shared by all
        // replicas of this call so each compute group's backend list is fetched only once.
        Map<String, List<Backend>> backendsByComputeGroup = Maps.newHashMap();
        Set<String> orderedScopeKeys = Sets.newLinkedHashSet();
        List<List<Map<String, Long>>> placementsPerBucket = new ArrayList<>(replicasPerBucket.size());
        for (List<Replica> bucket : replicasPerBucket) {
            List<Map<String, Long>> placements = new ArrayList<>();
            for (Replica replica : bucket) {
                Map<String, Long> placement = replica.getClusterToBackendForProcDisplay(backendsByComputeGroup);
                orderedScopeKeys.addAll(placement.keySet());
                placements.add(placement);
            }
            placementsPerBucket.add(placements);
        }

        // Step 3 (unlocked): for each scope key, in first-seen order, assemble its bucket
        // sequence and register it under the display column name. Name resolution acquires
        // CloudSystemInfoService's lock, which is why it also happens outside the table lock.
        Map<String, List<List<Long>>> seqByColumn = Maps.newLinkedHashMap();
        for (String scopeKey : orderedScopeKeys) {
            List<List<Long>> seq = new ArrayList<>(placementsPerBucket.size());
            boolean anyBackend = false;
            for (List<Map<String, Long>> placements : placementsPerBucket) {
                List<Long> backendIds = new ArrayList<>();
                for (Map<String, Long> placement : placements) {
                    Long backendId = placement.get(scopeKey);
                    if (backendId != null && backendId >= 0) {
                        backendIds.add(backendId);
                        anyBackend = true;
                    }
                }
                // A bucket keeps its slot even when it has no backend for this scope key.
                seq.add(backendIds);
            }
            if (anyBackend) {
                seqByColumn.put(scopeKeyToColumnName(scopeKey), seq);
            }
        }
        return seqByColumn;
    }

    /**
     * Maps a proc-display scope key to its column name.
     *
     * <p>A null or empty key means there is no per-compute-group breakdown (local-style
     * replicas) and is shown as a single "BackendIds" column. Any other key is treated as a
     * cloud compute group id and shown by its compute group name, falling back to the raw id
     * when the name cannot be resolved.
     *
     * @param scopeKey scope key of a bucket sequence; may be null or empty
     * @return "BackendIds", the resolved compute group name, or {@code scopeKey} itself
     */
    private String scopeKeyToColumnName(String scopeKey) {
        if (Strings.isNullOrEmpty(scopeKey)) {
            return "BackendIds";
        }
        try {
            // Check the runtime type explicitly rather than relying on a caught
            // ClassCastException / NullPointerException when the system info service is
            // absent or is not the cloud implementation.
            Object systemInfo = Env.getCurrentSystemInfo();
            if (systemInfo instanceof CloudSystemInfoService) {
                String name = ((CloudSystemInfoService) systemInfo).getClusterNameByClusterId(scopeKey);
                if (!Strings.isNullOrEmpty(name)) {
                    return name;
                }
            }
        } catch (Exception ignored) {
            // Best effort: fall back to the raw compute group id if name resolution fails.
        }
        return scopeKey;
    }
}