be/src/exec/sink/writer/vtablet_writer.h

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once
#include <brpc/controller.h>
#include <bthread/types.h>
#include <butil/errno.h>
#include <fmt/format.h>
#include <gen_cpp/Exprs_types.h>
#include <gen_cpp/FrontendService.h>
#include <gen_cpp/FrontendService_types.h>
#include <gen_cpp/PaloInternalService_types.h>
#include <gen_cpp/Types_types.h>
#include <gen_cpp/internal_service.pb.h>
#include <gen_cpp/types.pb.h>
#include <glog/logging.h>
#include <google/protobuf/stubs/callback.h>

// IWYU pragma: no_include <bits/chrono.h>
#include <bthread/condition_variable.h>
#include <bthread/mutex.h>

#include <atomic>
#include <chrono> // IWYU pragma: keep
#include <cstddef>
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <numeric>
#include <ostream>
#include <queue>
#include <sstream>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "common/config.h"
#include "common/status.h"
#include "core/block/block.h"
#include "core/column/column.h"
#include "core/data_type/data_type.h"
#include "exec/sink/vrow_distribution.h"
#include "exec/sink/vtablet_block_convertor.h"
#include "exec/sink/vtablet_finder.h"
#include "exec/sink/writer/async_result_writer.h"
#include "exprs/vexpr_fwd.h"
#include "runtime/exec_env.h"
#include "runtime/memory/mem_tracker.h"
#include "runtime/runtime_profile.h"
#include "runtime/thread_context.h"
#include "storage/tablet_info.h"
#include "util/brpc_closure.h"
#include "util/stopwatch.hpp"

namespace doris {
class ObjectPool;
class RowDescriptor;
class RuntimeState;
class TDataSink;
class TExpr;
class Thread;
class ThreadPoolToken;
class TupleDescriptor;

// Accumulator for the add_batch rpc statistics of a single node.
struct AddBatchCounter {
    // summed execution time of the add_batch rpcs, in microseconds
    int64_t add_batch_execution_time_us = 0;
    // summed lock waiting time inside the add_batch rpcs, in microseconds
    int64_t add_batch_wait_execution_time_us = 0;
    // how many add_batch calls were counted
    int64_t add_batch_num = 0;
    // time passed between marked close and finish close, in milliseconds
    int64_t close_wait_time_ms = 0;

    // Adds every field of `rhs` onto the matching field of this counter and returns *this.
    AddBatchCounter& operator+=(const AddBatchCounter& rhs) {
        add_batch_execution_time_us =
                add_batch_execution_time_us + rhs.add_batch_execution_time_us;
        add_batch_wait_execution_time_us =
                add_batch_wait_execution_time_us + rhs.add_batch_wait_execution_time_us;
        add_batch_num = add_batch_num + rhs.add_batch_num;
        close_wait_time_ms = close_wait_time_ms + rhs.close_wait_time_ms;
        return *this;
    }

    // Field-wise sum of two counters; both operands are left untouched.
    friend AddBatchCounter operator+(const AddBatchCounter& lhs, const AddBatchCounter& rhs) {
        AddBatchCounter total(lhs);
        return total += rhs;
    }
};

// Context object that WriteBlockCallback passes by const reference to its success and failed
// handlers.
struct WriteBlockCallbackContext {
    // Starts as false; WriteBlockCallback::end_mark() flips it to true.
    std::atomic<bool> _is_last_rpc {false};
};

// It's very error-prone to guarantee the destruction order between the variables captured by the
// handlers and this closure, so using create() to get the closure pointer is recommended. We can
// then delete the closure ptr before the captured variables are destroyed.
// Deleting this pointer is safe, don't worry that the RPC callback will run after
// WriteBlockCallback is deleted.
// "Ping-Pong" between sender and receiver: `try_set_in_flight` when sending, `clear_in_flight`
// after rpc failure or callback, then the next send will start, and it will wait for the rpc
// callback to complete when it is destroyed.
template <typename T>
class WriteBlockCallback final : public ::doris::DummyBrpcCallback<T> {
    ENABLE_FACTORY_CREATOR(WriteBlockCallback);

public:
    WriteBlockCallback() : cid(INVALID_BTHREAD_ID) {}
    ~WriteBlockCallback() override = default;

    // Stores a copy of `fn`; call() invokes it when the controller reports a failure.
    void addFailedHandler(const std::function<void(const WriteBlockCallbackContext&)>& fn) {
        failed_handler = fn;
    }
    // Stores a copy of `fn`; call() invokes it with the response when the rpc did not fail.
    void addSuccessHandler(
            const std::function<void(const T&, const WriteBlockCallbackContext&)>& fn) {
        success_handler = fn;
    }

    // Blocks the calling thread until no packet is in flight any more.
    void join() override {
        // We rely on in_flight to assure one rpc is running,
        // while cid is not reliable due to memory order.
        // in_flight is written before getting callid,
        // so we can not use memory fence to synchronize.
        while (_packet_in_flight) {
            // cid here is complicated
            if (cid != INVALID_BTHREAD_ID) {
                // actually cid may be the last rpc call id.
                brpc::Join(cid);
            }
            // Joining a stale cid returns without waiting for the current rpc, so poll
            // the flag again and back off for 10ms before the next round.
            if (_packet_in_flight) {
                std::this_thread::sleep_for(std::chrono::milliseconds(10));
            }
        }
    }

    // Resets the controller for the next rpc and remembers its call id for join().
    // plz follow this order: reset() -> set_in_flight() -> send brpc batch
    // NOTE(review): cid is a plain member that join() reads without synchronization —
    // confirm reset() and join() cannot run concurrently.
    void reset() {
        ::doris::DummyBrpcCallback<T>::cntl_->Reset();
        cid = ::doris::DummyBrpcCallback<T>::cntl_->call_id();
    }

    // if _packet_in_flight == false, set it to true. Return true.
    // if _packet_in_flight == true, Return false.
    bool try_set_in_flight() {
        bool value = false;
        return _packet_in_flight.compare_exchange_strong(value, true);
    }

    // Marks that no packet is in flight, which allows the next try_set_in_flight() to succeed.
    void clear_in_flight() { _packet_in_flight = false; }

    // Returns the current value of the in-flight flag.
    bool is_packet_in_flight() { return _packet_in_flight; }

    // Marks the context so that handlers see `_is_last_rpc == true`.
    // Must be called at most once (checked by DCHECK in debug builds).
    void end_mark() {
        DCHECK(_ctx._is_last_rpc == false);
        _ctx._is_last_rpc = true;
    }

    // Rpc completion entry point: dispatches to the failed or success handler and only
    // afterwards clears the in-flight flag, so join() keeps waiting while a handler runs.
    // NOTE(review): neither handler is checked for being empty before it is invoked, so both
    // must have been registered before the first rpc completes.
    void call() override {
        DCHECK(_packet_in_flight);
        if (::doris::DummyBrpcCallback<T>::cntl_->Failed()) {
            LOG(WARNING) << "failed to send brpc batch, error="
                         << berror(::doris::DummyBrpcCallback<T>::cntl_->ErrorCode())
                         << ", error_text=" << ::doris::DummyBrpcCallback<T>::cntl_->ErrorText();
            failed_handler(_ctx);
        } else {
            success_handler(*(::doris::DummyBrpcCallback<T>::response_), _ctx);
        }
        clear_in_flight();
    }

private:
    // call id captured by the latest reset(); INVALID_BTHREAD_ID until reset() is called.
    brpc::CallId cid;
    // true while an rpc is considered outstanding (set by try_set_in_flight()).
    std::atomic<bool> _packet_in_flight {false};
    // context passed to both handlers.
    WriteBlockCallbackContext _ctx;
    std::function<void(const WriteBlockCallbackContext&)> failed_handler;
    std::function<void(const T&, const WriteBlockCallbackContext&)> success_handler;
};

class IndexChannel;
class VTabletWriter;

// Timing statistics of a node channel; every field is a duration in nanoseconds.
class VNodeChannelStat {
public:
    int64_t mem_exceeded_block_ns = 0;
    int64_t where_clause_ns = 0;
    int64_t append_node_channel_ns = 0;

    // Adds each timer of `stat` onto the matching timer of this object and returns *this.
    VNodeChannelStat& operator+=(const VNodeChannelStat& stat) {
        append_node_channel_ns = append_node_channel_ns + stat.append_node_channel_ns;
        where_clause_ns = where_clause_ns + stat.where_clause_ns;
        mem_exceeded_block_ns = mem_exceeded_block_ns + stat.mem_exceeded_block_ns;
        return *this;
    }
};

// Writer-level statistics. VNodeChannel::time_report() adds each node channel's values onto
// the summed fields below; the max_* fields and num_node_channels are not written anywhere in
// this header.
struct WriterStats {
    int64_t serialize_batch_ns = 0;
    int64_t queue_push_lock_ns = 0;
    int64_t actual_consume_ns = 0;
    // add_batch execution time, converted from microseconds to nanoseconds by time_report()
    int64_t total_add_batch_exec_time_ns = 0;
    int64_t max_add_batch_exec_time_ns = 0;
    // add_batch wait time, converted from microseconds to nanoseconds by time_report()
    int64_t total_wait_exec_time_ns = 0;
    int64_t max_wait_exec_time_ns = 0;
    int64_t total_add_batch_num = 0;
    int64_t num_node_channels = 0;
    int64_t load_back_pressure_version_time_ms = 0;
    // summed VNodeChannelStat of the reported node channels
    VNodeChannelStat channel_stat;
};

// Argument bundle passed (by const pointer) to VNodeChannel::add_block() together with a block.
// NOTE(review): presumably describes which rows of the block go to one node channel — confirm
// against the code that fills it.
struct Payload {
    // owned selector; presumably the row indices of the block to append — TODO confirm
    std::unique_ptr<IColumn::Selector> row_ids;
    // not owned; nullptr until assigned
    RowPartTabletIds* row_part_tablet_ids = nullptr;
    std::vector<uint32_t> route_idxs;
};

// Every NodeChannel keeps a data transmission channel with one BE. Because it can be opened
// multiple times (incremental open), it holds several requests and their corresponding closures.
class VNodeChannel {
public:
    VNodeChannel(VTabletWriter* parent, IndexChannel* index_channel, int64_t node_id,
                 bool is_incremental = false);

    ~VNodeChannel();

    // called before open, used to add tablet located in this backend. called by IndexChannel::init
    // The tablet is queued in _tablets_wait_open.
    void add_tablet(const TTabletWithPartition& tablet) { _tablets_wait_open.emplace_back(tablet); }
    // Returns a printable dump of every tablet in _all_tablets, one per line.
    std::string debug_tablets() const {
        std::stringstream ss;
        for (const auto& tab : _all_tablets) {
            tab.printTo(ss);
            ss << '\n';
        }
        return ss.str();
    }

    // Records (or replaces) the slave node list of `tablet_id`.
    void add_slave_tablet_nodes(int64_t tablet_id, const std::vector<int64_t>& slave_nodes) {
        _slave_tablet_nodes[tablet_id] = slave_nodes;
    }

    // this function is NON_REENTRANT
    Status init(RuntimeState* state);
    /// The next two functions both call _open_internal, which therefore has to stay REENTRANT.
    // build corresponding connect to BE. NON-REENTRANT
    void open();
    // for auto partition, we use this to open more tablet. KEEP IT REENTRANT
    void incremental_open();
    // this will block until all request transmissions which were opened or incrementally opened
    // have finished. this function will be called multiple times. NON_REENTRANT
    Status open_wait();

    Status add_block(Block* block, const Payload* payload);

    // @return: 1 if running, 0 if finished.
    // @caller: the sender loop (_send_batch_process). it's a continual asynchronous process.
    int try_send_and_fetch_status(RuntimeState* state,
                                  std::unique_ptr<ThreadPoolToken>& thread_pool_token);
    // when there's pending block found by try_send_and_fetch_status(), we will awake a thread to send it.
    void try_send_pending_block(RuntimeState* state);

    void clear_all_blocks();

    // two ways to stop channel:
    // 1. mark_close()->close_wait() PS. close_wait() will block waiting for the last AddBatch rpc response.
    // 2. just cancel()
    // hang_wait = true will make the receiver hang until all senders have mark_closed.
    void mark_close(bool hang_wait = false);

    // NOTE(review): reads _is_closed without taking _closed_lock — confirm callers tolerate that.
    bool is_closed() const { return _is_closed; }
    bool is_cancelled() const { return _cancelled; }
    // Returns the stored cancel message, or a generic "<channel_info> is cancelled" text when
    // no message has been stored. Takes _cancel_msg_lock.
    std::string get_cancel_msg() {
        std::lock_guard<std::mutex> l(_cancel_msg_lock);
        if (!_cancel_msg.empty()) {
            return _cancel_msg;
        }
        return fmt::format("{} is cancelled", channel_info());
    }

    // Second half of the mark_close()->close_wait() stop sequence described above:
    // close_wait() will block waiting for the last AddBatch rpc response.
    Status close_wait(RuntimeState* state, bool* is_closed);

    Status after_close_handle(
            RuntimeState* state, WriterStats* writer_stats,
            std::unordered_map<int64_t, AddBatchCounter>* node_add_batch_counter_map);

    Status check_status();

    void cancel(const std::string& cancel_msg);

    // Reports this channel's statistics into the optional outputs (either may be nullptr).
    // The add_batch counters are accumulated under this channel's node id, while
    // close_wait_time_ms is overwritten with _close_time_ms instead of being accumulated.
    void time_report(std::unordered_map<int64_t, AddBatchCounter>* add_batch_counter_map,
                     WriterStats* writer_stats) const {
        if (add_batch_counter_map != nullptr) {
            (*add_batch_counter_map)[_node_id] += _add_batch_counter;
            (*add_batch_counter_map)[_node_id].close_wait_time_ms = _close_time_ms;
        }
        if (writer_stats != nullptr) {
            writer_stats->serialize_batch_ns += _serialize_batch_ns;
            writer_stats->channel_stat += _stat;
            writer_stats->queue_push_lock_ns += _queue_push_lock_ns;
            writer_stats->actual_consume_ns += _actual_consume_ns;
            // the counters are kept in microseconds, the writer stats in nanoseconds
            writer_stats->total_add_batch_exec_time_ns +=
                    (_add_batch_counter.add_batch_execution_time_us * 1000);
            writer_stats->total_wait_exec_time_ns +=
                    (_add_batch_counter.add_batch_wait_execution_time_us * 1000);
            writer_stats->total_add_batch_num += _add_batch_counter.add_batch_num;
            writer_stats->load_back_pressure_version_time_ms +=
                    _load_back_pressure_version_block_ms;
        }
    }

    int64_t node_id() const { return _node_id; }
    std::string host() const { return _node_info.host; }
    std::string name() const { return _name; }

    // Human readable identification: "<name>, <load info>, node=<host>:<brpc port>".
    std::string channel_info() const {
        return fmt::format("{}, {}, node={}:{}", _name, _load_info, _node_info.host,
                           _node_info.brpc_port);
    }

    // Current value of the pending-bytes counter.
    size_t get_pending_bytes() { return _pending_batches_bytes; }

    bool is_incremental() const { return _is_incremental; }

    int64_t write_bytes() const { return _write_bytes.load(); }

protected:
    // make a real open request for relative BE's load channel.
    void _open_internal(bool is_incremental);
    void _set_adaptive_random_bucket_open_request(PTabletWriterOpenRequest* request);

    void _close_check();
    void _cancel_with_msg(const std::string& msg);

    // Handlers for the add-block rpc result; their signatures match the success / failed
    // handlers accepted by WriteBlockCallback.
    void _add_block_success_callback(const PTabletWriterAddBlockResult& result,
                                     const WriteBlockCallbackContext& ctx);
    void _add_block_failed_callback(const WriteBlockCallbackContext& ctx);

    void _refresh_back_pressure_version_wait_time(
            const ::google::protobuf::RepeatedPtrField<::doris::PTabletLoadRowsetInfo>&
                    tablet_load_infos);

    // not owned
    VTabletWriter* _parent = nullptr;
    // not owned; see the comment on _closed_lock about its lifetime
    IndexChannel* _index_channel = nullptr;
    int64_t _node_id = -1;
    std::string _load_info;
    std::string _name;

    std::shared_ptr<MemTracker> _node_channel_tracker;
    int64_t _load_mem_limit = -1;

    TupleDescriptor* _tuple_desc = nullptr;
    NodeInfo _node_info;

    // this should be set in init() using config
    int _rpc_timeout_ms = 60000;
    int64_t _next_packet_seq = 0;
    MonotonicStopWatch _timeout_watch;

    // the time between this node channel being marked closed and finishing its close
    uint64_t _close_time_ms = 0;

    // user cancel or get some errors
    std::atomic<bool> _cancelled {false};
    // protects _cancel_msg
    std::mutex _cancel_msg_lock;
    std::string _cancel_msg;

    // send finished means the consumer thread which sends the rpc can exit
    std::atomic<bool> _send_finished {false};

    // add batches finished means the last rpc has been responded to, used to check whether this
    // channel can be closed
    std::atomic<bool> _add_batches_finished {false}; // reuse for vectorized

    bool _eos_is_produced {false}; // only for restricting producer behaviors

    std::unique_ptr<RowDescriptor> _row_desc;
    int _batch_size = 0;

    // limit _pending_batches size
    std::atomic<size_t> _pending_batches_bytes {0};
    size_t _max_pending_batches_bytes {(size_t)config::nodechannel_pending_queue_max_bytes};
    std::mutex _pending_batches_lock;          // reuse for vectorized
    std::atomic<int> _pending_batches_num {0}; // reuse for vectorized

    std::shared_ptr<PBackendService_Stub> _stub;
    // because we have incremental open, we should keep one relative closure for one request.
    // it's similar for adding block.
    std::vector<std::shared_ptr<DummyBrpcCallback<PTabletWriterOpenResult>>> _open_callbacks;

    std::vector<TTabletWithPartition> _all_tablets;
    // tablets added through add_tablet() that are still waiting to be opened
    std::vector<TTabletWithPartition> _tablets_wait_open;
    // For rolling-upgrade compatibility, adaptive random bucket add-block RPCs also carry
    // tablet_ids. New receivers ignore them and route by partition id, while old receivers use
    // this local tablet id instead of failing on an empty tablet_ids list.
    std::unordered_map<int64_t, int64_t> _adaptive_partition_compat_tablets;
    // map from tablet_id to node_id where slave replicas locate in
    std::unordered_map<int64_t, std::vector<int64_t>> _slave_tablet_nodes;
    std::vector<TTabletCommitInfo> _tablet_commit_infos;

    AddBatchCounter _add_batch_counter;
    std::atomic<int64_t> _serialize_batch_ns {0};
    std::atomic<int64_t> _queue_push_lock_ns {0};
    std::atomic<int64_t> _actual_consume_ns {0};
    std::atomic<int64_t> _load_back_pressure_version_block_ms {0};

    VNodeChannelStat _stat;
    // lock to protect _is_closed.
    // The methods in the IndexChannel are called back in the RpcClosure in the NodeChannel.
    // However, this rpc callback may occur after the whole task is finished (e.g. due to network latency),
    // and by that time the IndexChannel may have been destructed, so we should not call the
    // IndexChannel methods anymore, otherwise the BE will crash.
    // Therefore, we use the _is_closed and _closed_lock to ensure that the RPC callback
    // function will not call the IndexChannel method after the NodeChannel is closed.
    // The IndexChannel is definitely accessible until the NodeChannel is closed.
    std::mutex _closed_lock;
    bool _is_closed = false;
    bool _inited = false;

    RuntimeState* _state = nullptr;
    // A context lock for callbacks, the callback has to lock the ctx, to avoid
    // the object being deleted while the callback is running.
    std::weak_ptr<TaskExecutionContext> _task_exec_ctx;
    // rows number received per tablet, tablet_id -> rows_num
    std::vector<std::pair<int64_t, int64_t>> _tablets_received_rows;
    // rows number filtered per tablet, tablet_id -> filtered_rows_num
    std::vector<std::pair<int64_t, int64_t>> _tablets_filtered_rows;

    // build a _cur_mutable_block and push into _pending_blocks. when not building, this block is empty.
    std::unique_ptr<MutableBlock> _cur_mutable_block;
    std::shared_ptr<PTabletWriterAddBlockRequest> _cur_add_block_request;

    // a queued block together with the request it will be sent with
    using AddBlockReq =
            std::pair<std::unique_ptr<MutableBlock>, std::shared_ptr<PTabletWriterAddBlockRequest>>;
    std::queue<AddBlockReq> _pending_blocks;
    // sending blocks to the slave BE relies on this. don't reconstruct it.
    std::shared_ptr<WriteBlockCallback<PTabletWriterAddBlockResult>> _send_block_callback = nullptr;

    int64_t _wg_id = -1;

    // has no in-class initializer; expected to be set by the constructor from its
    // `is_incremental` argument — TODO confirm in the .cpp
    bool _is_incremental;

    std::atomic<int64_t> _write_bytes {0};
    std::atomic<int64_t> _load_back_pressure_version_wait_time_ms {0};
};

// An IndexChannel is related to a specific table and its rollup and mv. It owns one
// VNodeChannel per backend (keyed by backend id) and keeps the per-tablet failure and row-count
// bookkeeping for that index.
class IndexChannel {
public:
    // `parent` is not owned. `where_clause` is moved into the channel. A MemTracker labelled
    // with the index id is created for the channel.
    IndexChannel(VTabletWriter* parent, int64_t index_id, VExprContextSPtr where_clause)
            : _parent(parent), _index_id(index_id), _where_clause(std::move(where_clause)) {
        _index_channel_tracker =
                std::make_unique<MemTracker>("IndexChannel:indexID=" + std::to_string(_index_id));
    }
    ~IndexChannel() = default;

    // allow to init multi times, for incremental open more tablets for one index(table)
    Status init(RuntimeState* state, const std::vector<TTabletWithPartition>& tablets,
                bool incremental = false);

    // Invokes `func` on every node channel.
    void for_each_node_channel(
            const std::function<void(const std::shared_ptr<VNodeChannel>&)>& func) {
        for (auto& it : _node_channels) {
            func(it.second);
        }
    }

    // Invokes `func` on every node channel that is NOT incremental.
    void for_init_node_channel(
            const std::function<void(const std::shared_ptr<VNodeChannel>&)>& func) {
        for (auto& it : _node_channels) {
            if (!it.second->is_incremental()) {
                func(it.second);
            }
        }
    }

    // Invokes `func` on every incremental node channel.
    void for_inc_node_channel(
            const std::function<void(const std::shared_ptr<VNodeChannel>&)>& func) {
        for (auto& it : _node_channels) {
            if (it.second->is_incremental()) {
                func(it.second);
            }
        }
    }

    // Returns the ids (keys of _node_channels) of all non-incremental node channels.
    std::unordered_set<int64_t> init_node_channel_ids() {
        std::unordered_set<int64_t> node_channel_ids;
        for (auto& it : _node_channels) {
            if (!it.second->is_incremental()) {
                node_channel_ids.insert(it.first);
            }
        }
        return node_channel_ids;
    }

    // Returns the ids (keys of _node_channels) of all incremental node channels.
    std::unordered_set<int64_t> inc_node_channel_ids() {
        std::unordered_set<int64_t> node_channel_ids;
        for (auto& it : _node_channels) {
            if (it.second->is_incremental()) {
                node_channel_ids.insert(it.first);
            }
        }
        return node_channel_ids;
    }

    // Returns the ids (keys of _node_channels) of all node channels.
    std::unordered_set<int64_t> each_node_channel_ids() {
        std::unordered_set<int64_t> node_channel_ids;
        for (auto& it : _node_channels) {
            node_channel_ids.insert(it.first);
        }
        return node_channel_ids;
    }

    bool has_incremental_node_channel() const { return _has_inc_node; }

    void mark_as_failed(const VNodeChannel* node_channel, const std::string& err,
                        int64_t tablet_id = -1);
    Status check_intolerable_failure();

    Status close_wait(RuntimeState* state, WriterStats* writer_stats,
                      std::unordered_map<int64_t, AddBatchCounter>* node_add_batch_counter_map,
                      std::unordered_set<int64_t> unfinished_node_channel_ids,
                      bool need_wait_after_quorum_success);

    // Current value of the close-wait version counter (acquire load).
    int64_t close_wait_version() const {
        return _close_wait_version.load(std::memory_order_acquire);
    }

    void wait_for_close_event(int64_t observed_version, int64_t timeout_ms);

    void notify_close_wait();

    Status check_each_node_channel_close(
            std::unordered_set<int64_t>* unfinished_node_channel_ids,
            std::unordered_map<int64_t, AddBatchCounter>* node_add_batch_counter_map,
            WriterStats* writer_stats, Status status);

    // set error tablet info in runtime state, so that it can be returned to FE.
    void set_error_tablet_in_state(RuntimeState* state);

    size_t num_node_channels() const { return _node_channels.size(); }

    // Sum of the pending bytes reported by every node channel.
    size_t get_pending_bytes() const {
        size_t mem_consumption = 0;
        for (const auto& kv : _node_channels) {
            mem_consumption += kv.second->get_pending_bytes();
        }
        return mem_consumption;
    }

    void set_tablets_received_rows(
            const std::vector<std::pair<int64_t, int64_t>>& tablets_received_rows, int64_t node_id);

    void set_tablets_filtered_rows(
            const std::vector<std::pair<int64_t, int64_t>>& tablets_filtered_rows, int64_t node_id);

    // Returns the total number of filtered rows, taking for each tablet the count stored in
    // its first <node_id, filtered_rows_num> entry.
    int64_t num_rows_filtered() {
        // the Unique table has no roll up or materialized view
        // we just add up filtered rows from all partitions
        //
        // The initial value must be an int64_t: std::accumulate keeps its running sum in the
        // type of the initial value, so a plain `0` would truncate the sum to int on every step.
        // Tablets without any recorded entry contribute nothing instead of being indexed.
        return std::accumulate(_tablets_filtered_rows.cbegin(), _tablets_filtered_rows.cend(),
                               int64_t {0}, [](int64_t sum, const auto& a) {
                                   return a.second.empty() ? sum : sum + a.second.front().second;
                               });
    }

    // check whether the rows num written by different replicas is consistent
    Status check_tablet_received_rows_consistency();

    // check whether the rows num filtered by different replicas is consistent
    Status check_tablet_filtered_rows_consistency();

    void set_start_time(const int64_t& start_time) { _start_time = start_time; }

    // Returns a copy of the where-clause context given to the constructor.
    VExprContextSPtr get_where_clause() { return _where_clause; }

private:
    friend class VNodeChannel;
    friend class VTabletWriter;
    friend class VRowDistribution;

    int _max_failed_replicas(int64_t tablet_id);

    int _load_required_replicas_num(int64_t tablet_id);

    bool _quorum_success(const std::unordered_set<int64_t>& unfinished_node_channel_ids,
                         const std::unordered_set<int64_t>& need_finish_tablets);

    int64_t _calc_max_wait_time_ms(const std::unordered_set<int64_t>& unfinished_node_channel_ids);

    // not owned
    VTabletWriter* _parent = nullptr;
    int64_t _index_id;
    VExprContextSPtr _where_clause;

    // from backend channel to tablet_id
    // ATTN: must be placed before `_node_channels` and `_channels_by_tablet`.
    // Because the destruct order of objects is opposite to the creation order.
    // So NodeChannel will be destructed first.
    // And the destructor function of NodeChannel waits for all RPCs to finish.
    // This ensures that it is safe to use `_tablets_by_channel` in the callback function for the end of the RPC.
    std::unordered_map<int64_t, std::unordered_set<int64_t>> _tablets_by_channel;
    // BeId -> channel
    std::unordered_map<int64_t, std::shared_ptr<VNodeChannel>> _node_channels;
    // from tablet_id to backend channel
    std::unordered_map<int64_t, std::vector<std::shared_ptr<VNodeChannel>>> _channels_by_tablet;
    // from partition_id to FE-planned bucket owner channel in cloud adaptive random bucket mode
    std::unordered_map<int64_t, std::shared_ptr<VNodeChannel>> _channels_by_partition;
    bool _has_inc_node = false;

    // lock to protect _failed_channels and _failed_channels_msgs
    mutable std::mutex _fail_lock;
    // key is tablet_id, value is a set of failed node id
    std::unordered_map<int64_t, std::unordered_set<int64_t>> _failed_channels;
    // key is tablet_id, value is error message
    std::unordered_map<int64_t, std::string> _failed_channels_msgs;
    Status _intolerable_failure_status = Status::OK();

    std::unique_ptr<MemTracker> _index_channel_tracker;
    // rows num received by DeltaWriter per tablet, tablet_id -> <node_Id, rows_num>
    // used to verify whether the rows num received by different replicas is consistent
    std::map<int64_t, std::vector<std::pair<int64_t, int64_t>>> _tablets_received_rows;

    // rows num filtered by DeltaWriter per tablet, tablet_id -> <node_Id, filtered_rows_num>
    // used to verify whether the rows num filtered by different replicas is consistent
    std::map<int64_t, std::vector<std::pair<int64_t, int64_t>>> _tablets_filtered_rows;

    int64_t _start_time = 0;

    // version counter read by close_wait_version(); the mutex and condition variable below
    // are declared for the wait_for_close_event() / notify_close_wait() pair
    std::atomic<int64_t> _close_wait_version {0};
    bthread::Mutex _close_wait_mutex;
    bthread::ConditionVariable _close_wait_cv;
};
} // namespace doris

namespace doris {
//
// Asynchronous result writer (see AsyncResultWriter) of the OLAP tablet sink: it holds one
// IndexChannel per index and the sender thread that drives their node channels.
// NOTE(review): the previous header comment said "write result to file", which does not match
// the members declared below — confirm the intended wording.
class VTabletWriter final : public AsyncResultWriter {
public:
    VTabletWriter(const TDataSink& t_sink, const VExprContextSPtrs& output_exprs,
                  std::shared_ptr<Dependency> dep, std::shared_ptr<Dependency> fin_dep);

    Status write(RuntimeState* state, Block& block) override;

    Status close(Status) override;

    Status open(RuntimeState* state, RuntimeProfile* profile) override;

    // the consumer func of sending pending batches in every NodeChannel.
    // use polling & NodeChannel::try_send_and_fetch_status() to achieve nonblocking sending.
    // only focus on pending batches and channel status, the internal errors of NodeChannels will be handled by the producer
    void _send_batch_process();

    Status on_partitions_created(TCreatePartitionResult* result);

    Status _send_new_partition_batch();

private:
    friend class VNodeChannel;
    friend class IndexChannel;

    // payload per node channel, and one such map per index channel
    using ChannelDistributionPayload = std::unordered_map<VNodeChannel*, Payload>;
    using ChannelDistributionPayloadVec = std::vector<std::unordered_map<VNodeChannel*, Payload>>;

    Status _init_row_distribution();

    Status _init(RuntimeState* state, RuntimeProfile* profile);

    Status _generate_one_index_channel_payload(RowPartTabletIds& row_part_tablet_tuple,
                                               int32_t index_idx,
                                               ChannelDistributionPayload& channel_payload);

    Status _generate_index_channels_payloads(std::vector<RowPartTabletIds>& row_part_tablet_ids,
                                             ChannelDistributionPayloadVec& payload);

    void _cancel_all_channel(Status status);

    Status _incremental_open_node_channel(const std::vector<TOlapTablePartition>& partitions);

    void _do_try_close(RuntimeState* state, const Status& exec_status);

    void _build_tablet_replica_info(const int64_t tablet_id, VOlapTablePartition* partition);

    TDataSink _t_sink;

    std::shared_ptr<MemTracker> _mem_tracker;

    ObjectPool* _pool = nullptr;

    // bthread id of the sender thread; 0 until it is assigned
    bthread_t _sender_thread = 0;

    // unique load id
    PUniqueId _load_id;
    int64_t _txn_id = -1;
    int _num_replicas = -1;
    int _tuple_desc_id = -1;

    // this is tuple descriptor of destination OLAP table
    TupleDescriptor* _output_tuple_desc = nullptr;
    RowDescriptor* _output_row_desc = nullptr;

    // number of senders used to insert into OlapTable, if we only support single node insert,
    // all data from select should be collected and then sent to OlapTable.
    // To support multiple senders, we maintain a channel for each sender.
    int _sender_id = -1;
    int _num_senders = -1;
    bool _is_high_priority = false;

    // TODO(zc): think about cache this data
    std::shared_ptr<OlapTableSchemaParam> _schema;
    OlapTableLocationParam* _location = nullptr;
    bool _write_single_replica = false;
    OlapTableLocationParam* _slave_location = nullptr;
    DorisNodesInfo* _nodes_info = nullptr;

    std::unique_ptr<OlapTabletFinder> _tablet_finder;

    // index_channel
    bthread::Mutex _stop_check_channel;
    std::vector<std::shared_ptr<IndexChannel>> _channels;
    std::unordered_map<int64_t, std::shared_ptr<IndexChannel>> _index_id_to_channel;

    std::unique_ptr<ThreadPoolToken> _send_batch_thread_pool_token;

    // support only one partition column now
    std::vector<std::vector<TStringLiteral>> _partitions_need_create;

    std::unique_ptr<OlapTableBlockConvertor> _block_convertor;
    // Stats for this
    int64_t _send_data_ns = 0;
    int64_t _number_input_rows = 0;
    int64_t _number_output_rows = 0;
    int64_t _filter_ns = 0;

    MonotonicStopWatch _row_distribution_watch;

    // Profile counters; all nullptr until they are assigned.
    RuntimeProfile::Counter* _input_rows_counter = nullptr;
    RuntimeProfile::Counter* _output_rows_counter = nullptr;
    RuntimeProfile::Counter* _filtered_rows_counter = nullptr;
    RuntimeProfile::Counter* _send_data_timer = nullptr;
    RuntimeProfile::Counter* _row_distribution_timer = nullptr;
    RuntimeProfile::Counter* _append_node_channel_timer = nullptr;
    RuntimeProfile::Counter* _filter_timer = nullptr;
    RuntimeProfile::Counter* _where_clause_timer = nullptr;
    RuntimeProfile::Counter* _add_partition_request_timer = nullptr;
    RuntimeProfile::Counter* _wait_mem_limit_timer = nullptr;
    RuntimeProfile::Counter* _validate_data_timer = nullptr;
    RuntimeProfile::Counter* _open_timer = nullptr;
    RuntimeProfile::Counter* _close_timer = nullptr;
    RuntimeProfile::Counter* _non_blocking_send_timer = nullptr;
    RuntimeProfile::Counter* _non_blocking_send_work_timer = nullptr;
    RuntimeProfile::Counter* _serialize_batch_timer = nullptr;
    RuntimeProfile::Counter* _total_add_batch_exec_timer = nullptr;
    RuntimeProfile::Counter* _max_add_batch_exec_timer = nullptr;
    RuntimeProfile::Counter* _total_wait_exec_timer = nullptr;
    RuntimeProfile::Counter* _max_wait_exec_timer = nullptr;
    RuntimeProfile::Counter* _add_batch_number = nullptr;
    RuntimeProfile::Counter* _num_node_channels = nullptr;
    RuntimeProfile::Counter* _load_back_pressure_version_time_ms = nullptr;

    // the timeout of load channels opened by this tablet sink. in second
    int64_t _load_channel_timeout_s = 0;
    // the load txn absolute expiration time.
    int64_t _txn_expiration = 0;

    int32_t _send_batch_parallelism = 1;
    // Save the status of try_close() and close() method
    Status _close_status;
    // if we called try_close(), for auto partition the periodic send thread should stop if it's still waiting for node channels first-time open.
    // atomic: written by pthread (_do_try_close), read by bthread (_send_batch_process)
    std::atomic<bool> _try_close {false};
    bool _inited = false;
    bool _write_file_cache = false;

    // User can change this config at runtime, avoid it being modified during query or loading process.
    bool _transfer_large_data_by_brpc = false;

    VOlapTablePartitionParam* _vpartition = nullptr;

    RuntimeState* _state = nullptr; // not owned, set when open

    VRowDistribution _row_distribution;
    // reuse to avoid frequent memory allocation and release.
    std::vector<RowPartTabletIds> _row_part_tablet_ids;

    // tablet_id -> <total replicas num, load required replicas num>
    std::unordered_map<int64_t, std::pair<int, int>> _tablet_replica_info;

    // tablet_id -> set of backend_ids that have version gaps
    // these backends' success should not be counted for majority write
    std::unordered_map<int64_t, std::unordered_set<int64_t>> _tablet_version_gap_backends;
};
} // namespace doris

Coverage Report

Created: 2026-06-29 23:23