Ceph 写流程的回调简述

2,057 阅读3分钟

TLDR:直接浏览第 7 节。

1. issue_repop

issue_repop 创建 C_OSD_RepopCommit 对象 on_all_commit,未来会在所有 osd(包括主 osd 自身)都完成写入后被调用一次。

// Creates the C_OSD_RepopCommit callback for this repop and passes it,
// together with fields taken from the op context, to
// pgbackend->submit_transaction. Other steps of the function are elided.
void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx) {
    // ...
    // Callback wrapping this PG and the repop; its finish() calls
    // repop_all_committed(). Passed below as the on_all_commit argument.
    Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
    // ...
    pgbackend->submit_transaction(
        soid,
        ctx->delta_stats,
        ctx->at_version,
        std::move(ctx->op_t),
        recovery_state.get_pg_trim_to(),
        recovery_state.get_min_last_complete_ondisk(),
        std::move(ctx->log),
        ctx->updated_hset_history,
        on_all_commit,
        repop->rep_tid,
        ctx->reqid,
        ctx->op);
}

2. submit_transaction

issue_repop 中调用 submit_transaction

在该函数中,创建 InProgressOp 实例 op,该实例主要在 issue_op 中用到。

在调用 issue_op 后,注册 C_OSD_OnOpCommit 回调,关于该回调见第 6 节。

// Reference-counted record (derives from RefCountedObject) for one operation
// identified by tid. Only the on_commit member and the constructor are shown;
// the remaining members are elided.
struct InProgressOp : public RefCountedObject
{
    // ...
    // Callback fired with complete(0) by do_repop_reply / op_commit once
    // waiting_for_commit is empty; set to 0 after it has been fired.
    Context *on_commit;
    // ...
    // Stores the transaction id, the commit callback, the originating request
    // and the version.
    InProgressOp(ceph_tid_t tid, Context *on_commit, OpRequestRef op, eversion_t v)
        : tid(tid), on_commit(on_commit),
        op(op), v(v) {}
    // ...
};

// Registers a new InProgressOp under tid (carrying on_all_commit as its
// on_commit callback), calls issue_op for it, and registers a
// C_OSD_OnOpCommit callback on the transaction op_t.
// NOTE(review): added, removed, log_entries and op_t are defined in the
// elided parts of this function and are not visible in this excerpt.
void ReplicatedBackend::submit_transaction(
    const hobject_t &soid,
    const object_stat_sum_t &delta_stats,
    const eversion_t &at_version,
    PGTransactionUPtr &&_t,
    const eversion_t &trim_to,
    const eversion_t &min_last_complete_ondisk,
    vector<pg_log_entry_t> &&_log_entries,
    std::optional<pg_hit_set_history_t> &hset_history,
    Context *on_all_commit,
    ceph_tid_t tid,
    osd_reqid_t reqid,
    OpRequestRef orig_op)
{
    // ...
    // Insert a fresh InProgressOp keyed by tid; on_all_commit becomes its
    // on_commit member.
    auto insert_res = in_progress_ops.insert(
        make_pair(
            tid,
            ceph::make_ref<InProgressOp>(
                tid, on_all_commit,
                orig_op, at_version)));
            // The insertion must have succeeded, i.e. tid was not already present.
            ceph_assert(insert_res.second);
            InProgressOp &op = *insert_res.first->second;

    // Seed waiting_for_commit with every shard returned by
    // get_acting_recovery_backfill_shards().
    op.waiting_for_commit.insert(
        parent->get_acting_recovery_backfill_shards().begin(),
        parent->get_acting_recovery_backfill_shards().end());

    issue_op(
        soid,
        at_version,
        tid,
        reqid,
        trim_to,
        min_last_complete_ondisk,
        // First element of added / removed, or a default hobject_t when empty.
        added.size() ? *(added.begin()) : hobject_t(),
        removed.size() ? *(removed.begin()) : hobject_t(),
        log_entries,
        hset_history,
        &op,
        op_t);
      
    // ...
    // Register the commit callback on op_t; C_OSD_OnOpCommit::finish() calls
    // op_commit() with this InProgressOp.
    op_t.register_on_commit(
        parent->bless_context(
        new C_OSD_OnOpCommit(this, &op)));
    // ...
}

3. issue_op

issue_op 中调用 generate_subop 生成 MOSDRepOp 类型的消息 wr(该消息的 type 为 MSG_OSD_REPOP),接着将 wr 发送至从 osd 上。

// For every shard in get_acting_recovery_backfill_shards() other than this
// one, builds a message with generate_subop and sends it to that shard's OSD
// via send_message_osd_cluster.
// NOTE(review): logs and pinfo are defined in the elided parts of the loop.
void ReplicatedBackend::issue_op(
    const hobject_t &soid,
    const eversion_t &at_version,
    ceph_tid_t tid,
    osd_reqid_t reqid,
    eversion_t pg_trim_to,
    eversion_t min_last_complete_ondisk,
    hobject_t new_temp_oid,
    hobject_t discard_temp_oid,
    const vector<pg_log_entry_t> &log_entries,
    std::optional<pg_hit_set_history_t> &hset_hist,
    InProgressOp *op,
    ObjectStore::Transaction &op_t)
{
    // ...
    for (const auto &shard : get_parent()->get_acting_recovery_backfill_shards())
    {
        // Skip our own shard: no message is sent to ourselves.
        if (shard == parent->whoami_shard())
            continue;
        // ...

        // Build the message for this peer shard.
        Message *wr;
        wr = generate_subop(
            soid,
            at_version,
            tid,
            reqid,
            pg_trim_to,
            min_last_complete_ondisk,
            new_temp_oid,
            discard_temp_oid,
            logs,
            hset_hist,
            op_t,
            shard,
            pinfo);

        // ...
        // Send the message to the peer's OSD, tagged with the current osdmap epoch.
        get_parent()->send_message_osd_cluster(
        shard.osd, wr, get_osdmap_epoch());
    }
}

// Constructs the MOSDRepOp message for one peer shard. Only the construction
// is shown; filling in the message and the return statement are elided.
Message *ReplicatedBackend::generate_subop(
    const hobject_t &soid,
    const eversion_t &at_version,
    ceph_tid_t tid,
    osd_reqid_t reqid,
    eversion_t pg_trim_to,
    eversion_t min_last_complete_ondisk,
    hobject_t new_temp_oid,
    hobject_t discard_temp_oid,
    const bufferlist &log_entries,
    std::optional<pg_hit_set_history_t> &hset_hist,
    ObjectStore::Transaction &op_t,
    pg_shard_t peer,
    const pg_info_t &pinfo)
{
    // Both the ACK and ONDISK flags are set; the value is passed as the
    // constructor's fifth argument (the `aw` parameter of MOSDRepOp).
    int acks_wanted = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
    
        // The message is addressed to the peer's shard of this PG and carries
        // our own shard (whoami_shard) as the sender.
        MOSDRepOp *wr = new MOSDRepOp(
        reqid, parent->whoami_shard(),
        spg_t(get_info().pgid.pgid, peer.shard),
        soid, acks_wanted,
        get_osdmap_epoch(),
        parent->get_last_peering_reset_epoch(),
        tid, at_version);
        
        // ...
}

// Message class derived from MOSDFastDispatchOp. Both constructors shown pass
// MSG_OSD_REPOP as the first base-class argument (presumably the message
// type, since _handle_message switches on get_type() against the same
// constant).
// The excerpt is abridged: remaining initializers and constructor bodies are
// elided, so it is not compilable as shown.
class MOSDRepOp : public MOSDFastDispatchOp {
    // ...
    MOSDRepOp()
        : MOSDFastDispatchOp{MSG_OSD_REPOP, HEAD_VERSION, COMPAT_VERSION},
    // ...
    MOSDRepOp(osd_reqid_t r, pg_shard_t from,
        spg_t p, const hobject_t &po, int aw,
        epoch_t mape, epoch_t min_epoch, ceph_tid_t rtid, eversion_t v)
        : MOSDFastDispatchOp{MSG_OSD_REPOP, HEAD_VERSION, COMPAT_VERSION},
        // ...
    // ...
};

4. 处理消息

4.1 do_request & handle_message

osd 收到消息后,会进入到 handle_message 分支中进行处理。

// Offers the request to the PG backend; if pgbackend->handle_message(op)
// returns true the request is considered handled and the function returns.
// Processing before and after that check is elided.
void PrimaryLogPG::do_request(
    OpRequestRef &op,
    ThreadPool::TPHandle &handle)
{
    // ...
    if (pgbackend->handle_message(op))
        return;
    // ...
}

// After some elided steps, forwards the request to _handle_message and
// returns its result.
bool PGBackend::handle_message(OpRequestRef op)
{
    // ...
    return _handle_message(op);
}

// Dispatches on the request's message type: MSG_OSD_REPOP goes to do_repop,
// MSG_OSD_REPOPREPLY goes to do_repop_reply; both cases return true.
// The excerpt elides the other cases and the end of the switch statement.
bool ReplicatedBackend::_handle_message(OpRequestRef op)
{
    dout(10) << __func__ << ": " << op << dendl;
    switch (op->get_req()->get_type())
    {
    // ...
    // Replicated write request.
    case MSG_OSD_REPOP:
    {
        do_repop(op);
        return true;
    }

    // Reply to a replicated write request.
    case MSG_OSD_REPOPREPLY:
    {
        do_repop_reply(op);
        return true;
    }
  // ...
}

4.2 do_repop

do_repop 函数中,注册了 C_OSD_RepModifyCommit 回调,在从 osd 写入后,调用回调函数:repop_commit

repop_commit 会发送一个 MSG_OSD_REPOPREPLY 消息,参考 4.1 节,该消息会被 _handle_message 处理,并调用 do_repop_reply

// Context whose finish() calls pg->repop_commit(rm). It holds the backend
// pointer and the RepModifyRef it was constructed with.
struct ReplicatedBackend::C_OSD_RepModifyCommit : public Context
{
    ReplicatedBackend *pg;
    RepModifyRef rm;
    C_OSD_RepModifyCommit(ReplicatedBackend *pg, RepModifyRef r)
        : pg(pg), rm(r) {}
    // The int result argument is ignored.
    void finish(int r) override
    {
        pg->repop_commit(rm);
    }
};

// Handler called by _handle_message for MSG_OSD_REPOP. Creates a RepModify
// for the request and registers a C_OSD_RepModifyCommit callback on
// rm->opt's commit. The rest of the function is elided.
void ReplicatedBackend::do_repop(OpRequestRef op)
{
    // ...
    RepModifyRef rm(std::make_shared<RepModify>());
    rm->op = op;
    // ...
    // C_OSD_RepModifyCommit::finish() calls repop_commit(rm).
    rm->opt.register_on_commit(
        parent->bless_context(
            new C_OSD_RepModifyCommit(this, rm)));
    // ...
}

// Called from C_OSD_RepModifyCommit::finish(). Builds a MOSDRepOpReply for
// the request and sends it to rm->ackerosd.
// NOTE(review): m is defined in the elided part; it is passed as the
// constructor's `const MOSDRepOp *req` argument.
void ReplicatedBackend::repop_commit(RepModifyRef rm)
{
    // ...

    // Reply carries our shard as sender, result 0 and the ONDISK flag.
    MOSDRepOpReply *reply = new MOSDRepOpReply(
        m,
        get_parent()->whoami_shard(),
        0, get_osdmap_epoch(), m->get_min_epoch(), CEPH_OSD_FLAG_ONDISK);
    
    // ...
   
    get_parent()->send_message_osd_cluster(
        rm->ackerosd, reply, get_osdmap_epoch());

    // ...
}

// Reply message class derived from MOSDFastDispatchOp. Both constructors
// shown pass MSG_OSD_REPOPREPLY as the first base-class argument (presumably
// the message type, since _handle_message switches on get_type() against the
// same constant).
// The excerpt is abridged: remaining initializers and constructor bodies are
// elided, so it is not compilable as shown.
class MOSDRepOpReply : public MOSDFastDispatchOp {
    // ...

    MOSDRepOpReply(
        const MOSDRepOp *req, pg_shard_t from, int result_, epoch_t e, epoch_t mine,
        int at) :
        MOSDFastDispatchOp{MSG_OSD_REPOPREPLY, HEAD_VERSION, COMPAT_VERSION},
    // ...
    }
    MOSDRepOpReply() : MOSDFastDispatchOp{MSG_OSD_REPOPREPLY, HEAD_VERSION, COMPAT_VERSION},
    // ...
};

4.3 do_repop_reply

该函数除了清除 in_progress_ops 中已完成的 op 外,还会调用 C_OSD_RepopCommit

do_repop_reply 里的 ip_op 来自第 2 节中的 InProgressOp &op = *insert_res.first->second;

结合第 2 节中 InProgressOp 类的定义,可以看出在第 1 节中 C_OSD_RepopCommit 回调实例 on_all_commit 作为 Context 实例赋值给 InProgressOp::on_commit

那么在 do_repop_reply 中,ip_op.on_commit->complete(0); 该行会调用 C_OSD_RepopCommit::finish

// Handler called by _handle_message for MSG_OSD_REPOPREPLY. Looks up the
// in-progress op by rep_tid; once its waiting_for_commit set is empty and
// on_commit is still set, fires on_commit, clears it and erases the entry.
// NOTE(review): rep_tid and the update of waiting_for_commit are in the
// elided parts of this function.
void ReplicatedBackend::do_repop_reply(OpRequestRef op)
{
    // ...
    auto iter = in_progress_ops.find(rep_tid);
    if (iter != in_progress_ops.end())
    {
        // ...
        InProgressOp &ip_op = *iter->second;
        // ...  
	  	
        // The same check appears in op_commit; the callback is cleared after
        // firing and the op is removed from in_progress_ops.
        if (ip_op.waiting_for_commit.empty() && ip_op.on_commit)
        {
        	ip_op.on_commit->complete(0);
          	ip_op.on_commit = 0;
        	in_progress_ops.erase(iter);
        }
    }
    
    // ...
}

5. repop_all_committed

repop_all_committed 主要是调用 eval_repop,答复客户端。

// Context whose finish() calls pg->repop_all_committed() for the repop it was
// constructed with. Member declarations are elided.
class C_OSD_RepopCommit : public Context
{
    // ...
    C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
        : pg(pg), repop(repop) {}
    // The int result argument is ignored.
    void finish(int) override
    {
        pg->repop_all_committed(repop.get());
    }
};

// Called from C_OSD_RepopCommit::finish(). Marks the repop as all_committed;
// unless the repop was aborted, it then calls recovery_state.complete_write
// (only when repop->v is not a default eversion_t) followed by eval_repop.
void PrimaryLogPG::repop_all_committed(RepGather *repop)
{
    // ...

    repop->all_committed = true;
    if (!repop->rep_aborted) {
        // Skipped when v still equals a default-constructed eversion_t.
        if (repop->v != eversion_t()) {
            recovery_state.complete_write(repop->v, repop->pg_local_last_complete);
        }
        eval_repop(repop);
    }
}

6. op_commit

op_commit 是 C_OSD_OnOpCommit 回调调用的函数,注册代码见第 2 节(submit_transaction)。

op_commit 同第 4.3 节中的 do_repop_reply 一样,在 waiting_for_commit 为空时通过 InProgressOp 对象的 on_commit 调用 complete,进而调用回调 C_OSD_RepopCommit::finish(即第 5 节的 repop_all_committed)。

op_commit 是 Primary OSD 的回调,会在 transaction commit 之后调用。

// Context whose finish() calls pg->op_commit(op) for the InProgressOp
// reference it was constructed with. Member declarations are elided.
class C_OSD_OnOpCommit : public Context
{
    // ...
    C_OSD_OnOpCommit(ReplicatedBackend *pg, ceph::ref_t<ReplicatedBackend::InProgressOp> op)
        : pg(pg), op(std::move(op)) {}
    // The int result argument is ignored.
    void finish(int) override
    {
        pg->op_commit(op);
    }
};

// Called from C_OSD_OnOpCommit::finish(). Once the op's waiting_for_commit
// set is empty and on_commit is still set, fires on_commit, clears it and
// erases the op from in_progress_ops by tid.
// NOTE(review): the update of waiting_for_commit is in the elided part.
void ReplicatedBackend::op_commit(const ceph::ref_t<InProgressOp> &op)
{
    // ...
	
    // Same completion check as in do_repop_reply.
    if (op->waiting_for_commit.empty() && op->on_commit) 
    {
        op->on_commit->complete(0);
        op->on_commit = 0;
        in_progress_ops.erase(op->tid);
    }
  
    // ...
}

7. 总结

  1. PrimaryLogPG::issue_repop 创建 C_OSD_RepopCommit 实例,作为参数调用 ReplicatedBackend::submit_transaction

  2. ReplicatedBackend::submit_transaction 调用 ReplicatedBackend::issue_op,使用 ReplicatedBackend::generate_subop 生成 type 为 MSG_OSD_REPOP 的消息,并发送给从 osd

  3. ReplicatedBackend::submit_transaction 使用第 1 步的 C_OSD_RepopCommit 实例,注册 C_OSD_OnOpCommit 回调。

  4. 第 2 步的消息一路经过 PrimaryLogPG::do_request → PGBackend::handle_message → ReplicatedBackend::_handle_message,到达 ReplicatedBackend::do_repop 进行处理。

  5. 处理完消息后,创建 C_OSD_RepModifyCommit 回调实例。

  6. 完成从 osd 上写入后,调用第 5 步的 C_OSD_RepModifyCommit 回调,该回调最终调用 ReplicatedBackend::repop_commit

  7. ReplicatedBackend::repop_commit 创建 MOSDRepOpReply 实例,该实例将像第 2 步中发送的消息一样进入队列,该消息的 type 为 MSG_OSD_REPOPREPLY

  8. 消息处理流程大体同第 4 步,只不过最后是调用 ReplicatedBackend::do_repop_reply

  9. ReplicatedBackend::do_repop_reply 检查 waiting_for_commit 是否为空,如果是,则调用第一步中的 C_OSD_RepopCommit 回调。

  10. C_OSD_RepopCommit 回调最后会调用 PrimaryLogPG::repop_all_committed,做收尾工作,并与客户端交互。

  11. C_OSD_OnOpCommit 回调在 transaction commit 之后调用。

  12. C_OSD_OnOpCommit 回调函数调用 ReplicatedBackend::op_commit,该函数一个主要的作用同第 9 和第 10 步骤。