diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c index e74a4c288ba..1245f94db3a 100644 --- a/src/cart/crt_rpc.c +++ b/src/cart/crt_rpc.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1992,3 +1992,12 @@ crt_req_get_proto_ver(crt_rpc_t *req) { return (req->cr_opc & CRT_PROTO_VER_MASK) >> 16; } + +void +crt_rpc_dump(crt_rpc_t *req, const char *msg) +{ + struct crt_rpc_priv *rpc_priv; + + rpc_priv = container_of(req, struct crt_rpc_priv, crp_pub); + RPC_INFO(rpc_priv, "%s\n", msg); +} diff --git a/src/container/srv_target.c b/src/container/srv_target.c index e172d29c4b1..ef05ad89e3b 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -184,7 +184,7 @@ cont_aggregate_runnable(struct ds_cont_child *cont, struct sched_request *req, if (ds_pool_is_rebuilding(pool) && !vos_agg) { D_DEBUG(DB_EPC, DF_CONT ": skip EC aggregation during rebuild %d, %d.\n", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), - atomic_load(&pool->sp_rebuilding), pool->sp_rebuild_scan); + atomic_load(&pool->sp_rebuilding), atomic_load(&pool->sp_rebuild_enum)); return false; } @@ -293,8 +293,7 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, daos_epoch_t epoch_max, epoch_min; daos_epoch_range_t epoch_range; struct sched_request *req = cont2req(cont, param->ap_vos_agg); - uint64_t hlc = d_hlc_get(); - uint64_t change_hlc; + uint64_t hlc = d_hlc_get(); uint64_t interval; uint64_t snapshots_local[MAX_SNAPSHOT_LOCAL] = { 0 }; uint64_t *snapshots = NULL; @@ -303,16 +302,14 @@ 
cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, uint32_t flags = 0; int i, rc = 0; - change_hlc = max(cont->sc_snapshot_delete_hlc, - cont->sc_pool->spc_rebuild_end_hlc); - if (param->ap_full_scan_hlc < change_hlc) { - /* Snapshot has been deleted or rebuild happens since the last + if (param->ap_full_scan_hlc < cont->sc_snapshot_delete_hlc) { + /* Snapshot has been deleted since the last * aggregation, let's restart from 0. */ epoch_min = 0; flags |= VOS_AGG_FL_FORCE_SCAN; - D_DEBUG(DB_EPC, "change hlc "DF_X64" > full "DF_X64"\n", - change_hlc, param->ap_full_scan_hlc); + D_DEBUG(DB_EPC, "snapshot del hlc " DF_X64 " > full " DF_X64 "\n", + cont->sc_snapshot_delete_hlc, param->ap_full_scan_hlc); } else { epoch_min = get_hae(cont, param->ap_vos_agg); } @@ -352,41 +349,18 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, D_DEBUG(DB_EPC, "hlc "DF_X64" epoch "DF_X64"/"DF_X64" agg max "DF_X64"\n", hlc, epoch_max, epoch_min, cont->sc_aggregation_max); - if (cont->sc_snapshots_nr + 1 < MAX_SNAPSHOT_LOCAL) { + snapshots_nr = cont->sc_snapshots_nr; + if (snapshots_nr < MAX_SNAPSHOT_LOCAL) { snapshots = snapshots_local; } else { - D_ALLOC(snapshots, (cont->sc_snapshots_nr + 1) * - sizeof(daos_epoch_t)); + D_ALLOC(snapshots, snapshots_nr * sizeof(daos_epoch_t)); if (snapshots == NULL) return -DER_NOMEM; } - if (cont->sc_pool->spc_rebuild_fence != 0) { - uint64_t rebuild_fence = cont->sc_pool->spc_rebuild_fence; - int j; - int insert_idx; - - /* insert rebuild_fetch into the snapshot list */ - D_DEBUG(DB_EPC, "rebuild fence "DF_X64"\n", rebuild_fence); - for (j = 0, insert_idx = 0; j < cont->sc_snapshots_nr; j++) { - if (cont->sc_snapshots[j] < rebuild_fence) { - snapshots[j] = cont->sc_snapshots[j]; - insert_idx++; - } else { - snapshots[j + 1] = cont->sc_snapshots[j]; - } - } - snapshots[insert_idx] = rebuild_fence; - snapshots_nr = cont->sc_snapshots_nr + 1; - } else { - /* Since sc_snapshots might be freed by other 
ULT, let's - * always copy here. - */ - snapshots_nr = cont->sc_snapshots_nr; - if (snapshots_nr > 0) - memcpy(snapshots, cont->sc_snapshots, - snapshots_nr * sizeof(daos_epoch_t)); - } + /* Since sc_snapshots might be freed by other ULT, let's always copy here. */ + if (snapshots_nr > 0) + memcpy(snapshots, cont->sc_snapshots, snapshots_nr * sizeof(daos_epoch_t)); /* Find highest snapshot less than last aggregated epoch. */ for (i = 0; i < snapshots_nr && snapshots[i] < epoch_min; ++i) diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index ecd4f8d3407..31ede2f6074 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -743,7 +743,7 @@ dtx_rpc(struct ds_cont_child *cont,d_list_t *dti_list, struct dtx_entry **dtes, dca->dca_steps = length; /* Use helper ULT to handle DTX RPC if there are enough helper XS. */ - if (dss_has_enough_helper()) { + if (0 && dss_has_enough_helper()) { rc = ABT_eventual_create(0, &dca->dca_chore_eventual); if (rc != ABT_SUCCESS) { D_ERROR("failed to create eventual: %d\n", rc); diff --git a/src/include/cart/api.h b/src/include/cart/api.h index 404135adcae..5c54051f1ad 100644 --- a/src/include/cart/api.h +++ b/src/include/cart/api.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2286,6 +2286,14 @@ int crt_group_secondary_modify(crt_group_t *grp, d_rank_list_t *sec_ranks, d_rank_list_t *prim_ranks, crt_group_mod_op_t op, uint32_t version); +/** + * Dump CaRT RPC information. + * + * \param[in] req The rpc structure. + * \param[in] msg The message to be logged.
+ */ +void crt_rpc_dump(crt_rpc_t *req, const char *msg); + /** * Initialize swim on the specified context index. * diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index 5cb2d466027..a731cf7e101 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2015-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -112,7 +112,7 @@ struct ds_cont_child { * VOS aggregation will use this boundary. We will optimize it later. */ uint64_t sc_ec_agg_eph_boundary; - /* The current EC aggregate epoch for this xstream */ + /* The local EC aggregation epoch for this xstream */ uint64_t sc_ec_agg_eph; /* Used by cont_ec_eph_query_ult to query the minimum EC agg epoch from all * local VOS. @@ -142,7 +142,7 @@ struct ds_cont_child { struct agg_param { void *ap_data; struct ds_cont_child *ap_cont; - daos_epoch_t ap_full_scan_hlc; + daos_epoch_t ap_full_scan_hlc; bool ap_vos_agg; }; diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 861c488fa4a..f46abec202c 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -93,13 +93,10 @@ struct ds_pool { * rebuild job. 
*/ uint32_t sp_rebuild_gen; - ATOMIC int sp_rebuilding; ATOMIC int sp_discarding; - /** - * someone has already messaged this pool to for rebuild scan, - * NB: all xstreams can do lockless-write on it but it's OK - */ - int sp_rebuild_scan; + ATOMIC int sp_rebuilding; + /* someone has already messaged this pool for rebuild object/key enumeration */ + ATOMIC int sp_rebuild_enum; int sp_discard_status; /** path to ephemeral metrics */ @@ -174,16 +171,6 @@ struct ds_pool_child { struct sched_request *spc_chkpt_req; /* Track checkpointing ULT*/ d_list_t spc_cont_list; - /* The current maxim rebuild epoch, (0 if there is no rebuild), so - * vos aggregation can not cross this epoch during rebuild to avoid - * interfering rebuild process. - */ - uint64_t spc_rebuild_fence; - - /* The HLC when current rebuild ends, which will be used to compare - * with the aggregation full scan start HLC to know whether the - * aggregation needs to be restarted from 0. */ - uint64_t spc_rebuild_end_hlc; uint32_t spc_map_version; int spc_ref; ABT_eventual spc_ref_eventual; @@ -215,7 +202,7 @@ struct ds_pool_svc_op_val { static inline bool ds_pool_is_rebuilding(struct ds_pool *pool) { - return (atomic_load(&pool->sp_rebuilding) > 0 || pool->sp_rebuild_scan > 0); + return (atomic_load(&pool->sp_rebuilding) > 0 || atomic_load(&pool->sp_rebuild_enum) > 0); } /* encode metadata RPC operation key: HLC time first, in network order, for keys sorted by time. diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index c7d319d26cf..e5d7fa4012c 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -1725,7 +1725,7 @@ dc_obj_retry_delay(tse_task_t *task, uint32_t opc, int err, uint32_t *retry_cnt, /* Randomly delay [1, max_delay - 5] for DER_OVERLOAD_RETRY case.
*/ if (err == -DER_OVERLOAD_RETRY) { delay = daos_rpc_rand_delay(timeout_sec) << 20; - } else if (++(*retry_cnt) > 1) { + } else if (++(*retry_cnt) > 1 || obj_is_modification_opc(opc)) { /* Randomly delay [31 ~ 1023] us if it is not the first retried object RPC. */ delay = (d_rand() | ((1 << 5) - 1)) & ((1 << 10) - 1); /* Rebuild is being established on the server side, wait a bit longer */ @@ -1739,16 +1739,18 @@ dc_obj_retry_delay(tse_task_t *task, uint32_t opc, int err, uint32_t *retry_cnt, delay <<= 8; break; case DAOS_OBJ_RPC_CPD: - /* 8 times of the delay for compounded RPC. */ - delay <<= 3; + delay <<= (*retry_cnt + 3); break; default: + if (obj_is_modification_opc(opc)) + delay <<= (*retry_cnt + 1); + else + delay <<= (*retry_cnt - 1); break; } - /* Increase delay after multiple times retry. */ - if (*retry_cnt >= 5) - delay <<= 1; + if (*retry_cnt > 10 || delay > 3000000) + delay = 3000000 + ((d_rand() | ((1 << 5) - 1)) & ((1 << 10) - 1)); } } diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 1b6dbfff4b3..12566667e5e 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2486,20 +2486,6 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, D_ERROR("reintegrating " DF_UUID " retry.\n", DP_UUID(pool->sp_uuid)); return -DER_UPDATE_AGAIN; } - - /* All I/O during rebuilding, needs to wait for the rebuild fence to - * be generated (see rebuild_prepare_one()), which will create a boundary - * for rebuild, so the data after boundary(epoch) should not be rebuilt, - * which otherwise might be written duplicately, which might cause - * the failure in VOS. 
- */ - if ((flags & ORF_REBUILDING_IO) && - (!child->sc_pool->spc_pool->sp_disable_rebuild && - child->sc_pool->spc_rebuild_fence == 0)) { - D_ERROR("rebuilding "DF_UUID" retry.\n", DP_UUID(child->sc_pool->spc_uuid)); - return -DER_UPDATE_AGAIN; - } - return 0; } @@ -2891,14 +2877,18 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc) * Pre-allocate DTX entry for handling resend under such case. */ rc = obj_local_rw(rpc, &ioc, dth); - if (rc != 0) + if (rc != 0) { DL_CDEBUG( rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || rc == -DER_ALREADY || (rc == -DER_EXIST && (orw->orw_api_flags & (DAOS_COND_DKEY_INSERT | DAOS_COND_AKEY_INSERT))) || (rc == -DER_NONEXIST && (orw->orw_api_flags & (DAOS_COND_DKEY_UPDATE | DAOS_COND_AKEY_UPDATE))), - DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(orw->orw_oid)); + DB_IO, DLOG_ERR, rc, "tgt_update " DF_UOID " with TX " DF_DTI, + DP_UOID(orw->orw_oid), DP_DTI(&orw->orw_dti)); + if (unlikely(rc == -DER_AGAIN)) + crt_rpc_dump(rpc, "tgt_update"); + } out: if (dth != NULL) @@ -3447,9 +3437,8 @@ obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc, if (oei->oei_flags & ORF_FOR_MIGRATION) { /* just in case ds_pool::sp_rebuilding is not set, pause my local EC aggregation * by setting this flag. - * NB: it's a lockess write to shared data structure and it's harmless. */ - ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_scan = 1; + atomic_store(&ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_enum, 1); flags = DTX_FOR_MIGRATION; } diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index 4eb7f8ef2b5..70705c6f011 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -77,12 +77,7 @@ struct rebuild_tgt_pool_tracker { uint64_t rt_stable_epoch; /* Only used by reclaim job to discard those half-rebuild data */ - uint64_t rt_reclaim_epoch; - /* local rebuild epoch mainly to constrain the VOS aggregation - * to make sure aggregation will not cross the epoch - */ - uint64_t rt_rebuild_fence; - + uint64_t rt_reclaim_epoch; uint32_t rt_leader_rank; /* Global dtx resync version */ @@ -339,7 +334,8 @@ void rebuild_tgt_status_check_ult(void *arg); int -rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt); +rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, + struct rebuild_tgt_pool_tracker **p_rpt); bool rebuild_status_match(struct rebuild_tgt_pool_tracker *rpt, diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index bc445f0e20b..d3bd7ca28cf 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1204,6 +1204,8 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) struct rebuild_scan_out *rso; struct rebuild_pool_tls *tls = NULL; struct rebuild_tgt_pool_tracker *rpt = NULL; + struct ds_pool *pool = NULL; + bool checker = false; int rc; rsi = crt_req_get(rpc); @@ -1214,6 +1216,13 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) rsi->rsi_rebuild_ver, rsi->rsi_rebuild_gen, rsi->rsi_master_rank, rsi->rsi_leader_term, RB_OP_STR(rsi->rsi_rebuild_op)); + rc = ds_pool_lookup(rsi->rsi_pool_uuid, &pool); + if (rc) { + D_ERROR("Can not find pool " DF_UUID ": %d\n", DP_UUID(rsi->rsi_pool_uuid), rc); + D_GOTO(out, rc); + } + atomic_fetch_add(&pool->sp_rebuilding, 1); + /* If PS leader has been changed, and rebuild version is also increased * due to adding new failure targets for rebuild, let's abort previous * rebuild. 
@@ -1321,7 +1330,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) if (daos_fail_check(DAOS_REBUILD_TGT_START_FAIL)) D_GOTO(out, rc = -DER_INVAL); - rc = rebuild_tgt_prepare(rpc, &rpt); + rc = rebuild_tgt_prepare(pool, rsi, &rpt); if (rc) D_GOTO(out, rc); @@ -1332,8 +1341,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) rpt_put(rpt); D_GOTO(out, rc); } - - atomic_fetch_add(&rpt->rt_pool->sp_rebuilding, 1); /* reset in rebuild_tgt_fini */ + checker = true; rpt_get(rpt); /* step-3: start scan leader */ @@ -1344,14 +1352,21 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) } out: - if (tls && tls->rebuild_pool_status == 0 && rc != 0) + if (rc != 0 && tls && tls->rebuild_pool_status == 0) tls->rebuild_pool_status = rc; + if (pool) { + if (!checker) + atomic_fetch_sub(&pool->sp_rebuilding, 1); + ds_pool_put(pool); + } + if (rpt) { - if (rc) + if (!checker) rpt_delete(rpt); rpt_put(rpt); } + rso = crt_reply_get(rpc); rso->rso_status = rc; rso->rso_stable_epoch = d_hlc_get(); diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index d21233c2f27..845dce26705 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -261,6 +261,8 @@ static void rpt_insert(struct rebuild_tgt_pool_tracker *rpt) { D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); + + rpt_get(rpt); ABT_rwlock_wrlock(rebuild_gst.rg_ttl_rwlock); d_list_add(&rpt->rt_list, &rebuild_gst.rg_tgt_tracker_list); ABT_rwlock_unlock(rebuild_gst.rg_ttl_rwlock); @@ -270,9 +272,13 @@ void rpt_delete(struct rebuild_tgt_pool_tracker *rpt) { D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); + D_ASSERT(!d_list_empty(&rpt->rt_list)); + ABT_rwlock_wrlock(rebuild_gst.rg_ttl_rwlock); d_list_del_init(&rpt->rt_list); ABT_rwlock_unlock(rebuild_gst.rg_ttl_rwlock); + + rpt_put(rpt); } struct rebuild_tgt_pool_tracker * @@ -2309,23 +2315,8 @@ rebuild_fini_one(void *arg) if (dpc == NULL) return 0; - /* Reset rebuild epoch, then reset the aggregation epoch, so - * it can aggregate the rebuild epoch. 
- */ - D_ASSERT(rpt->rt_rebuild_fence != 0); - if (rpt->rt_rebuild_fence == dpc->spc_rebuild_fence) { - dpc->spc_rebuild_fence = 0; - dpc->spc_rebuild_end_hlc = d_hlc_get(); - D_DEBUG(DB_REBUILD, DF_UUID": Reset aggregation end hlc " - DF_U64"\n", DP_UUID(rpt->rt_pool_uuid), - dpc->spc_rebuild_end_hlc); - } else { - D_DEBUG(DB_REBUILD, DF_UUID": pool is still being rebuilt" - " rt_rebuild_fence "DF_U64" spc_rebuild_fence " - DF_U64"\n", DP_UUID(rpt->rt_pool_uuid), - rpt->rt_rebuild_fence, dpc->spc_rebuild_fence); - } - + D_DEBUG(DB_REBUILD, DF_RB ": rebuild fini for stable epoch " DF_U64 "\n", DP_RB_RPT(rpt), + rpt->rt_stable_epoch); ds_pool_child_put(dpc); return 0; } @@ -2341,21 +2332,21 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt) D_ASSERT(atomic_load(&rpt->rt_pool->sp_rebuilding) > 0); atomic_fetch_sub(&rpt->rt_pool->sp_rebuilding, 1); - rpt->rt_pool->sp_rebuild_scan = 0; + + atomic_store(&rpt->rt_pool->sp_rebuild_enum, 0); ABT_mutex_lock(rpt->rt_lock); ABT_cond_signal(rpt->rt_global_dtx_wait_cond); D_ASSERT(rpt->rt_refcount > 0); rpt->rt_finishing = 1; /* Wait until all ult/tasks finish and release the rpt. - * NB: Because rebuild_tgt_fini will be only called in - * rebuild_tgt_status_check_ult, which will make sure when - * rt_refcount reaches to 1, either all rebuild is done or - * all ult/task has been aborted by rt_abort, i.e. no new - * ULT/task will be created after this check. So it is safe - * to destroy the rpt after this. + * NB: Because rebuild_tgt_fini will be only called in rebuild_tgt_status_check_ult, + * which will make sure when rt_refcount reaches to 2 (one by check ULT, the other by + * track list), either all rebuild is done or all ult/task has been aborted by rt_abort, + * i.e. no new ULT/task will be created after this check. So it is safe to destroy + * the rpt after this. 
*/ - if (rpt->rt_refcount > 1) + if (rpt->rt_refcount > 2) ABT_cond_wait(rpt->rt_fini_cond, rpt->rt_lock); ABT_mutex_unlock(rpt->rt_lock); @@ -2376,7 +2367,6 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt) D_INFO("Finalized rebuild for "DF_UUID", map_ver=%u.\n", DP_UUID(rpt->rt_pool_uuid), rpt->rt_rebuild_ver); rpt_delete(rpt); - rpt_put(rpt); } void @@ -2542,8 +2532,8 @@ rebuild_tgt_status_check_ult(void *arg) sched_req_put(rpt->rt_ult); rpt->rt_ult = NULL; out: - rpt_put(rpt); rebuild_tgt_fini(rpt); + rpt_put(rpt); } /** @@ -2579,14 +2569,8 @@ rebuild_prepare_one(void *data) D_ASSERT(dss_get_module_info()->dmi_xs_id != 0); - /* Set the rebuild epoch per VOS container, so VOS aggregation will not - * cross the epoch to cause problem. - */ - D_ASSERT(rpt->rt_rebuild_fence != 0); - dpc->spc_rebuild_fence = rpt->rt_rebuild_fence; - D_DEBUG(DB_REBUILD, "open local container "DF_UUID"/"DF_UUID - " rebuild eph "DF_X64" "DF_RC"\n", DP_UUID(rpt->rt_pool_uuid), - DP_UUID(rpt->rt_coh_uuid), rpt->rt_rebuild_fence, DP_RC(rc)); + D_DEBUG(DB_REBUILD, DF_RB " open local container " DF_UUID " stable eph " DF_X64 "\n", + DP_RB_RPT(rpt), DP_UUID(rpt->rt_coh_uuid), rpt->rt_stable_epoch); put: ds_pool_child_put(dpc); @@ -2648,10 +2632,9 @@ rpt_create(struct ds_pool *pool, uint32_t master_rank, uint32_t pm_ver, * each target get the scan rpc from the master. 
*/ int -rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) +rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, + struct rebuild_tgt_pool_tracker **p_rpt) { - struct rebuild_scan_in *rsi = crt_req_get(rpc); - struct ds_pool *pool; struct rebuild_tgt_pool_tracker *rpt = NULL; struct rebuild_pool_tls *pool_tls; daos_prop_t prop = { 0 }; @@ -2662,12 +2645,6 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) D_DEBUG(DB_REBUILD, "prepare rebuild for "DF_UUID"/%d\n", DP_UUID(rsi->rsi_pool_uuid), rsi->rsi_rebuild_ver); - rc = ds_pool_lookup(rsi->rsi_pool_uuid, &pool); - if (rc) { - D_ERROR("Can not find pool "DF_UUID": %d\n", DP_UUID(rsi->rsi_pool_uuid), rc); - return rc; - } - if (ds_pool_get_version(pool) < rsi->rsi_rebuild_ver) { D_INFO(DF_UUID" map %u < rsi_rebuild_ver %u\n", DP_UUID(rsi->rsi_pool_uuid), ds_pool_get_version(pool), @@ -2703,7 +2680,6 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) /* Let's add the rpt to the tracker list before IV fetch, which might yield, * to make sure the new coming request can find the rpt in the list. 
*/ - rpt_get(rpt); rpt_insert(rpt); rc = ds_pool_iv_srv_hdl_fetch(pool, &rpt->rt_poh_uuid, &rpt->rt_coh_uuid); @@ -2732,31 +2708,24 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) if (pool_tls == NULL) D_GOTO(out, rc = -DER_NOMEM); - rpt->rt_rebuild_fence = d_hlc_get(); - rc = ds_pool_task_collective(rpt->rt_pool_uuid, - PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, - rebuild_prepare_one, rpt, 0); + rc = ds_pool_task_collective(rpt->rt_pool_uuid, + PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, + rebuild_prepare_one, rpt, 0); if (rc) { - rpt->rt_rebuild_fence = 0; rebuild_pool_tls_destroy(pool_tls); D_GOTO(out, rc); } ABT_mutex_lock(rpt->rt_lock); + ds_pool_get(pool); rpt->rt_pool = pool; /* pin it */ ABT_mutex_unlock(rpt->rt_lock); *p_rpt = rpt; out: - if (rc) { - if (rpt) { - if (!d_list_empty(&rpt->rt_list)) { - rpt_delete(rpt); - rpt_put(rpt); - } - rpt_put(rpt); - } - ds_pool_put(pool); + if (rc && rpt) { + rpt_delete(rpt); + rpt_put(rpt); } daos_prop_fini(&prop); diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index e08d63fd673..058fbded986 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -1523,8 +1523,10 @@ vos_dtx_validation(struct dtx_handle *dth) /* The DTX has been ever aborted. Return -DER_AGAIN to make related client to retry sometime * later without triggering dtx_abort(). */ - if (dth->dth_aborted || rc == DTX_ST_ABORTED || rc == DTX_ST_ABORTING) + if (dth->dth_aborted || rc == DTX_ST_ABORTED || rc == DTX_ST_ABORTING) { + D_WARN("Current DTX " DF_DTI " is aborted: %d\n", DP_DTI(&dth->dth_xid), rc); return -DER_AGAIN; + } return rc > 0 ? 
0 : rc; } diff --git a/src/vos/vos_obj_cache.c b/src/vos/vos_obj_cache.c index 083ca2216a5..1e98e399902 100644 --- a/src/vos/vos_obj_cache.c +++ b/src/vos/vos_obj_cache.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -360,8 +361,10 @@ vos_obj_hold(struct daos_lru_cache *occ, struct vos_container *cont, obj = container_of(lret, struct vos_object, obj_llink); } - if (obj->obj_zombie) + if (obj->obj_zombie) { + D_WARN("Hit zombie obj " DF_UOID ", need to retry\n", DP_UOID(oid)); D_GOTO(failed, rc = -DER_AGAIN); + } if (intent == DAOS_INTENT_KILL && !(flags & VOS_OBJ_KILL_DKEY)) { if (obj != &obj_local) {