Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 39 additions & 29 deletions src/rebuild/srv.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
#include "rebuild_internal.h"

#define RBLD_CHECK_INTV 2000 /* interval between checks, in milliseconds */
#define RBLD_LOG_INTV 300 /* interval between log prints, in seconds */
/*
 * RBLD_LOG_INTV converted to a number of RBLD_CHECK_INTV periods
 * (seconds * 1000 / milliseconds), i.e. how many checks fit in one log interval.
 */
#define RBLD_LOG_INTV_CNT (RBLD_LOG_INTV * 1000 / RBLD_CHECK_INTV)
struct rebuild_global rebuild_gst;
unsigned int rebuild_wait_ec_pause = 0;

Expand Down Expand Up @@ -708,11 +710,13 @@ static void
rebuild_leader_status_check(struct ds_pool *pool, uint32_t op,
struct rebuild_global_pool_tracker *rgt)
{
double last_print = 0;
unsigned int total;
struct sched_req_attr attr = { 0 };
d_rank_t myrank;
int rc;
uint64_t check_cnt = 0;
uint64_t last_print_cnt = 0;
uint64_t log_cnt_intv = 1;
unsigned int total;
struct sched_req_attr attr = {0};
d_rank_t myrank;
int rc;

rc = crt_group_size(pool->sp_group, &total);
if (rc)
Expand All @@ -728,15 +732,14 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op,
return;

while (1) {
struct daos_rebuild_status *rs = &rgt->rgt_status;
char sbuf[RBLD_SBUF_LEN];
double now;
char *str;
d_rank_list_t rank_list = {0};
bool rebuild_abort = false;
int i;

now = ABT_get_wtime();
struct daos_rebuild_status *rs = &rgt->rgt_status;
char sbuf[RBLD_SBUF_LEN];
char *str;
d_rank_list_t rank_list = {0};
bool rebuild_abort = false;
int i;

check_cnt++;
ABT_rwlock_rdlock(pool->sp_lock);
rc = map_ranks_init(pool->sp_map,
PO_COMP_ST_UP | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT |
Expand Down Expand Up @@ -776,7 +779,7 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op,
}
}

if (now - last_print > 20)
if (check_cnt - last_print_cnt >= log_cnt_intv)
D_INFO(DF_RB " rank %d, status 0x%x.\n", DP_RB_RGT(rgt),
dom->do_comp.co_rank, dom->do_comp.co_status);

Expand Down Expand Up @@ -833,16 +836,17 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op,
rs->rs_size, rs->rs_state, rs->rs_errno, rs->rs_fail_rank,
rgt->rgt_stable_epoch, rgt->rgt_reclaim_epoch, rs->rs_seconds);

D_INFO("%s", sbuf);
if (rs->rs_state == DRS_COMPLETED || rebuild_gst.rg_abort ||
rgt->rgt_abort) {
if (rs->rs_state == DRS_COMPLETED || rebuild_gst.rg_abort || rgt->rgt_abort) {
D_INFO("%s", sbuf);
D_PRINT("%s", sbuf);
break;
}

/* print something at least for each 10 seconds */
if (now - last_print > 10) {
last_print = now;
/* Exponential backoff to print at most every RBLD_LOG_INTV seconds */
if (check_cnt - last_print_cnt >= log_cnt_intv) {
last_print_cnt = check_cnt;
log_cnt_intv = min(log_cnt_intv * 2, (uint64_t)RBLD_LOG_INTV_CNT);
D_INFO("%s", sbuf);
D_PRINT("%s", sbuf);
}
sleep:
Expand Down Expand Up @@ -2381,6 +2385,8 @@ rebuild_tgt_status_check_ult(void *arg)
struct rebuild_tgt_pool_tracker *rpt = arg;
struct sched_req_attr attr = { 0 };
uint32_t reported_dtx_resyc_ver = 0;
uint64_t check_cnt = 0;
uint64_t log_cnt_intv = 1;

D_ASSERT(rpt != NULL);
sched_req_attr_init(&attr, SCHED_REQ_MIGRATE, &rpt->rt_pool_uuid);
Expand Down Expand Up @@ -2511,14 +2517,18 @@ rebuild_tgt_status_check_ult(void *arg)
}
}

D_INFO(DF_UUID " ver %d gen %u obj " DF_U64 "/" DF_U64 " rec " DF_U64
" size " DF_U64 " scan done %d pull done %d scan gl done %d"
" gl done %d status %d abort %s\n",
DP_UUID(rpt->rt_pool_uuid), rpt->rt_rebuild_ver,
rpt->rt_pool->sp_rebuild_gen, iv.riv_toberb_obj_count, iv.riv_obj_count,
iv.riv_rec_count, iv.riv_size, rpt->rt_scan_done, iv.riv_pull_done,
rpt->rt_global_scan_done, rpt->rt_global_done, iv.riv_status,
rpt->rt_abort ? "yes" : "no");
if (check_cnt % log_cnt_intv == 0 || rpt->rt_global_done || rpt->rt_abort) {
D_INFO(DF_UUID " ver %d gen %u obj " DF_U64 "/" DF_U64 " rec " DF_U64
" size " DF_U64 " scan done %d pull done %d scan gl done %d"
" gl done %d status %d abort %s\n",
DP_UUID(rpt->rt_pool_uuid), rpt->rt_rebuild_ver,
rpt->rt_pool->sp_rebuild_gen, iv.riv_toberb_obj_count,
iv.riv_obj_count, iv.riv_rec_count, iv.riv_size, rpt->rt_scan_done,
iv.riv_pull_done, rpt->rt_global_scan_done, rpt->rt_global_done,
iv.riv_status, rpt->rt_abort ? "yes" : "no");
log_cnt_intv = min(log_cnt_intv * 2, (uint64_t)RBLD_LOG_INTV_CNT);
}
check_cnt++;
if (rpt->rt_global_done || rpt->rt_abort)
break;

Expand Down
Loading