diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index fa54b812a2c..d21233c2f27 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -24,6 +24,8 @@ #include "rebuild_internal.h" #define RBLD_CHECK_INTV 2000 /* milliseconds interval to check*/ +#define RBLD_LOG_INTV 300 /* seconds interval to print logs */ +#define RBLD_LOG_INTV_CNT (RBLD_LOG_INTV * 1000 / RBLD_CHECK_INTV) struct rebuild_global rebuild_gst; unsigned int rebuild_wait_ec_pause = 0; @@ -708,11 +710,13 @@ static void rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, struct rebuild_global_pool_tracker *rgt) { - double last_print = 0; - unsigned int total; - struct sched_req_attr attr = { 0 }; - d_rank_t myrank; - int rc; + uint64_t check_cnt = 0; + uint64_t last_print_cnt = 0; + uint64_t log_cnt_intv = 1; + unsigned int total; + struct sched_req_attr attr = {0}; + d_rank_t myrank; + int rc; rc = crt_group_size(pool->sp_group, &total); if (rc) @@ -728,15 +732,14 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, return; while (1) { - struct daos_rebuild_status *rs = &rgt->rgt_status; - char sbuf[RBLD_SBUF_LEN]; - double now; - char *str; - d_rank_list_t rank_list = {0}; - bool rebuild_abort = false; - int i; - - now = ABT_get_wtime(); + struct daos_rebuild_status *rs = &rgt->rgt_status; + char sbuf[RBLD_SBUF_LEN]; + char *str; + d_rank_list_t rank_list = {0}; + bool rebuild_abort = false; + int i; + + check_cnt++; ABT_rwlock_rdlock(pool->sp_lock); rc = map_ranks_init(pool->sp_map, PO_COMP_ST_UP | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | @@ -776,7 +779,7 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, } } - if (now - last_print > 20) + if (check_cnt - last_print_cnt >= log_cnt_intv) D_INFO(DF_RB " rank %d, status 0x%x.\n", DP_RB_RGT(rgt), dom->do_comp.co_rank, dom->do_comp.co_status); @@ -833,16 +836,17 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, rs->rs_size, rs->rs_state, rs->rs_errno, rs->rs_fail_rank, rgt->rgt_stable_epoch, rgt->rgt_reclaim_epoch, rs->rs_seconds); - D_INFO("%s", sbuf); - if (rs->rs_state == DRS_COMPLETED || rebuild_gst.rg_abort || - rgt->rgt_abort) { + if (rs->rs_state == DRS_COMPLETED || rebuild_gst.rg_abort || rgt->rgt_abort) { + D_INFO("%s", sbuf); D_PRINT("%s", sbuf); break; } - /* print something at least for each 10 seconds */ - if (now - last_print > 10) { - last_print = now; + /* Exponential backoff to print at most every RBLD_LOG_INTV seconds */ + if (check_cnt - last_print_cnt >= log_cnt_intv) { + last_print_cnt = check_cnt; + log_cnt_intv = min(log_cnt_intv * 2, (uint64_t)RBLD_LOG_INTV_CNT); + D_INFO("%s", sbuf); D_PRINT("%s", sbuf); } sleep: @@ -2381,6 +2385,8 @@ rebuild_tgt_status_check_ult(void *arg) struct rebuild_tgt_pool_tracker *rpt = arg; struct sched_req_attr attr = { 0 }; uint32_t reported_dtx_resyc_ver = 0; + uint64_t check_cnt = 0; + uint64_t log_cnt_intv = 1; D_ASSERT(rpt != NULL); sched_req_attr_init(&attr, SCHED_REQ_MIGRATE, &rpt->rt_pool_uuid); @@ -2511,14 +2517,18 @@ rebuild_tgt_status_check_ult(void *arg) } } - D_INFO(DF_UUID " ver %d gen %u obj " DF_U64 "/" DF_U64 " rec " DF_U64 - " size " DF_U64 " scan done %d pull done %d scan gl done %d" - " gl done %d status %d abort %s\n", - DP_UUID(rpt->rt_pool_uuid), rpt->rt_rebuild_ver, - rpt->rt_pool->sp_rebuild_gen, iv.riv_toberb_obj_count, iv.riv_obj_count, - iv.riv_rec_count, iv.riv_size, rpt->rt_scan_done, iv.riv_pull_done, - rpt->rt_global_scan_done, rpt->rt_global_done, iv.riv_status, - rpt->rt_abort ? "yes" : "no"); + if (check_cnt % log_cnt_intv == 0 || rpt->rt_global_done || rpt->rt_abort) { + D_INFO(DF_UUID " ver %d gen %u obj " DF_U64 "/" DF_U64 " rec " DF_U64 + " size " DF_U64 " scan done %d pull done %d scan gl done %d" + " gl done %d status %d abort %s\n", + DP_UUID(rpt->rt_pool_uuid), rpt->rt_rebuild_ver, + rpt->rt_pool->sp_rebuild_gen, iv.riv_toberb_obj_count, + iv.riv_obj_count, iv.riv_rec_count, iv.riv_size, rpt->rt_scan_done, + iv.riv_pull_done, rpt->rt_global_scan_done, rpt->rt_global_done, + iv.riv_status, rpt->rt_abort ? "yes" : "no"); + log_cnt_intv = min(log_cnt_intv * 2, (uint64_t)RBLD_LOG_INTV_CNT); + } + check_cnt++; if (rpt->rt_global_done || rpt->rt_abort) break;