diff --git a/compute/pom.xml b/compute/pom.xml
index d7543119ad..1ee0aeb04e 100755
--- a/compute/pom.xml
+++ b/compute/pom.xml
@@ -137,5 +137,10 @@
configuration
${project.version}
+
+ junit
+ junit
+ test
+
diff --git a/compute/src/main/java/org/zstack/compute/host/HostTrackImpl.java b/compute/src/main/java/org/zstack/compute/host/HostTrackImpl.java
index 48ed6a7dc4..e0515decd1 100755
--- a/compute/src/main/java/org/zstack/compute/host/HostTrackImpl.java
+++ b/compute/src/main/java/org/zstack/compute/host/HostTrackImpl.java
@@ -28,6 +28,7 @@
import javax.persistence.Tuple;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
@@ -267,7 +268,10 @@ public void trackHost(String hostUuid) {
if (CoreGlobalProperty.UNIT_TEST_ON && !alwaysStartRightNow) {
t.start();
} else {
- t.startRightNow();
+ // ZSTAC-61971: jitter to avoid thundering herd when 3000+ hosts reconnect simultaneously
+ long jitterMs = ThreadLocalRandom.current().nextLong(
+ HostGlobalConfig.PING_HOST_INTERVAL.value(Long.class) * 1000);
+ t.startWithDelay(jitterMs, TimeUnit.MILLISECONDS);
}
logger.debug(String.format("starting tracking hosts[uuid:%s]", hostUuid));
diff --git a/compute/src/main/java/org/zstack/compute/host/PingLatencyEma.java b/compute/src/main/java/org/zstack/compute/host/PingLatencyEma.java
new file mode 100644
index 0000000000..c53c3b54ce
--- /dev/null
+++ b/compute/src/main/java/org/zstack/compute/host/PingLatencyEma.java
@@ -0,0 +1,36 @@
+package org.zstack.compute.host;
+
+/**
+ * Exponential moving average (EMA) tracker for per-host ping latency.
+ * Used to compute adaptive ping timeouts (ZSTAC-67534).
+ *
+ * Thread-safety: {@link #update(long)} is synchronized so that concurrent ping
+ * callbacks cannot lose a sample; readers observe the latest value through the
+ * volatile field without taking the lock.
+ */
+public class PingLatencyEma {
+    /** Value held by {@link #emaMs} until the first valid sample arrives. */
+    private static final double NO_SAMPLE = -1;
+
+    private volatile double emaMs = NO_SAMPLE;
+    private final double alpha;
+    private final int timeoutMultiplier;
+
+    /**
+     * @param alpha             smoothing factor in (0, 1]; the weight given to the newest sample
+     * @param timeoutMultiplier positive factor applied to the EMA to derive a timeout
+     * @throws IllegalArgumentException if either argument is out of range
+     */
+    public PingLatencyEma(double alpha, int timeoutMultiplier) {
+        // !(alpha > 0) also rejects NaN, which would otherwise poison every later update
+        if (!(alpha > 0) || alpha > 1) {
+            throw new IllegalArgumentException("alpha must be in (0, 1], got " + alpha);
+        }
+        if (timeoutMultiplier <= 0) {
+            throw new IllegalArgumentException("timeoutMultiplier must be positive, got " + timeoutMultiplier);
+        }
+        this.alpha = alpha;
+        this.timeoutMultiplier = timeoutMultiplier;
+    }
+
+    /**
+     * Folds one latency sample into the average. The first valid sample seeds the EMA.
+     *
+     * @param latencyMs measured round-trip time in milliseconds; negative values
+     *                  (e.g. the wall clock stepping backwards between two timestamps)
+     *                  are ignored
+     */
+    public synchronized void update(long latencyMs) {
+        if (latencyMs < 0) {
+            // a negative sample would drag the EMA down or be mistaken for the "no sample" sentinel
+            return;
+        }
+        double current = emaMs;
+        emaMs = current < 0 ? latencyMs : alpha * latencyMs + (1 - alpha) * current;
+    }
+
+    /**
+     * Derives a ping timeout from the current EMA.
+     *
+     * @param globalTimeoutSeconds configured timeout, used as the lower bound
+     * @return {@code globalTimeoutSeconds} when the EMA is not positive (no sample yet);
+     *         otherwise {@code floor(emaMs * timeoutMultiplier / 1000) + 1}, clamped to
+     *         {@code [globalTimeoutSeconds, 3 * globalTimeoutSeconds]}
+     */
+    public long computeAdaptiveTimeout(long globalTimeoutSeconds) {
+        // read the volatile once so the check and the computation use the same value
+        double current = emaMs;
+        if (current <= 0) {
+            return globalTimeoutSeconds;
+        }
+        long emaBasedTimeout = (long) (current * timeoutMultiplier / 1000) + 1;
+        return Math.min(Math.max(globalTimeoutSeconds, emaBasedTimeout), globalTimeoutSeconds * 3);
+    }
+
+    /** @return the current EMA in milliseconds, or a negative value if no sample has been recorded */
+    public double getEmaMs() {
+        return emaMs;
+    }
+}
diff --git a/compute/src/test/java/org/zstack/compute/host/HostTrackJitterTest.java b/compute/src/test/java/org/zstack/compute/host/HostTrackJitterTest.java
new file mode 100644
index 0000000000..d2d5b28425
--- /dev/null
+++ b/compute/src/test/java/org/zstack/compute/host/HostTrackJitterTest.java
@@ -0,0 +1,76 @@
+package org.zstack.compute.host;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.concurrent.ThreadLocalRandom;
+
+/**
+ * ZSTAC-61971: Verify jitter produces uniform distribution, not thundering herd.
+ *
+ * These tests call ThreadLocalRandom.nextLong(bound) directly, mirroring the
+ * expression used in HostTrackImpl.trackHost(); they do not invoke HostTrackImpl itself.
+ */
+public class HostTrackJitterTest {
+
+ /**
+ * AC: jitter test — reconnect timestamps are uniformly distributed (not all at t=0).
+ * Simulates the jitter logic from HostTrackImpl.trackHost() for 3000 hosts.
+ */
+ @Test
+ public void testJitterDistribution() {
+ long pingIntervalSeconds = 60;
+ long intervalMs = pingIntervalSeconds * 1000;
+ int hostCount = 3000;
+ int bucketCount = 10;
+ int[] buckets = new int[bucketCount];
+
+ for (int i = 0; i < hostCount; i++) {
+ long jitterMs = ThreadLocalRandom.current().nextLong(intervalMs);
+ // jitterMs < intervalMs, so the bucket index is always in [0, bucketCount)
+ int bucket = (int) (jitterMs * bucketCount / intervalMs);
+ buckets[bucket]++;
+ }
+
+ // Each bucket should have ~300 hosts (3000/10).
+ // The assertion accepts any count strictly between expected/2 (150) and expected*2 (600).
+ int expectedPerBucket = hostCount / bucketCount;
+ for (int i = 0; i < bucketCount; i++) {
+ Assert.assertTrue(
+ String.format("bucket %d has %d hosts, expected ~%d (uniform distribution)",
+ i, buckets[i], expectedPerBucket),
+ buckets[i] > expectedPerBucket / 2 && buckets[i] < expectedPerBucket * 2);
+ }
+ }
+
+ /**
+ * Verify jitter range is [0, intervalMs) — never negative, never >= intervalMs.
+ */
+ @Test
+ public void testJitterRange() {
+ long intervalMs = 60000;
+ for (int i = 0; i < 10000; i++) {
+ long jitter = ThreadLocalRandom.current().nextLong(intervalMs);
+ Assert.assertTrue("jitter should be >= 0", jitter >= 0);
+ Assert.assertTrue("jitter should be < intervalMs", jitter < intervalMs);
+ }
+ }
+
+ /**
+ * AC: Not all hosts start at t=0.
+ * Fewer than 10% of hosts may draw a jitter of exactly 0.
+ */
+ @Test
+ public void testNotAllStartAtZero() {
+ long intervalMs = 60000;
+ int hostCount = 3000;
+ int zeroCount = 0;
+ for (int i = 0; i < hostCount; i++) {
+ long jitter = ThreadLocalRandom.current().nextLong(intervalMs);
+ if (jitter == 0) {
+ zeroCount++;
+ }
+ }
+ // With 60000ms range, probability of jitter=0 is 1/60000.
+ // For 3000 hosts, expected zeros ≈ 0.05. Definitely < 10% of hosts.
+ Assert.assertTrue(
+ String.format("too many hosts at t=0: %d/%d", zeroCount, hostCount),
+ zeroCount < hostCount / 10);
+ }
+}
diff --git a/compute/src/test/java/org/zstack/compute/host/PingLatencyEmaTest.java b/compute/src/test/java/org/zstack/compute/host/PingLatencyEmaTest.java
new file mode 100644
index 0000000000..0426bfad98
--- /dev/null
+++ b/compute/src/test/java/org/zstack/compute/host/PingLatencyEmaTest.java
@@ -0,0 +1,88 @@
+package org.zstack.compute.host;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for PingLatencyEma: EMA arithmetic and the clamping applied by
+ * computeAdaptiveTimeout. All cases use alpha=0.3 and timeoutMultiplier=3.
+ */
+public class PingLatencyEmaTest {
+
+ /**
+ * AC: synthetic latency sequence [100, 200, 150, 300, 250] ms yields the expected EMA
+ * after every sample (the value the adaptive timeout is derived from).
+ */
+ @Test
+ public void testEmaWithSyntheticLatencySequence() {
+ PingLatencyEma ema = new PingLatencyEma(0.3, 3);
+
+ // First sample: EMA = 100
+ ema.update(100);
+ Assert.assertEquals(100.0, ema.getEmaMs(), 0.01);
+
+ // Second: EMA = 0.3*200 + 0.7*100 = 60 + 70 = 130
+ ema.update(200);
+ Assert.assertEquals(130.0, ema.getEmaMs(), 0.01);
+
+ // Third: EMA = 0.3*150 + 0.7*130 = 45 + 91 = 136
+ ema.update(150);
+ Assert.assertEquals(136.0, ema.getEmaMs(), 0.01);
+
+ // Fourth: EMA = 0.3*300 + 0.7*136 = 90 + 95.2 = 185.2
+ ema.update(300);
+ Assert.assertEquals(185.2, ema.getEmaMs(), 0.01);
+
+ // Fifth: EMA = 0.3*250 + 0.7*185.2 = 75 + 129.64 = 204.64
+ ema.update(250);
+ Assert.assertEquals(204.64, ema.getEmaMs(), 0.01);
+ }
+
+ /** With no samples recorded the global timeout is returned unchanged. */
+ @Test
+ public void testAdaptiveTimeoutUsesGlobalWhenNoSamples() {
+ PingLatencyEma ema = new PingLatencyEma(0.3, 3);
+ // No samples yet → return global timeout
+ Assert.assertEquals(30, ema.computeAdaptiveTimeout(30));
+ }
+
+ /** The global timeout acts as a floor for low-latency hosts. */
+ @Test
+ public void testAdaptiveTimeoutNeverBelowGlobal() {
+ PingLatencyEma ema = new PingLatencyEma(0.3, 3);
+ // Low latency: EMA=50ms → ema*3/1000+1 = 1s, but global=30s wins
+ ema.update(50);
+ Assert.assertEquals(30, ema.computeAdaptiveTimeout(30));
+ }
+
+ /** A high EMA raises the timeout above the global value. */
+ @Test
+ public void testAdaptiveTimeoutIncreasesForHighLatency() {
+ PingLatencyEma ema = new PingLatencyEma(0.3, 3);
+ // High latency: EMA=15000ms → ema*3/1000+1 = 46s > global 30s
+ ema.update(15000);
+ long timeout = ema.computeAdaptiveTimeout(30);
+ Assert.assertEquals(46, timeout);
+ }
+
+ /** The adaptive timeout never exceeds three times the global value. */
+ @Test
+ public void testAdaptiveTimeoutCappedAt3xGlobal() {
+ PingLatencyEma ema = new PingLatencyEma(0.3, 3);
+ // Very high latency: EMA=100000ms → ema*3/1000+1 = 301s, cap at 3*30=90s
+ ema.update(100000);
+ long timeout = ema.computeAdaptiveTimeout(30);
+ Assert.assertEquals(90, timeout);
+ }
+
+ /** After a spike, repeated low samples bring the timeout back down to the global value. */
+ @Test
+ public void testEmaConvergesAfterLatencyDrop() {
+ PingLatencyEma ema = new PingLatencyEma(0.3, 3);
+ // Spike then drop
+ ema.update(20000); // EMA = 20000
+ ema.update(100); // EMA = 0.3*100 + 0.7*20000 = 14030
+
+ // EMA still elevated — ema*3/1000+1 = 43s > global 30s
+ long timeout = ema.computeAdaptiveTimeout(30);
+ Assert.assertEquals(43, timeout);
+ // implied by the equality check above; kept as an explicit statement of intent
+ Assert.assertTrue("timeout should be > global after spike", timeout > 30);
+
+ // After many low-latency pings, EMA converges down
+ for (int i = 0; i < 30; i++) {
+ ema.update(100);
+ }
+ // EMA should be close to 100ms now → timeout = max(30, 1) = 30
+ Assert.assertEquals(30, ema.computeAdaptiveTimeout(30));
+ }
+}
diff --git a/conf/db/upgrade/V6.0.0.1__gpu_add_scope.sql b/conf/db/upgrade/V6.0.0.1__gpu_add_scope.sql
new file mode 100644
index 0000000000..6626e773d4
--- /dev/null
+++ b/conf/db/upgrade/V6.0.0.1__gpu_add_scope.sql
@@ -0,0 +1,38 @@
+-- V6.0.0.1: Add scope and chassisUuid to GpuDeviceVO for unified GPU management
+
+DELIMITER $$
+DROP PROCEDURE IF EXISTS add_gpu_scope_columns$$
+CREATE PROCEDURE add_gpu_scope_columns()
+BEGIN
+ -- Each DDL statement is guarded by an information_schema lookup so the
+ -- procedure can be re-run without failing on an already-upgraded schema.
+
+ -- Add `scope` (defaults to 'VM' for all existing rows) if it is missing
+ IF NOT EXISTS (SELECT 1 FROM information_schema.COLUMNS
+ WHERE TABLE_SCHEMA = DATABASE()
+ AND TABLE_NAME = 'GpuDeviceVO'
+ AND COLUMN_NAME = 'scope') THEN
+ ALTER TABLE `GpuDeviceVO` ADD COLUMN `scope` VARCHAR(32) DEFAULT 'VM' NOT NULL;
+ END IF;
+
+ -- Add nullable `chassisUuid` if it is missing
+ IF NOT EXISTS (SELECT 1 FROM information_schema.COLUMNS
+ WHERE TABLE_SCHEMA = DATABASE()
+ AND TABLE_NAME = 'GpuDeviceVO'
+ AND COLUMN_NAME = 'chassisUuid') THEN
+ ALTER TABLE `GpuDeviceVO` ADD COLUMN `chassisUuid` VARCHAR(32) DEFAULT NULL;
+ END IF;
+
+ -- Mark HAMI-virtualized GPUs as CONTAINER scope
+ -- (runs unconditionally; rows are matched to PciDeviceVO by shared uuid)
+ UPDATE `GpuDeviceVO` g
+ JOIN `PciDeviceVO` p ON g.`uuid` = p.`uuid`
+ SET g.`scope` = 'CONTAINER'
+ WHERE p.`virtStatus` = 'HAMI_VIRTUALIZED';
+
+ -- Index for scope-based queries
+ IF NOT EXISTS (SELECT 1 FROM information_schema.STATISTICS
+ WHERE TABLE_SCHEMA = DATABASE()
+ AND TABLE_NAME = 'GpuDeviceVO'
+ AND INDEX_NAME = 'idxGpuDeviceVOScope') THEN
+ CREATE INDEX `idxGpuDeviceVOScope` ON `GpuDeviceVO` (`scope`);
+ END IF;
+END$$
+DELIMITER ;
+
+CALL add_gpu_scope_columns();
+DROP PROCEDURE IF EXISTS add_gpu_scope_columns;
diff --git a/conf/db/upgrade/V6.0.0.2__bm2_gpu_migrate.sql b/conf/db/upgrade/V6.0.0.2__bm2_gpu_migrate.sql
new file mode 100644
index 0000000000..0aa772b9bf
--- /dev/null
+++ b/conf/db/upgrade/V6.0.0.2__bm2_gpu_migrate.sql
@@ -0,0 +1,88 @@
+-- V6.0.0.2: Migrate BareMetal2 GPU devices to unified GpuDeviceVO via phantom hosts
+-- Conditional: only runs if BareMetal2 tables exist (skipped in non-BM2 deployments)
+
+DELIMITER $$
+DROP PROCEDURE IF EXISTS bm2_gpu_migrate$$
+CREATE PROCEDURE bm2_gpu_migrate()
+BEGIN
+    -- The whole migration is skipped when the BareMetal2 GPU table is absent.
+    -- Every INSERT below is guarded by NOT EXISTS so the procedure can be re-run.
+    IF (SELECT COUNT(*) FROM information_schema.TABLES
+        WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 'BareMetal2ChassisGpuDeviceVO') > 0 THEN
+
+        -- Step 1: Create phantom hosts for each BM2 chassis that has GPU devices.
+        -- The phantom uuid is 'ph-' plus the first 29 characters of the chassis uuid,
+        -- which keeps it at 32 characters.
+        INSERT INTO `HostEO` (
+            `uuid`, `name`, `description`,
+            `zoneUuid`, `clusterUuid`,
+            `managementIp`, `hypervisorType`, `state`, `status`,
+            `createDate`, `lastOpDate`
+        )
+        SELECT
+            CONCAT('ph-', SUBSTRING(c.`uuid`, 1, 29)),
+            CONCAT('[BM2] ', c.`name`),
+            CONCAT('Phantom host for BareMetal2 chassis ', c.`uuid`),
+            c.`zoneUuid`,
+            c.`clusterUuid`,
+            '',
+            'BareMetal2',
+            'Enabled',
+            'Connected',
+            c.`createDate`,
+            NOW()
+        FROM `BareMetal2ChassisVO` c
+        WHERE c.`uuid` IN (
+            SELECT DISTINCT bm.`chassisUuid`
+            FROM `BareMetal2ChassisPciDeviceVO` bm
+            WHERE bm.`uuid` IN (SELECT `uuid` FROM `BareMetal2ChassisGpuDeviceVO`)
+        )
+        AND NOT EXISTS (
+            SELECT 1 FROM `HostEO` h WHERE h.`uuid` = CONCAT('ph-', SUBSTRING(c.`uuid`, 1, 29))
+        );
+
+        -- Step 2: Migrate PCI base data to PciDeviceVO, attached to the phantom host
+        INSERT INTO `PciDeviceVO` (
+            `uuid`, `name`, `description`,
+            `hostUuid`,
+            `type`, `state`, `status`, `virtStatus`,
+            `vendorId`, `deviceId`, `subvendorId`, `subdeviceId`,
+            `pciDeviceAddress`, `iommuGroup`,
+            `vendor`, `device`,
+            `createDate`, `lastOpDate`
+        )
+        SELECT
+            bm.`uuid`, bm.`name`, bm.`description`,
+            CONCAT('ph-', SUBSTRING(bm.`chassisUuid`, 1, 29)),
+            bm.`type`, 'Enabled', 'Active', 'UNVIRTUALIZABLE',
+            bm.`vendorId`, bm.`deviceId`, bm.`subvendorId`, bm.`subdeviceId`,
+            bm.`pciDeviceAddress`, bm.`iommuGroup`,
+            bm.`vendor`, bm.`device`,
+            bm.`createDate`, bm.`lastOpDate`
+        FROM `BareMetal2ChassisPciDeviceVO` bm
+        WHERE bm.`uuid` IN (SELECT `uuid` FROM `BareMetal2ChassisGpuDeviceVO`)
+        AND NOT EXISTS (
+            SELECT 1 FROM `PciDeviceVO` p WHERE p.`uuid` = bm.`uuid`
+        );
+
+        -- Step 3: Migrate GPU extension data to GpuDeviceVO
+        INSERT INTO `GpuDeviceVO` (
+            `uuid`, `serialNumber`, `memory`, `power`, `isDriverLoaded`,
+            `scope`, `chassisUuid`
+        )
+        SELECT
+            bg.`uuid`, bg.`serialNumber`, bg.`memory`, bg.`power`, bg.`isDriverLoaded`,
+            'BARE_METAL', bm.`chassisUuid`
+        FROM `BareMetal2ChassisGpuDeviceVO` bg
+        JOIN `BareMetal2ChassisPciDeviceVO` bm ON bg.`uuid` = bm.`uuid`
+        WHERE NOT EXISTS (
+            SELECT 1 FROM `GpuDeviceVO` g WHERE g.`uuid` = bg.`uuid`
+        );
+
+        -- Step 4: Mark old table as migrated (preserve for 1 version).
+        -- The column is added behind an information_schema guard, the same pattern
+        -- V6.0.0.1 uses, because `ADD COLUMN IF NOT EXISTS` is MariaDB-only syntax
+        -- that MySQL rejects.
+        IF NOT EXISTS (SELECT 1 FROM information_schema.COLUMNS
+                       WHERE TABLE_SCHEMA = DATABASE()
+                         AND TABLE_NAME = 'BareMetal2ChassisPciDeviceVO'
+                         AND COLUMN_NAME = '_migrated') THEN
+            ALTER TABLE `BareMetal2ChassisPciDeviceVO` ADD COLUMN `_migrated` TINYINT(1) DEFAULT 0;
+        END IF;
+
+        UPDATE `BareMetal2ChassisPciDeviceVO` SET `_migrated` = 1
+            WHERE `uuid` IN (SELECT `uuid` FROM `BareMetal2ChassisGpuDeviceVO`);
+
+    END IF;
+END$$
+DELIMITER ;
+
+CALL bm2_gpu_migrate();
+DROP PROCEDURE IF EXISTS bm2_gpu_migrate;
diff --git a/conf/db/upgrade/V6.0.0.3__mdev_subtable.sql b/conf/db/upgrade/V6.0.0.3__mdev_subtable.sql
new file mode 100644
index 0000000000..276918cbbf
--- /dev/null
+++ b/conf/db/upgrade/V6.0.0.3__mdev_subtable.sql
@@ -0,0 +1,104 @@
+-- V6.0.0.3: Convert MdevDeviceVO from standalone to PciDeviceVO subtable
+
+DELIMITER $$
+DROP PROCEDURE IF EXISTS migrate_mdev_subtable$$
+CREATE PROCEDURE migrate_mdev_subtable()
+BEGIN
+ -- Phase A (Steps 1-4) restructures the table and runs at most once: it is
+ -- gated on the new foreign key `fkMdevDeviceVOPciDeviceVO` being absent.
+ -- Phase B (Steps 5-6) moves account references and runs whenever MdevDeviceVO exists.
+
+ -- Only proceed if old MdevDeviceVO exists and is NOT yet a subtable of PciDeviceVO
+ IF EXISTS (SELECT 1 FROM information_schema.TABLES
+ WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 'MdevDeviceVO')
+ AND NOT EXISTS (SELECT 1 FROM information_schema.TABLE_CONSTRAINTS
+ WHERE TABLE_SCHEMA = DATABASE()
+ AND TABLE_NAME = 'MdevDeviceVO'
+ AND CONSTRAINT_NAME = 'fkMdevDeviceVOPciDeviceVO') THEN
+
+ -- Step 1: Migrate MdevDeviceVO shared fields into PciDeviceVO
+ -- vendorId/deviceId/subvendorId/subdeviceId are copied from the parent PCI row;
+ -- because of the LEFT JOIN they are NULL when no parent row matches.
+ -- The old mdevDeviceAddress is stored as pciDeviceAddress.
+ INSERT INTO `PciDeviceVO` (
+ `uuid`, `name`, `description`,
+ `hostUuid`, `parentUuid`, `vmInstanceUuid`,
+ `type`, `state`, `status`, `virtStatus`, `chooser`,
+ `vendor`,
+ `vendorId`, `deviceId`, `subvendorId`, `subdeviceId`,
+ `pciDeviceAddress`,
+ `createDate`, `lastOpDate`
+ )
+ SELECT
+ m.`uuid`, m.`name`, m.`description`,
+ m.`hostUuid`, m.`parentUuid`, m.`vmInstanceUuid`,
+ m.`type`,
+ m.`state`,
+ m.`status`,
+ 'VFIO_MDEV_VIRTUAL',
+ m.`chooser`,
+ m.`vendor`,
+ p.`vendorId`, p.`deviceId`, p.`subvendorId`, p.`subdeviceId`,
+ m.`mdevDeviceAddress`,
+ m.`createDate`, m.`lastOpDate`
+ FROM `MdevDeviceVO` m
+ LEFT JOIN `PciDeviceVO` p ON m.`parentUuid` = p.`uuid`
+ WHERE NOT EXISTS (
+ SELECT 1 FROM `PciDeviceVO` pci WHERE pci.`uuid` = m.`uuid`
+ );
+
+ -- Step 2: Rename old MdevDeviceVO table
+ -- NOTE(review): presumably fails if `MdevDeviceVO_old` is left over from an
+ -- earlier partial run — confirm whether that case needs handling.
+ RENAME TABLE `MdevDeviceVO` TO `MdevDeviceVO_old`;
+
+ -- Step 3: Create new MdevDeviceVO as subtable of PciDeviceVO
+ -- (shares its primary key with PciDeviceVO; deleting the PCI row cascades here)
+ CREATE TABLE `MdevDeviceVO` (
+ `uuid` VARCHAR(32) NOT NULL UNIQUE,
+ `mdevSpecUuid` VARCHAR(32) DEFAULT NULL,
+ `mttyUuid` VARCHAR(32) DEFAULT NULL,
+ `mdevDeviceAddress` VARCHAR(128) DEFAULT NULL,
+ PRIMARY KEY (`uuid`),
+ CONSTRAINT `fkMdevDeviceVOPciDeviceVO`
+ FOREIGN KEY (`uuid`) REFERENCES `PciDeviceVO`(`uuid`) ON DELETE CASCADE,
+ CONSTRAINT `fkMdevDeviceVOMdevSpecVO`
+ FOREIGN KEY (`mdevSpecUuid`) REFERENCES `MdevDeviceSpecVO`(`uuid`) ON DELETE SET NULL,
+ CONSTRAINT `fkMdevDeviceVOMttyDeviceVO`
+ FOREIGN KEY (`mttyUuid`) REFERENCES `MttyDeviceVO`(`uuid`) ON DELETE CASCADE
+ ) ENGINE=InnoDB;
+
+ -- Step 4: Migrate mdev-specific data to new subtable
+ -- NOTE(review): INSERT IGNORE presumably skips rows that violate a constraint
+ -- (e.g. a dangling mdevSpecUuid/mttyUuid) instead of aborting — confirm such
+ -- rows may be dropped silently.
+ INSERT IGNORE INTO `MdevDeviceVO` (`uuid`, `mdevSpecUuid`, `mttyUuid`, `mdevDeviceAddress`)
+ SELECT `uuid`, `mdevSpecUuid`, `mttyUuid`, `mdevDeviceAddress`
+ FROM `MdevDeviceVO_old`;
+
+ END IF;
+
+ -- Only run AccountResourceRefVO migration if MdevDeviceVO table exists
+ IF EXISTS (SELECT 1 FROM information_schema.TABLES
+ WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 'MdevDeviceVO') THEN
+
+ -- Step 5: Migrate AccountResourceRefVO references (safe to re-run)
+ -- For every 'MdevDeviceVO' reference a new row with a fresh uuid and
+ -- resourceType 'PciDeviceVO' is inserted, unless one already exists.
+ -- NOTE(review): concreteResourceType is set to the PciDeviceVO class for
+ -- mdev rows — confirm this is the intended value.
+ INSERT IGNORE INTO `AccountResourceRefVO` (
+ `uuid`, `accountUuid`, `ownerAccountUuid`, `resourceUuid`,
+ `resourceType`, `concreteResourceType`, `createDate`, `lastOpDate`
+ )
+ SELECT
+ REPLACE(UUID(), '-', ''),
+ ar.`accountUuid`, ar.`ownerAccountUuid`,
+ ar.`resourceUuid`,
+ 'PciDeviceVO',
+ 'org.zstack.pciDevice.PciDeviceVO',
+ NOW(), NOW()
+ FROM `AccountResourceRefVO` ar
+ WHERE ar.`resourceType` = 'MdevDeviceVO'
+ AND ar.`resourceUuid` IN (SELECT `uuid` FROM `MdevDeviceVO`)
+ AND NOT EXISTS (
+ SELECT 1 FROM `AccountResourceRefVO` a2
+ WHERE a2.`resourceUuid` = ar.`resourceUuid` AND a2.`resourceType` = 'PciDeviceVO'
+ );
+
+ -- Step 6: Remove old MdevDeviceVO AccountResourceRefVO entries
+ -- (only for uuids present in the new MdevDeviceVO table)
+ DELETE FROM `AccountResourceRefVO`
+ WHERE `resourceType` = 'MdevDeviceVO'
+ AND `resourceUuid` IN (SELECT `uuid` FROM `MdevDeviceVO`);
+
+ END IF;
+
+ -- MdevDeviceVO_old preserved for rollback verification (drop in next version)
+END$$
+DELIMITER ;
+
+CALL migrate_mdev_subtable();
+DROP PROCEDURE IF EXISTS migrate_mdev_subtable;
diff --git a/core/src/main/java/org/zstack/core/thread/AsyncTimer.java b/core/src/main/java/org/zstack/core/thread/AsyncTimer.java
index 3f877ab061..54fe25b2ea 100755
--- a/core/src/main/java/org/zstack/core/thread/AsyncTimer.java
+++ b/core/src/main/java/org/zstack/core/thread/AsyncTimer.java
@@ -69,6 +69,17 @@ public void startRightNow() {
}
}
+    /**
+     * Submits this timer as a timeout task that fires after the given delay and
+     * keeps the returned handle in {@code cancel}.
+     *
+     * @param delay     how long to wait before the task fires
+     * @param delayUnit unit of {@code delay}
+     * @throws CloudRuntimeException if the timer has already been cancelled
+     */
+    public void startWithDelay(long delay, TimeUnit delayUnit) {
+        final boolean alreadyCancelled = cancelled.get();
+        if (alreadyCancelled) {
+            throw new CloudRuntimeException("cannot start a cancelled timer");
+        }
+
+        cancel = thdf.submitTimeoutTask(this, delayUnit, delay);
+
+        if (!logger.isTraceEnabled()) {
+            return;
+        }
+        final String traceLine = String.format("%s starts with delay %d %s", getName(), delay, delayUnit);
+        logger.trace(traceLine);
+    }
+
public void cancel() {
cancelled.set(true);
if (cancel != null) {
diff --git a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java
index a245757517..bf01e6234e 100755
--- a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java
+++ b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java
@@ -169,6 +169,9 @@ public class KVMHost extends HostBase implements Host {
private KVMHostContext context;
+ // Per-host ping latency EMA for adaptive timeout (ZSTAC-67534)
+ private final PingLatencyEma pingLatencyEma = new PingLatencyEma(0.3, 3);
+
// ///////////////////// REST URL //////////////////////////
private String baseUrl;
private String connectPath;
@@ -5011,6 +5014,11 @@ public void run(FlowTrigger trigger, Map data) {
cmd.hostUuid = self.getUuid();
cmd.kvmagentPhysicalMemoryUsageAlarmThreshold = gcf.getConfigValue(KVMGlobalConfig.CATEGORY, KVMGlobalConfig.KVMAGENT_PHYSICAL_MEMORY_USAGE_ALARM_THRESHOLD.getName(), Long.class);
cmd.kvmagentPhysicalMemoryUsageHardLimit = gcf.getConfigValue(KVMGlobalConfig.CATEGORY, KVMGlobalConfig.KVMAGENT_PHYSICAL_MEMORY_USAGE_HARD_LIMIT.getName(), Long.class);
+
+ long globalTimeout = HostGlobalConfig.PING_HOST_TIMEOUT.value(Long.class);
+ long adaptiveTimeout = pingLatencyEma.computeAdaptiveTimeout(globalTimeout);
+ final long pingStartMs = System.currentTimeMillis();
+
restf.asyncJsonPost(pingPath, cmd, new JsonAsyncRESTCallback(trigger) {
@Override
public void fail(ErrorCode err) {
@@ -5019,6 +5027,8 @@ public void fail(ErrorCode err) {
@Override
public void success(PingResponse ret) {
+ pingLatencyEma.update(System.currentTimeMillis() - pingStartMs);
+
if (!ret.isSuccess()) {
trigger.fail(operr(ORG_ZSTACK_KVM_10091, "%s", ret.getError()));
return;
@@ -5047,7 +5057,7 @@ public void success(PingResponse ret) {
public Class getReturnClass() {
return PingResponse.class;
}
- },TimeUnit.SECONDS, HostGlobalConfig.PING_HOST_TIMEOUT.value(Long.class));
+ },TimeUnit.SECONDS, adaptiveTimeout);
}
});
diff --git a/sdk/src/main/java/org/zstack/sdk/GetGpuMetricsBatchAction.java b/sdk/src/main/java/org/zstack/sdk/GetGpuMetricsBatchAction.java
new file mode 100644
index 0000000000..b7b7ace265
--- /dev/null
+++ b/sdk/src/main/java/org/zstack/sdk/GetGpuMetricsBatchAction.java
@@ -0,0 +1,107 @@
+package org.zstack.sdk;
+
+import java.util.HashMap;
+import java.util.Map;
+import org.zstack.sdk.*;
+
+/**
+ * SDK action for {@code GET /gpu-device/metrics/batch}. Requires a session and
+ * does not poll for completion.
+ */
+public class GetGpuMetricsBatchAction extends AbstractAction {
+
+    private static final HashMap parameterMap = new HashMap<>();
+
+    private static final HashMap nonAPIParameterMap = new HashMap<>();
+
+    /** Outcome of a call: {@code error} is set on failure, otherwise {@code value} is non-null. */
+    public static class Result {
+        public ErrorCode error;
+        public org.zstack.sdk.GetGpuMetricsBatchResult value;
+
+        /** Returns this result, or throws {@link ApiException} when {@code error} is set. */
+        public Result throwExceptionIfError() {
+            if (error == null) {
+                return this;
+            }
+
+            throw new ApiException(
+                String.format("error[code: %s, description: %s, details: %s, globalErrorCode: %s]", error.code, error.description, error.details, error.globalErrorCode)
+            );
+        }
+    }
+
+    @Param(required = true, nonempty = true, nullElements = false, emptyString = true, noTrim = false)
+    public java.util.List gpuDeviceUuids;
+
+    @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false)
+    public java.util.List metricNames;
+
+    @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false)
+    public java.lang.Long startTime;
+
+    @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false)
+    public java.lang.Long endTime;
+
+    @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false)
+    public java.lang.Integer period;
+
+    @Param(required = false)
+    public java.util.List systemTags;
+
+    @Param(required = false)
+    public java.util.List userTags;
+
+    @Param(required = false)
+    public String sessionId;
+
+    @Param(required = false)
+    public String accessKeyId;
+
+    @Param(required = false)
+    public String accessKeySecret;
+
+    @Param(required = false)
+    public String requestIp;
+
+
+    /** Converts the raw API result; an absent payload is replaced by an empty result object. */
+    private Result makeResult(ApiResult res) {
+        Result outcome = new Result();
+        if (res.error == null) {
+            org.zstack.sdk.GetGpuMetricsBatchResult parsed = res.getResult(org.zstack.sdk.GetGpuMetricsBatchResult.class);
+            outcome.value = parsed != null ? parsed : new org.zstack.sdk.GetGpuMetricsBatchResult();
+        } else {
+            outcome.error = res.error;
+        }
+        return outcome;
+    }
+
+    /** Invokes the API through ZSClient and returns the converted result. */
+    public Result call() {
+        return makeResult(ZSClient.call(this));
+    }
+
+    /** Invokes the API through ZSClient and hands the converted result to {@code completion}. */
+    public void call(final Completion completion) {
+        ZSClient.call(this, new InternalCompletion() {
+            @Override
+            public void complete(ApiResult res) {
+                Result outcome = makeResult(res);
+                completion.complete(outcome);
+            }
+        });
+    }
+
+    protected Map getParameterMap() {
+        return parameterMap;
+    }
+
+    protected Map getNonAPIParameterMap() {
+        return nonAPIParameterMap;
+    }
+
+    /** Describes the REST endpoint this action maps to. */
+    protected RestInfo getRestInfo() {
+        RestInfo rest = new RestInfo();
+        rest.httpMethod = "GET";
+        rest.path = "/gpu-device/metrics/batch";
+        rest.needSession = true;
+        rest.needPoll = false;
+        rest.parameterName = "";
+        return rest;
+    }
+
+}
diff --git a/sdk/src/main/java/org/zstack/sdk/GetGpuMetricsBatchResult.java b/sdk/src/main/java/org/zstack/sdk/GetGpuMetricsBatchResult.java
new file mode 100644
index 0000000000..195301f5b0
--- /dev/null
+++ b/sdk/src/main/java/org/zstack/sdk/GetGpuMetricsBatchResult.java
@@ -0,0 +1,14 @@
+package org.zstack.sdk;
+
+
+
+/** Payload carried in {@code GetGpuMetricsBatchAction.Result.value}; exposes a single {@code metrics} map. */
+public class GetGpuMetricsBatchResult {
+    public java.util.Map metrics;
+
+    public java.util.Map getMetrics() {
+        return metrics;
+    }
+
+    public void setMetrics(java.util.Map metrics) {
+        this.metrics = metrics;
+    }
+
+}
diff --git a/sdk/src/main/java/org/zstack/sdk/GetGpuResourceTreeAction.java b/sdk/src/main/java/org/zstack/sdk/GetGpuResourceTreeAction.java
new file mode 100644
index 0000000000..7b3f09581e
--- /dev/null
+++ b/sdk/src/main/java/org/zstack/sdk/GetGpuResourceTreeAction.java
@@ -0,0 +1,107 @@
+package org.zstack.sdk;
+
+import java.util.HashMap;
+import java.util.Map;
+import org.zstack.sdk.*;
+
+/**
+ * SDK action for {@code GET /gpu-device/resource-tree}. Requires a session and
+ * does not poll for completion.
+ */
+public class GetGpuResourceTreeAction extends AbstractAction {
+
+    private static final HashMap parameterMap = new HashMap<>();
+
+    private static final HashMap nonAPIParameterMap = new HashMap<>();
+
+    /** Outcome of a call: {@code error} is set on failure, otherwise {@code value} is non-null. */
+    public static class Result {
+        public ErrorCode error;
+        public org.zstack.sdk.GetGpuResourceTreeResult value;
+
+        /** Returns this result, or throws {@link ApiException} when {@code error} is set. */
+        public Result throwExceptionIfError() {
+            if (error == null) {
+                return this;
+            }
+
+            throw new ApiException(
+                String.format("error[code: %s, description: %s, details: %s, globalErrorCode: %s]", error.code, error.description, error.details, error.globalErrorCode)
+            );
+        }
+    }
+
+    @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false)
+    public java.util.List zoneUuids;
+
+    @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false)
+    public java.util.List clusterUuids;
+
+    @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false)
+    public java.util.List hostUuids;
+
+    @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false)
+    public java.lang.String scope;
+
+    @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false)
+    public java.lang.String allocateStatus;
+
+    @Param(required = false)
+    public java.util.List systemTags;
+
+    @Param(required = false)
+    public java.util.List userTags;
+
+    @Param(required = false)
+    public String sessionId;
+
+    @Param(required = false)
+    public String accessKeyId;
+
+    @Param(required = false)
+    public String accessKeySecret;
+
+    @Param(required = false)
+    public String requestIp;
+
+
+    /** Converts the raw API result; an absent payload is replaced by an empty result object. */
+    private Result makeResult(ApiResult res) {
+        Result outcome = new Result();
+        if (res.error == null) {
+            org.zstack.sdk.GetGpuResourceTreeResult parsed = res.getResult(org.zstack.sdk.GetGpuResourceTreeResult.class);
+            outcome.value = parsed != null ? parsed : new org.zstack.sdk.GetGpuResourceTreeResult();
+        } else {
+            outcome.error = res.error;
+        }
+        return outcome;
+    }
+
+    /** Invokes the API through ZSClient and returns the converted result. */
+    public Result call() {
+        return makeResult(ZSClient.call(this));
+    }
+
+    /** Invokes the API through ZSClient and hands the converted result to {@code completion}. */
+    public void call(final Completion completion) {
+        ZSClient.call(this, new InternalCompletion() {
+            @Override
+            public void complete(ApiResult res) {
+                Result outcome = makeResult(res);
+                completion.complete(outcome);
+            }
+        });
+    }
+
+    protected Map getParameterMap() {
+        return parameterMap;
+    }
+
+    protected Map getNonAPIParameterMap() {
+        return nonAPIParameterMap;
+    }
+
+    /** Describes the REST endpoint this action maps to. */
+    protected RestInfo getRestInfo() {
+        RestInfo rest = new RestInfo();
+        rest.httpMethod = "GET";
+        rest.path = "/gpu-device/resource-tree";
+        rest.needSession = true;
+        rest.needPoll = false;
+        rest.parameterName = "";
+        return rest;
+    }
+
+}
diff --git a/sdk/src/main/java/org/zstack/sdk/GetGpuResourceTreeResult.java b/sdk/src/main/java/org/zstack/sdk/GetGpuResourceTreeResult.java
new file mode 100644
index 0000000000..97262ae70c
--- /dev/null
+++ b/sdk/src/main/java/org/zstack/sdk/GetGpuResourceTreeResult.java
@@ -0,0 +1,38 @@
+package org.zstack.sdk;
+
+
+
+/**
+ * Payload carried in {@code GetGpuResourceTreeAction.Result.value}: a list of tree
+ * entries plus three GPU counters.
+ */
+public class GetGpuResourceTreeResult {
+    public java.util.List tree;
+    public int totalGpuCount;
+    public int allocatedGpuCount;
+    public int unallocatedGpuCount;
+
+    public java.util.List getTree() {
+        return tree;
+    }
+
+    public void setTree(java.util.List tree) {
+        this.tree = tree;
+    }
+
+    public int getTotalGpuCount() {
+        return totalGpuCount;
+    }
+
+    public void setTotalGpuCount(int totalGpuCount) {
+        this.totalGpuCount = totalGpuCount;
+    }
+
+    public int getAllocatedGpuCount() {
+        return allocatedGpuCount;
+    }
+
+    public void setAllocatedGpuCount(int allocatedGpuCount) {
+        this.allocatedGpuCount = allocatedGpuCount;
+    }
+
+    public int getUnallocatedGpuCount() {
+        return unallocatedGpuCount;
+    }
+
+    public void setUnallocatedGpuCount(int unallocatedGpuCount) {
+        this.unallocatedGpuCount = unallocatedGpuCount;
+    }
+
+}
diff --git a/sdk/src/main/java/org/zstack/sdk/GpuMetricsEntry.java b/sdk/src/main/java/org/zstack/sdk/GpuMetricsEntry.java
new file mode 100644
index 0000000000..96fa15bae7
--- /dev/null
+++ b/sdk/src/main/java/org/zstack/sdk/GpuMetricsEntry.java
@@ -0,0 +1,94 @@
+package org.zstack.sdk;
+
+
+
+/**
+ * Plain data holder for one GPU's metrics: identifying uuids and PCI address,
+ * five numeric readings, a status string and a map of extra metrics.
+ */
+public class GpuMetricsEntry {
+    public String gpuDeviceUuid;
+    public String hostUuid;
+    public String vmInstanceUuid;
+    public String pciDeviceAddress;
+    public Double utilization;
+    public Double memoryUtilization;
+    public Double temperature;
+    public Double powerDraw;
+    public Double fanSpeed;
+    public String gpuStatus;
+    public java.util.Map extraMetrics;
+
+    public String getGpuDeviceUuid() {
+        return gpuDeviceUuid;
+    }
+
+    public void setGpuDeviceUuid(String gpuDeviceUuid) {
+        this.gpuDeviceUuid = gpuDeviceUuid;
+    }
+
+    public String getHostUuid() {
+        return hostUuid;
+    }
+
+    public void setHostUuid(String hostUuid) {
+        this.hostUuid = hostUuid;
+    }
+
+    public String getVmInstanceUuid() {
+        return vmInstanceUuid;
+    }
+
+    public void setVmInstanceUuid(String vmInstanceUuid) {
+        this.vmInstanceUuid = vmInstanceUuid;
+    }
+
+    public String getPciDeviceAddress() {
+        return pciDeviceAddress;
+    }
+
+    public void setPciDeviceAddress(String pciDeviceAddress) {
+        this.pciDeviceAddress = pciDeviceAddress;
+    }
+
+    public Double getUtilization() {
+        return utilization;
+    }
+
+    public void setUtilization(Double utilization) {
+        this.utilization = utilization;
+    }
+
+    public Double getMemoryUtilization() {
+        return memoryUtilization;
+    }
+
+    public void setMemoryUtilization(Double memoryUtilization) {
+        this.memoryUtilization = memoryUtilization;
+    }
+
+    public Double getTemperature() {
+        return temperature;
+    }
+
+    public void setTemperature(Double temperature) {
+        this.temperature = temperature;
+    }
+
+    public Double getPowerDraw() {
+        return powerDraw;
+    }
+
+    public void setPowerDraw(Double powerDraw) {
+        this.powerDraw = powerDraw;
+    }
+
+    public Double getFanSpeed() {
+        return fanSpeed;
+    }
+
+    public void setFanSpeed(Double fanSpeed) {
+        this.fanSpeed = fanSpeed;
+    }
+
+    public String getGpuStatus() {
+        return gpuStatus;
+    }
+
+    public void setGpuStatus(String gpuStatus) {
+        this.gpuStatus = gpuStatus;
+    }
+
+    public java.util.Map getExtraMetrics() {
+        return extraMetrics;
+    }
+
+    public void setExtraMetrics(java.util.Map extraMetrics) {
+        this.extraMetrics = extraMetrics;
+    }
+
+}
diff --git a/sdk/src/main/java/org/zstack/sdk/GpuResourceTreeNode.java b/sdk/src/main/java/org/zstack/sdk/GpuResourceTreeNode.java
new file mode 100644
index 0000000000..9510f9d319
--- /dev/null
+++ b/sdk/src/main/java/org/zstack/sdk/GpuResourceTreeNode.java
@@ -0,0 +1,78 @@
+package org.zstack.sdk;
+
+
+
+/**
+ * Plain data holder for one node of the GPU resource tree: identity fields,
+ * GPU counters, an optional GPU inventory and a list of child entries.
+ */
+public class GpuResourceTreeNode {
+    public String uuid;
+    public String name;
+    public String nodeType;
+    public int totalGpuCount;
+    public int allocatedGpuCount;
+    public int unallocatedGpuCount;
+    public GpuDeviceInventory gpu;
+    public int mdevChildrenCount;
+    public java.util.List children;
+
+    public String getUuid() {
+        return uuid;
+    }
+
+    public void setUuid(String uuid) {
+        this.uuid = uuid;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public String getNodeType() {
+        return nodeType;
+    }
+
+    public void setNodeType(String nodeType) {
+        this.nodeType = nodeType;
+    }
+
+    public int getTotalGpuCount() {
+        return totalGpuCount;
+    }
+
+    public void setTotalGpuCount(int totalGpuCount) {
+        this.totalGpuCount = totalGpuCount;
+    }
+
+    public int getAllocatedGpuCount() {
+        return allocatedGpuCount;
+    }
+
+    public void setAllocatedGpuCount(int allocatedGpuCount) {
+        this.allocatedGpuCount = allocatedGpuCount;
+    }
+
+    public int getUnallocatedGpuCount() {
+        return unallocatedGpuCount;
+    }
+
+    public void setUnallocatedGpuCount(int unallocatedGpuCount) {
+        this.unallocatedGpuCount = unallocatedGpuCount;
+    }
+
+    public GpuDeviceInventory getGpu() {
+        return gpu;
+    }
+
+    public void setGpu(GpuDeviceInventory gpu) {
+        this.gpu = gpu;
+    }
+
+    public int getMdevChildrenCount() {
+        return mdevChildrenCount;
+    }
+
+    public void setMdevChildrenCount(int mdevChildrenCount) {
+        this.mdevChildrenCount = mdevChildrenCount;
+    }
+
+    public java.util.List getChildren() {
+        return children;
+    }
+
+    public void setChildren(java.util.List children) {
+        this.children = children;
+    }
+
+}