/**
 * Exponential moving average (EMA) tracker for per-host ping latency.
 * Used to compute adaptive ping timeouts (ZSTAC-67534).
 *
 * <p>Thread-safety: {@link #update(long)} is synchronized so that concurrent
 * samples cannot overwrite each other's read-modify-write; readers stay
 * lock-free because the average is published through a {@code volatile} field.
 */
public class PingLatencyEma {
    /** Sentinel stored in {@link #emaMs} until the first valid sample arrives. */
    private static final double NO_SAMPLE = -1;

    /** The adaptive timeout never exceeds this multiple of the global timeout. */
    private static final long MAX_TIMEOUT_FACTOR = 3;

    private volatile double emaMs = NO_SAMPLE;
    private final double alpha;
    private final int timeoutMultiplier;

    /**
     * @param alpha             smoothing factor, weight of the newest sample; must be in (0, 1]
     * @param timeoutMultiplier factor applied to the average latency to derive a timeout; must be positive
     * @throws IllegalArgumentException if either argument is out of range
     */
    public PingLatencyEma(double alpha, int timeoutMultiplier) {
        // Written as a negated conjunction so that NaN is rejected as well.
        if (!(alpha > 0 && alpha <= 1)) {
            throw new IllegalArgumentException("alpha must be in (0, 1], got " + alpha);
        }
        if (timeoutMultiplier <= 0) {
            throw new IllegalArgumentException("timeoutMultiplier must be positive, got " + timeoutMultiplier);
        }
        this.alpha = alpha;
        this.timeoutMultiplier = timeoutMultiplier;
    }

    /**
     * Folds one latency sample into the average. The first valid sample seeds
     * the average directly.
     *
     * @param latencyMs measured round-trip time in milliseconds; negative values
     *                  (possible when the wall clock steps backwards between the
     *                  two timestamps) are ignored so they cannot drag the
     *                  average down or reset it to the "no sample" state
     */
    public synchronized void update(long latencyMs) {
        if (latencyMs < 0) {
            return;
        }
        double current = emaMs;
        emaMs = current < 0
                ? latencyMs
                : alpha * latencyMs + (1 - alpha) * current;
    }

    /**
     * Derives a timeout from the current average.
     *
     * @param globalTimeoutSeconds the configured timeout, in seconds
     * @return {@code globalTimeoutSeconds} when there is no usable average yet;
     *         otherwise {@code ema * timeoutMultiplier} converted to whole seconds
     *         plus one, clamped to
     *         {@code [globalTimeoutSeconds, 3 * globalTimeoutSeconds]}
     */
    public long computeAdaptiveTimeout(long globalTimeoutSeconds) {
        // Read the volatile once so the check and the arithmetic see the same value.
        double snapshotMs = emaMs;
        if (snapshotMs <= 0) {
            return globalTimeoutSeconds;
        }
        long emaBasedTimeout = (long) (snapshotMs * timeoutMultiplier / 1000) + 1;
        long floor = Math.max(globalTimeoutSeconds, emaBasedTimeout);
        return Math.min(floor, globalTimeoutSeconds * MAX_TIMEOUT_FACTOR);
    }

    /**
     * @return the current average in milliseconds, or a negative value when no
     *         sample has been recorded yet
     */
    public double getEmaMs() {
        return emaMs;
    }
}
+ // With uniform distribution, no bucket should be < 200 or > 400. + int expectedPerBucket = hostCount / bucketCount; + for (int i = 0; i < bucketCount; i++) { + Assert.assertTrue( + String.format("bucket %d has %d hosts, expected ~%d (uniform distribution)", + i, buckets[i], expectedPerBucket), + buckets[i] > expectedPerBucket / 2 && buckets[i] < expectedPerBucket * 2); + } + } + + /** + * Verify jitter range is [0, intervalMs) — never negative, never >= intervalMs. + */ + @Test + public void testJitterRange() { + long intervalMs = 60000; + for (int i = 0; i < 10000; i++) { + long jitter = ThreadLocalRandom.current().nextLong(intervalMs); + Assert.assertTrue("jitter should be >= 0", jitter >= 0); + Assert.assertTrue("jitter should be < intervalMs", jitter < intervalMs); + } + } + + /** + * AC: Not all hosts start at t=0. + * At least 90% of hosts should have jitter > 0. + */ + @Test + public void testNotAllStartAtZero() { + long intervalMs = 60000; + int hostCount = 3000; + int zeroCount = 0; + for (int i = 0; i < hostCount; i++) { + long jitter = ThreadLocalRandom.current().nextLong(intervalMs); + if (jitter == 0) { + zeroCount++; + } + } + // With 60000ms range, probability of jitter=0 is 1/60000. + // For 3000 hosts, expected zeros ≈ 0.05. Definitely < 10% of hosts. 
+ Assert.assertTrue( + String.format("too many hosts at t=0: %d/%d", zeroCount, hostCount), + zeroCount < hostCount / 10); + } +} diff --git a/compute/src/test/java/org/zstack/compute/host/PingLatencyEmaTest.java b/compute/src/test/java/org/zstack/compute/host/PingLatencyEmaTest.java new file mode 100644 index 0000000000..0426bfad98 --- /dev/null +++ b/compute/src/test/java/org/zstack/compute/host/PingLatencyEmaTest.java @@ -0,0 +1,88 @@ +package org.zstack.compute.host; + +import org.junit.Assert; +import org.junit.Test; + +public class PingLatencyEmaTest { + + /** + * AC: 合成延迟序列 [100, 200, 150, 300, 250]ms → EMA 产出正确自适应超时 + */ + @Test + public void testEmaWithSyntheticLatencySequence() { + PingLatencyEma ema = new PingLatencyEma(0.3, 3); + + // First sample: EMA = 100 + ema.update(100); + Assert.assertEquals(100.0, ema.getEmaMs(), 0.01); + + // Second: EMA = 0.3*200 + 0.7*100 = 60 + 70 = 130 + ema.update(200); + Assert.assertEquals(130.0, ema.getEmaMs(), 0.01); + + // Third: EMA = 0.3*150 + 0.7*130 = 45 + 91 = 136 + ema.update(150); + Assert.assertEquals(136.0, ema.getEmaMs(), 0.01); + + // Fourth: EMA = 0.3*300 + 0.7*136 = 90 + 95.2 = 185.2 + ema.update(300); + Assert.assertEquals(185.2, ema.getEmaMs(), 0.01); + + // Fifth: EMA = 0.3*250 + 0.7*185.2 = 75 + 129.64 = 204.64 + ema.update(250); + Assert.assertEquals(204.64, ema.getEmaMs(), 0.01); + } + + @Test + public void testAdaptiveTimeoutUsesGlobalWhenNoSamples() { + PingLatencyEma ema = new PingLatencyEma(0.3, 3); + // No samples yet → return global timeout + Assert.assertEquals(30, ema.computeAdaptiveTimeout(30)); + } + + @Test + public void testAdaptiveTimeoutNeverBelowGlobal() { + PingLatencyEma ema = new PingLatencyEma(0.3, 3); + // Low latency: EMA=50ms → ema*3/1000+1 = 1s, but global=30s wins + ema.update(50); + Assert.assertEquals(30, ema.computeAdaptiveTimeout(30)); + } + + @Test + public void testAdaptiveTimeoutIncreasesForHighLatency() { + PingLatencyEma ema = new PingLatencyEma(0.3, 3); + // High 
latency: EMA=15000ms → ema*3/1000+1 = 46s > global 30s + ema.update(15000); + long timeout = ema.computeAdaptiveTimeout(30); + Assert.assertEquals(46, timeout); + } + + @Test + public void testAdaptiveTimeoutCappedAt3xGlobal() { + PingLatencyEma ema = new PingLatencyEma(0.3, 3); + // Very high latency: EMA=100000ms → ema*3/1000+1 = 301s, cap at 3*30=90s + ema.update(100000); + long timeout = ema.computeAdaptiveTimeout(30); + Assert.assertEquals(90, timeout); + } + + @Test + public void testEmaConvergesAfterLatencyDrop() { + PingLatencyEma ema = new PingLatencyEma(0.3, 3); + // Spike then drop + ema.update(20000); // EMA = 20000 + ema.update(100); // EMA = 0.3*100 + 0.7*20000 = 14030 + + // EMA still elevated — ema*3/1000+1 = 43s > global 30s + long timeout = ema.computeAdaptiveTimeout(30); + Assert.assertEquals(43, timeout); + Assert.assertTrue("timeout should be > global after spike", timeout > 30); + + // After many low-latency pings, EMA converges down + for (int i = 0; i < 30; i++) { + ema.update(100); + } + // EMA should be close to 100ms now → timeout = max(30, 1) = 30 + Assert.assertEquals(30, ema.computeAdaptiveTimeout(30)); + } +} diff --git a/conf/db/upgrade/V6.0.0.1__gpu_add_scope.sql b/conf/db/upgrade/V6.0.0.1__gpu_add_scope.sql new file mode 100644 index 0000000000..6626e773d4 --- /dev/null +++ b/conf/db/upgrade/V6.0.0.1__gpu_add_scope.sql @@ -0,0 +1,38 @@ +-- V6.0.0.1: Add scope and chassisUuid to GpuDeviceVO for unified GPU management + +DELIMITER $$ +DROP PROCEDURE IF EXISTS add_gpu_scope_columns$$ +CREATE PROCEDURE add_gpu_scope_columns() +BEGIN + IF NOT EXISTS (SELECT 1 FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = DATABASE() + AND TABLE_NAME = 'GpuDeviceVO' + AND COLUMN_NAME = 'scope') THEN + ALTER TABLE `GpuDeviceVO` ADD COLUMN `scope` VARCHAR(32) DEFAULT 'VM' NOT NULL; + END IF; + + IF NOT EXISTS (SELECT 1 FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = DATABASE() + AND TABLE_NAME = 'GpuDeviceVO' + AND COLUMN_NAME = 
-- V6.0.0.2: Migrate BareMetal2 GPU devices to unified GpuDeviceVO via phantom hosts
-- Conditional: only runs if BareMetal2 tables exist (skipped in non-BM2 deployments)
--
-- Every INSERT is guarded by NOT EXISTS, so the procedure can be re-run safely.
-- Phantom host uuid = 'ph-' + first 29 characters of the chassis uuid (32 characters total).
-- NOTE(review): truncating the chassis uuid means two chassis sharing their first 29
-- characters would map to the same phantom host; the chance is tiny but not zero.
-- NOTE(review): confirm whether rows inserted directly into HostEO / PciDeviceVO also
-- need companion rows in other bookkeeping tables (e.g. ResourceVO); not visible here.

DELIMITER $$
DROP PROCEDURE IF EXISTS bm2_gpu_migrate$$
CREATE PROCEDURE bm2_gpu_migrate()
BEGIN
    IF (SELECT COUNT(*) FROM information_schema.TABLES
        WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 'BareMetal2ChassisGpuDeviceVO') > 0 THEN

        -- Step 1: Create phantom hosts for each BM2 chassis that has GPU devices
        INSERT INTO `HostEO` (
            `uuid`, `name`, `description`,
            `zoneUuid`, `clusterUuid`,
            `managementIp`, `hypervisorType`, `state`, `status`,
            `createDate`, `lastOpDate`
        )
        SELECT
            CONCAT('ph-', SUBSTRING(c.`uuid`, 1, 29)),
            CONCAT('[BM2] ', c.`name`),
            CONCAT('Phantom host for BareMetal2 chassis ', c.`uuid`),
            c.`zoneUuid`,
            c.`clusterUuid`,
            '',
            'BareMetal2',
            'Enabled',
            'Connected',
            c.`createDate`,
            NOW()
        FROM `BareMetal2ChassisVO` c
        WHERE c.`uuid` IN (
            SELECT DISTINCT bm.`chassisUuid`
            FROM `BareMetal2ChassisPciDeviceVO` bm
            WHERE bm.`uuid` IN (SELECT `uuid` FROM `BareMetal2ChassisGpuDeviceVO`)
        )
        AND NOT EXISTS (
            SELECT 1 FROM `HostEO` h WHERE h.`uuid` = CONCAT('ph-', SUBSTRING(c.`uuid`, 1, 29))
        );

        -- Step 2: Migrate PCI base data to PciDeviceVO
        INSERT INTO `PciDeviceVO` (
            `uuid`, `name`, `description`,
            `hostUuid`,
            `type`, `state`, `status`, `virtStatus`,
            `vendorId`, `deviceId`, `subvendorId`, `subdeviceId`,
            `pciDeviceAddress`, `iommuGroup`,
            `vendor`, `device`,
            `createDate`, `lastOpDate`
        )
        SELECT
            bm.`uuid`, bm.`name`, bm.`description`,
            CONCAT('ph-', SUBSTRING(bm.`chassisUuid`, 1, 29)),
            bm.`type`, 'Enabled', 'Active', 'UNVIRTUALIZABLE',
            bm.`vendorId`, bm.`deviceId`, bm.`subvendorId`, bm.`subdeviceId`,
            bm.`pciDeviceAddress`, bm.`iommuGroup`,
            bm.`vendor`, bm.`device`,
            bm.`createDate`, bm.`lastOpDate`
        FROM `BareMetal2ChassisPciDeviceVO` bm
        WHERE bm.`uuid` IN (SELECT `uuid` FROM `BareMetal2ChassisGpuDeviceVO`)
        AND NOT EXISTS (
            SELECT 1 FROM `PciDeviceVO` p WHERE p.`uuid` = bm.`uuid`
        );

        -- Step 3: Migrate GPU extension data to GpuDeviceVO
        INSERT INTO `GpuDeviceVO` (
            `uuid`, `serialNumber`, `memory`, `power`, `isDriverLoaded`,
            `scope`, `chassisUuid`
        )
        SELECT
            bg.`uuid`, bg.`serialNumber`, bg.`memory`, bg.`power`, bg.`isDriverLoaded`,
            'BARE_METAL', bm.`chassisUuid`
        FROM `BareMetal2ChassisGpuDeviceVO` bg
        JOIN `BareMetal2ChassisPciDeviceVO` bm ON bg.`uuid` = bm.`uuid`
        WHERE NOT EXISTS (
            SELECT 1 FROM `GpuDeviceVO` g WHERE g.`uuid` = bg.`uuid`
        );

        -- Step 4: Mark old table as migrated (preserve for 1 version)
        -- `ADD COLUMN IF NOT EXISTS` is a MariaDB-only extension that MySQL rejects,
        -- so the column is guarded through information_schema, as V6.0.0.1 does.
        IF NOT EXISTS (SELECT 1 FROM information_schema.COLUMNS
                       WHERE TABLE_SCHEMA = DATABASE()
                         AND TABLE_NAME = 'BareMetal2ChassisPciDeviceVO'
                         AND COLUMN_NAME = '_migrated') THEN
            ALTER TABLE `BareMetal2ChassisPciDeviceVO` ADD COLUMN `_migrated` TINYINT(1) DEFAULT 0;
        END IF;

        UPDATE `BareMetal2ChassisPciDeviceVO` SET `_migrated` = 1
            WHERE `uuid` IN (SELECT `uuid` FROM `BareMetal2ChassisGpuDeviceVO`);

    END IF;
END$$
DELIMITER ;

CALL bm2_gpu_migrate();
DROP PROCEDURE IF EXISTS bm2_gpu_migrate;
-- V6.0.0.3: Convert MdevDeviceVO from standalone to PciDeviceVO subtable
--
-- The replacement table is built and filled under a temporary name and only then
-- swapped in with a single multi-table RENAME. If anything fails before the swap,
-- the original MdevDeviceVO is still in place and a re-run starts over cleanly,
-- instead of finding no MdevDeviceVO and silently skipping the whole migration.

DELIMITER $$
DROP PROCEDURE IF EXISTS migrate_mdev_subtable$$
CREATE PROCEDURE migrate_mdev_subtable()
BEGIN
    -- Only proceed if old MdevDeviceVO exists and is NOT yet a subtable of PciDeviceVO
    IF EXISTS (SELECT 1 FROM information_schema.TABLES
               WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 'MdevDeviceVO')
       AND NOT EXISTS (SELECT 1 FROM information_schema.TABLE_CONSTRAINTS
                       WHERE TABLE_SCHEMA = DATABASE()
                         AND TABLE_NAME = 'MdevDeviceVO'
                         AND CONSTRAINT_NAME = 'fkMdevDeviceVOPciDeviceVO') THEN

        -- Step 1: Migrate MdevDeviceVO shared fields into PciDeviceVO.
        -- Vendor/device ids are taken from the parent PCI device; they stay NULL
        -- when the parent row is missing (LEFT JOIN).
        INSERT INTO `PciDeviceVO` (
            `uuid`, `name`, `description`,
            `hostUuid`, `parentUuid`, `vmInstanceUuid`,
            `type`, `state`, `status`, `virtStatus`, `chooser`,
            `vendor`,
            `vendorId`, `deviceId`, `subvendorId`, `subdeviceId`,
            `pciDeviceAddress`,
            `createDate`, `lastOpDate`
        )
        SELECT
            m.`uuid`, m.`name`, m.`description`,
            m.`hostUuid`, m.`parentUuid`, m.`vmInstanceUuid`,
            m.`type`,
            m.`state`,
            m.`status`,
            'VFIO_MDEV_VIRTUAL',
            m.`chooser`,
            m.`vendor`,
            p.`vendorId`, p.`deviceId`, p.`subvendorId`, p.`subdeviceId`,
            m.`mdevDeviceAddress`,
            m.`createDate`, m.`lastOpDate`
        FROM `MdevDeviceVO` m
        LEFT JOIN `PciDeviceVO` p ON m.`parentUuid` = p.`uuid`
        WHERE NOT EXISTS (
            SELECT 1 FROM `PciDeviceVO` pci WHERE pci.`uuid` = m.`uuid`
        );

        -- Step 2: Create the new subtable under a temporary name.
        -- A leftover from an earlier failed run is discarded first.
        DROP TABLE IF EXISTS `MdevDeviceVO_new`;
        CREATE TABLE `MdevDeviceVO_new` (
            `uuid` VARCHAR(32) NOT NULL UNIQUE,
            `mdevSpecUuid` VARCHAR(32) DEFAULT NULL,
            `mttyUuid` VARCHAR(32) DEFAULT NULL,
            `mdevDeviceAddress` VARCHAR(128) DEFAULT NULL,
            PRIMARY KEY (`uuid`),
            CONSTRAINT `fkMdevDeviceVOPciDeviceVO`
                FOREIGN KEY (`uuid`) REFERENCES `PciDeviceVO`(`uuid`) ON DELETE CASCADE,
            CONSTRAINT `fkMdevDeviceVOMdevSpecVO`
                FOREIGN KEY (`mdevSpecUuid`) REFERENCES `MdevDeviceSpecVO`(`uuid`) ON DELETE SET NULL,
            CONSTRAINT `fkMdevDeviceVOMttyDeviceVO`
                FOREIGN KEY (`mttyUuid`) REFERENCES `MttyDeviceVO`(`uuid`) ON DELETE CASCADE
        ) ENGINE=InnoDB;

        -- Step 3: Migrate mdev-specific data to the new subtable
        -- NOTE(review): INSERT IGNORE downgrades per-row errors to warnings, so rows
        -- rejected here would be left out without failing the upgrade; confirm intended.
        INSERT IGNORE INTO `MdevDeviceVO_new` (`uuid`, `mdevSpecUuid`, `mttyUuid`, `mdevDeviceAddress`)
        SELECT `uuid`, `mdevSpecUuid`, `mttyUuid`, `mdevDeviceAddress`
        FROM `MdevDeviceVO`;

        -- Step 4: Swap old and new tables in one atomic statement.
        -- The explicitly named constraints travel with the table, so after the swap
        -- the guard at the top sees fkMdevDeviceVOPciDeviceVO on MdevDeviceVO.
        RENAME TABLE `MdevDeviceVO` TO `MdevDeviceVO_old`,
                     `MdevDeviceVO_new` TO `MdevDeviceVO`;

    END IF;

    -- Only run AccountResourceRefVO migration if MdevDeviceVO table exists
    IF EXISTS (SELECT 1 FROM information_schema.TABLES
               WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 'MdevDeviceVO') THEN

        -- Step 5: Migrate AccountResourceRefVO references (safe to re-run)
        INSERT IGNORE INTO `AccountResourceRefVO` (
            `uuid`, `accountUuid`, `ownerAccountUuid`, `resourceUuid`,
            `resourceType`, `concreteResourceType`, `createDate`, `lastOpDate`
        )
        SELECT
            REPLACE(UUID(), '-', ''),
            ar.`accountUuid`, ar.`ownerAccountUuid`,
            ar.`resourceUuid`,
            'PciDeviceVO',
            'org.zstack.pciDevice.PciDeviceVO',
            NOW(), NOW()
        FROM `AccountResourceRefVO` ar
        WHERE ar.`resourceType` = 'MdevDeviceVO'
          AND ar.`resourceUuid` IN (SELECT `uuid` FROM `MdevDeviceVO`)
          AND NOT EXISTS (
              SELECT 1 FROM `AccountResourceRefVO` a2
              WHERE a2.`resourceUuid` = ar.`resourceUuid` AND a2.`resourceType` = 'PciDeviceVO'
          );

        -- Step 6: Remove old MdevDeviceVO AccountResourceRefVO entries
        DELETE FROM `AccountResourceRefVO`
        WHERE `resourceType` = 'MdevDeviceVO'
          AND `resourceUuid` IN (SELECT `uuid` FROM `MdevDeviceVO`);

    END IF;

    -- MdevDeviceVO_old preserved for rollback verification (drop in next version)
END$$
DELIMITER ;

CALL migrate_mdev_subtable();
DROP PROCEDURE IF EXISTS migrate_mdev_subtable;
a/core/src/main/java/org/zstack/core/thread/AsyncTimer.java b/core/src/main/java/org/zstack/core/thread/AsyncTimer.java index 3f877ab061..54fe25b2ea 100755 --- a/core/src/main/java/org/zstack/core/thread/AsyncTimer.java +++ b/core/src/main/java/org/zstack/core/thread/AsyncTimer.java @@ -69,6 +69,17 @@ public void startRightNow() { } } + public void startWithDelay(long delay, TimeUnit delayUnit) { + if (cancelled.get()) { + throw new CloudRuntimeException("cannot start a cancelled timer"); + } + + cancel = thdf.submitTimeoutTask(this, delayUnit, delay); + if (logger.isTraceEnabled()) { + logger.trace(String.format("%s starts with delay %d %s", getName(), delay, delayUnit)); + } + } + public void cancel() { cancelled.set(true); if (cancel != null) { diff --git a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java index a245757517..bf01e6234e 100755 --- a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java +++ b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java @@ -169,6 +169,9 @@ public class KVMHost extends HostBase implements Host { private KVMHostContext context; + // Per-host ping latency EMA for adaptive timeout (ZSTAC-67534) + private final PingLatencyEma pingLatencyEma = new PingLatencyEma(0.3, 3); + // ///////////////////// REST URL ////////////////////////// private String baseUrl; private String connectPath; @@ -5011,6 +5014,11 @@ public void run(FlowTrigger trigger, Map data) { cmd.hostUuid = self.getUuid(); cmd.kvmagentPhysicalMemoryUsageAlarmThreshold = gcf.getConfigValue(KVMGlobalConfig.CATEGORY, KVMGlobalConfig.KVMAGENT_PHYSICAL_MEMORY_USAGE_ALARM_THRESHOLD.getName(), Long.class); cmd.kvmagentPhysicalMemoryUsageHardLimit = gcf.getConfigValue(KVMGlobalConfig.CATEGORY, KVMGlobalConfig.KVMAGENT_PHYSICAL_MEMORY_USAGE_HARD_LIMIT.getName(), Long.class); + + long globalTimeout = HostGlobalConfig.PING_HOST_TIMEOUT.value(Long.class); + long adaptiveTimeout = 
/**
 * SDK action for {@code GET /gpu-device/metrics/batch}.
 *
 * <p>Callers fill in the public parameter fields and then invoke {@link #call()}
 * (blocking) or {@link #call(Completion)} (callback); both wrap the raw
 * {@code ApiResult} into a {@link Result}.
 *
 * NOTE(review): the collection fields and maps appear as raw types in this view;
 * their element types are not visible here.
 */
public class GetGpuMetricsBatchAction extends AbstractAction {

    // Returned as-is by getParameterMap(); nothing in this class populates it.
    private static final HashMap parameterMap = new HashMap<>();

    // Returned as-is by getNonAPIParameterMap(); nothing in this class populates it.
    private static final HashMap nonAPIParameterMap = new HashMap<>();

    /** Outcome of the call: exactly one of {@code error} or {@code value} is set by makeResult. */
    public static class Result {
        public ErrorCode error;
        public org.zstack.sdk.GetGpuMetricsBatchResult value;

        /**
         * Throws an {@code ApiException} describing {@code error} when one is present.
         *
         * @return this result, to allow chaining, when there is no error
         */
        public Result throwExceptionIfError() {
            if (error != null) {
                throw new ApiException(
                    String.format("error[code: %s, description: %s, details: %s, globalErrorCode: %s]", error.code, error.description, error.details, error.globalErrorCode)
                );
            }

            return this;
        }
    }

    // The only required parameter; it must also be non-empty.
    @Param(required = true, nonempty = true, nullElements = false, emptyString = true, noTrim = false)
    public java.util.List gpuDeviceUuids;

    @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false)
    public java.util.List metricNames;

    // NOTE(review): the time base and unit of startTime/endTime/period are not visible here.
    @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false)
    public java.lang.Long startTime;

    @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false)
    public java.lang.Long endTime;

    @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false)
    public java.lang.Integer period;

    @Param(required = false)
    public java.util.List systemTags;

    @Param(required = false)
    public java.util.List userTags;

    @Param(required = false)
    public String sessionId;

    @Param(required = false)
    public String accessKeyId;

    @Param(required = false)
    public String accessKeySecret;

    @Param(required = false)
    public String requestIp;


    /**
     * Converts a raw API result: an error is passed through untouched; otherwise the
     * payload is decoded, and a null payload is replaced by an empty result object so
     * that {@code value} is never null on success.
     */
    private Result makeResult(ApiResult res) {
        Result ret = new Result();
        if (res.error != null) {
            ret.error = res.error;
            return ret;
        }

        org.zstack.sdk.GetGpuMetricsBatchResult value = res.getResult(org.zstack.sdk.GetGpuMetricsBatchResult.class);
        ret.value = value == null ? new org.zstack.sdk.GetGpuMetricsBatchResult() : value;

        return ret;
    }

    /** Executes the action through {@code ZSClient.call(this)} and returns the wrapped result. */
    public Result call() {
        ApiResult res = ZSClient.call(this);
        return makeResult(res);
    }

    /** Executes the action through {@code ZSClient} and hands the wrapped result to {@code completion}. */
    public void call(final Completion completion) {
        ZSClient.call(this, new InternalCompletion() {
            @Override
            public void complete(ApiResult res) {
                completion.complete(makeResult(res));
            }
        });
    }

    protected Map getParameterMap() {
        return parameterMap;
    }

    protected Map getNonAPIParameterMap() {
        return nonAPIParameterMap;
    }

    /** Describes the REST endpoint: a session-authenticated GET that needs no polling. */
    protected RestInfo getRestInfo() {
        RestInfo info = new RestInfo();
        info.httpMethod = "GET";
        info.path = "/gpu-device/metrics/batch";
        info.needSession = true;
        info.needPoll = false;
        info.parameterName = "";
        return info;
    }

}
org.zstack.sdk.GetGpuResourceTreeResult value; + + public Result throwExceptionIfError() { + if (error != null) { + throw new ApiException( + String.format("error[code: %s, description: %s, details: %s, globalErrorCode: %s]", error.code, error.description, error.details, error.globalErrorCode) + ); + } + + return this; + } + } + + @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false) + public java.util.List zoneUuids; + + @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false) + public java.util.List clusterUuids; + + @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false) + public java.util.List hostUuids; + + @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false) + public java.lang.String scope; + + @Param(required = false, nonempty = false, nullElements = false, emptyString = true, noTrim = false) + public java.lang.String allocateStatus; + + @Param(required = false) + public java.util.List systemTags; + + @Param(required = false) + public java.util.List userTags; + + @Param(required = false) + public String sessionId; + + @Param(required = false) + public String accessKeyId; + + @Param(required = false) + public String accessKeySecret; + + @Param(required = false) + public String requestIp; + + + private Result makeResult(ApiResult res) { + Result ret = new Result(); + if (res.error != null) { + ret.error = res.error; + return ret; + } + + org.zstack.sdk.GetGpuResourceTreeResult value = res.getResult(org.zstack.sdk.GetGpuResourceTreeResult.class); + ret.value = value == null ? 
/**
 * Result payload of the GPU resource tree query: the tree nodes plus three
 * GPU counters. A plain mutable holder with public fields and matching accessors.
 */
public class GetGpuResourceTreeResult {
    // State first, accessors below.
    public java.util.List tree;
    public int totalGpuCount;
    public int allocatedGpuCount;
    public int unallocatedGpuCount;

    /** @return the tree nodes, or {@code null} if none were set */
    public java.util.List getTree() {
        return tree;
    }

    public void setTree(java.util.List tree) {
        this.tree = tree;
    }

    public int getTotalGpuCount() {
        return totalGpuCount;
    }

    public void setTotalGpuCount(int totalGpuCount) {
        this.totalGpuCount = totalGpuCount;
    }

    public int getAllocatedGpuCount() {
        return allocatedGpuCount;
    }

    public void setAllocatedGpuCount(int allocatedGpuCount) {
        this.allocatedGpuCount = allocatedGpuCount;
    }

    public int getUnallocatedGpuCount() {
        return unallocatedGpuCount;
    }

    public void setUnallocatedGpuCount(int unallocatedGpuCount) {
        this.unallocatedGpuCount = unallocatedGpuCount;
    }

}
/**
 * One GPU's metrics record: identifying fields, a set of numeric readings, a
 * status string and a map of additional metrics. A plain mutable holder with
 * public fields and matching accessors; every field is {@code null} until set.
 */
public class GpuMetricsEntry {
    // Identity of the device and where it is attached.
    public String gpuDeviceUuid;
    public String hostUuid;
    public String vmInstanceUuid;
    public String pciDeviceAddress;

    // Numeric readings. NOTE(review): units are not visible in this class.
    public Double utilization;
    public Double memoryUtilization;
    public Double temperature;
    public Double powerDraw;
    public Double fanSpeed;

    public String gpuStatus;
    public java.util.Map extraMetrics;

    public String getGpuDeviceUuid() {
        return gpuDeviceUuid;
    }

    public void setGpuDeviceUuid(String gpuDeviceUuid) {
        this.gpuDeviceUuid = gpuDeviceUuid;
    }

    public String getHostUuid() {
        return hostUuid;
    }

    public void setHostUuid(String hostUuid) {
        this.hostUuid = hostUuid;
    }

    public String getVmInstanceUuid() {
        return vmInstanceUuid;
    }

    public void setVmInstanceUuid(String vmInstanceUuid) {
        this.vmInstanceUuid = vmInstanceUuid;
    }

    public String getPciDeviceAddress() {
        return pciDeviceAddress;
    }

    public void setPciDeviceAddress(String pciDeviceAddress) {
        this.pciDeviceAddress = pciDeviceAddress;
    }

    public Double getUtilization() {
        return utilization;
    }

    public void setUtilization(Double utilization) {
        this.utilization = utilization;
    }

    public Double getMemoryUtilization() {
        return memoryUtilization;
    }

    public void setMemoryUtilization(Double memoryUtilization) {
        this.memoryUtilization = memoryUtilization;
    }

    public Double getTemperature() {
        return temperature;
    }

    public void setTemperature(Double temperature) {
        this.temperature = temperature;
    }

    public Double getPowerDraw() {
        return powerDraw;
    }

    public void setPowerDraw(Double powerDraw) {
        this.powerDraw = powerDraw;
    }

    public Double getFanSpeed() {
        return fanSpeed;
    }

    public void setFanSpeed(Double fanSpeed) {
        this.fanSpeed = fanSpeed;
    }

    public String getGpuStatus() {
        return gpuStatus;
    }

    public void setGpuStatus(String gpuStatus) {
        this.gpuStatus = gpuStatus;
    }

    public java.util.Map getExtraMetrics() {
        return extraMetrics;
    }

    public void setExtraMetrics(java.util.Map extraMetrics) {
        this.extraMetrics = extraMetrics;
    }

}
this.allocatedGpuCount; + } + + public int unallocatedGpuCount; + public void setUnallocatedGpuCount(int unallocatedGpuCount) { + this.unallocatedGpuCount = unallocatedGpuCount; + } + public int getUnallocatedGpuCount() { + return this.unallocatedGpuCount; + } + + public org.zstack.sdk.GpuDeviceInventory gpu; + public void setGpu(org.zstack.sdk.GpuDeviceInventory gpu) { + this.gpu = gpu; + } + public org.zstack.sdk.GpuDeviceInventory getGpu() { + return this.gpu; + } + + public int mdevChildrenCount; + public void setMdevChildrenCount(int mdevChildrenCount) { + this.mdevChildrenCount = mdevChildrenCount; + } + public int getMdevChildrenCount() { + return this.mdevChildrenCount; + } + + public java.util.List children; + public void setChildren(java.util.List children) { + this.children = children; + } + public java.util.List getChildren() { + return this.children; + } + +}