-
Notifications
You must be signed in to change notification settings - Fork 0
<refactor>[pciDevice]: Flyway migrations for GPU scope, BM2 phantom host, MdevDeviceVO subtable #3394
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: 5.5.12
Are you sure you want to change the base?
<refactor>[pciDevice]: Flyway migrations for GPU scope, BM2 phantom host, MdevDeviceVO subtable #3394
Changes from all commits
b4125ee
bd225b0
a9a0ab8
e657076
1ffc48c
db299ea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
| package org.zstack.compute.host; | ||
|
|
||
/**
 * Exponential Moving Average (EMA) tracker for per-host ping latency.
 * Used to compute adaptive ping timeouts (ZSTAC-67534).
 *
 * <p>Thread-safety: {@link #update(long)} is synchronized so that two concurrent samples
 * cannot overwrite each other's read-modify-write; the average itself is kept in a volatile
 * field so readers always observe the most recently published value without locking.
 */
public class PingLatencyEma {
    /** Value held by {@code emaMs} until the first valid sample has been recorded. */
    private static final double NO_SAMPLE = -1;
    /** The adaptive timeout never exceeds this multiple of the global timeout. */
    private static final long MAX_TIMEOUT_FACTOR = 3;
    private static final double MILLIS_PER_SECOND = 1000.0;

    private volatile double emaMs = NO_SAMPLE;
    private final double alpha;
    private final int timeoutMultiplier;

    /**
     * Creates a tracker with no samples.
     *
     * @param alpha smoothing factor: the weight given to the newest sample, in (0, 1]
     * @param timeoutMultiplier factor applied to the average latency when deriving a timeout;
     *                          must be greater than 0
     * @throws IllegalArgumentException if {@code alpha} is NaN or outside (0, 1], or if
     *                                  {@code timeoutMultiplier} is not positive
     */
    public PingLatencyEma(double alpha, int timeoutMultiplier) {
        // The negated form also rejects NaN, which fails every ordered comparison.
        if (!(alpha > 0 && alpha <= 1)) {
            throw new IllegalArgumentException("alpha must be in (0, 1], got " + alpha);
        }
        if (timeoutMultiplier <= 0) {
            throw new IllegalArgumentException(
                    "timeoutMultiplier must be greater than 0, got " + timeoutMultiplier);
        }
        this.alpha = alpha;
        this.timeoutMultiplier = timeoutMultiplier;
    }

    /**
     * Folds one latency sample into the average. The first valid sample seeds the average
     * directly; later samples are blended in with weight {@code alpha}.
     *
     * <p>Negative samples are ignored: they cannot be a real latency, and accepting them could
     * drag the average below zero, where it would be mistaken for "no samples yet".
     *
     * @param latencyMs measured ping latency in milliseconds
     */
    public synchronized void update(long latencyMs) {
        if (latencyMs < 0) {
            return;
        }
        double current = emaMs;
        emaMs = current < 0 ? latencyMs : alpha * latencyMs + (1 - alpha) * current;
    }

    /**
     * Derives a ping timeout from the current average latency.
     *
     * <p>The result is {@code emaMs * timeoutMultiplier} converted to whole seconds plus one,
     * clamped to the range [{@code globalTimeoutSeconds}, 3 * {@code globalTimeoutSeconds}].
     * Both intermediate values saturate at {@link Long#MAX_VALUE} instead of overflowing.
     *
     * @param globalTimeoutSeconds the configured ping timeout in seconds
     * @return the adaptive timeout in seconds; {@code globalTimeoutSeconds} unchanged when the
     *         average is not positive (including when no sample has been recorded) or when
     *         {@code globalTimeoutSeconds} itself is not positive
     */
    public long computeAdaptiveTimeout(long globalTimeoutSeconds) {
        // Read the volatile field once so the check and the computation use the same value.
        double current = emaMs;
        if (current <= 0 || globalTimeoutSeconds <= 0) {
            return globalTimeoutSeconds;
        }

        double scaledSeconds = current * timeoutMultiplier / MILLIS_PER_SECOND;
        // A double >= 2^63 casts to Long.MAX_VALUE, and adding 1 to that would wrap negative.
        long emaBasedTimeout = scaledSeconds >= Long.MAX_VALUE
                ? Long.MAX_VALUE
                : (long) scaledSeconds + 1;

        long upperBound = globalTimeoutSeconds > Long.MAX_VALUE / MAX_TIMEOUT_FACTOR
                ? Long.MAX_VALUE
                : globalTimeoutSeconds * MAX_TIMEOUT_FACTOR;

        return Math.min(Math.max(globalTimeoutSeconds, emaBasedTimeout), upperBound);
    }

    /**
     * @return the current average latency in milliseconds, or -1 if no sample has been recorded
     */
    public double getEmaMs() {
        return emaMs;
    }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,76 @@ | ||
| package org.zstack.compute.host; | ||
|
|
||
| import org.junit.Assert; | ||
| import org.junit.Test; | ||
|
|
||
| import java.util.concurrent.ThreadLocalRandom; | ||
|
|
||
| /** | ||
| * ZSTAC-61971: Verify jitter produces uniform distribution, not thundering herd. | ||
| */ | ||
| public class HostTrackJitterTest { | ||
|
|
||
| /** | ||
| * AC: Jitter 测试 — 重连时间戳呈均匀分布 (非全部 t=0) | ||
| * Simulates the jitter logic from HostTrackImpl.trackHost() for 3000 hosts. | ||
|
Comment on lines
+14
to
+15
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 测试注释包含中文,违反仓库统一英文规范。 Line 14 和 Line 56 的注释建议改为英文,避免后续静态检查或规范审查失败。 建议修复- * AC: Jitter 测试 — 重连时间戳呈均匀分布 (非全部 t=0)
+ * AC: Jitter test — reconnection timestamps should be uniformly distributed (not all at t=0).
@@
- * AC: Not all hosts start at t=0.
+ * AC: Not all hosts start at t=0.Also applies to: 56-57 🤖 Prompt for AI Agents |
||
| */ | ||
| @Test | ||
| public void testJitterDistribution() { | ||
| long pingIntervalSeconds = 60; | ||
| long intervalMs = pingIntervalSeconds * 1000; | ||
| int hostCount = 3000; | ||
| int bucketCount = 10; | ||
| int[] buckets = new int[bucketCount]; | ||
|
|
||
| for (int i = 0; i < hostCount; i++) { | ||
| long jitterMs = ThreadLocalRandom.current().nextLong(intervalMs); | ||
| int bucket = (int) (jitterMs * bucketCount / intervalMs); | ||
| buckets[bucket]++; | ||
| } | ||
|
|
||
| // Each bucket should have ~300 hosts (3000/10). | ||
| // With uniform distribution, no bucket should be < 200 or > 400. | ||
| int expectedPerBucket = hostCount / bucketCount; | ||
| for (int i = 0; i < bucketCount; i++) { | ||
| Assert.assertTrue( | ||
| String.format("bucket %d has %d hosts, expected ~%d (uniform distribution)", | ||
| i, buckets[i], expectedPerBucket), | ||
| buckets[i] > expectedPerBucket / 2 && buckets[i] < expectedPerBucket * 2); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Verify jitter range is [0, intervalMs) — never negative, never >= intervalMs. | ||
| */ | ||
| @Test | ||
| public void testJitterRange() { | ||
| long intervalMs = 60000; | ||
| for (int i = 0; i < 10000; i++) { | ||
| long jitter = ThreadLocalRandom.current().nextLong(intervalMs); | ||
| Assert.assertTrue("jitter should be >= 0", jitter >= 0); | ||
| Assert.assertTrue("jitter should be < intervalMs", jitter < intervalMs); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * AC: Not all hosts start at t=0. | ||
| * At least 90% of hosts should have jitter > 0. | ||
| */ | ||
| @Test | ||
| public void testNotAllStartAtZero() { | ||
| long intervalMs = 60000; | ||
| int hostCount = 3000; | ||
| int zeroCount = 0; | ||
| for (int i = 0; i < hostCount; i++) { | ||
| long jitter = ThreadLocalRandom.current().nextLong(intervalMs); | ||
| if (jitter == 0) { | ||
| zeroCount++; | ||
| } | ||
| } | ||
| // With 60000ms range, probability of jitter=0 is 1/60000. | ||
| // For 3000 hosts, expected zeros ≈ 0.05. Definitely < 10% of hosts. | ||
| Assert.assertTrue( | ||
| String.format("too many hosts at t=0: %d/%d", zeroCount, hostCount), | ||
| zeroCount < hostCount / 10); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,88 @@ | ||
| package org.zstack.compute.host; | ||
|
|
||
| import org.junit.Assert; | ||
| import org.junit.Test; | ||
|
|
||
| public class PingLatencyEmaTest { | ||
|
|
||
| /** | ||
| * AC: 合成延迟序列 [100, 200, 150, 300, 250]ms → EMA 产出正确自适应超时 | ||
| */ | ||
|
Comment on lines
+8
to
+10
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 请将中文注释改为英文注释。 Line 9 的 Javadoc 含中文,违反仓库统一规范,建议改为准确英文描述。 建议修改- /**
- * AC: 合成延迟序列 [100, 200, 150, 300, 250]ms → EMA 产出正确自适应超时
- */
+ /**
+ * AC: Given synthetic latency sequence [100, 200, 150, 300, 250] ms,
+ * verify EMA values and adaptive timeout calculations are correct.
+ */As per coding guidelines “代码里不应当有中文,包括报错、注释等都应当使用正确的、无拼写错误的英文来写”。 🤖 Prompt for AI Agents |
||
| @Test | ||
| public void testEmaWithSyntheticLatencySequence() { | ||
| PingLatencyEma ema = new PingLatencyEma(0.3, 3); | ||
|
|
||
| // First sample: EMA = 100 | ||
| ema.update(100); | ||
| Assert.assertEquals(100.0, ema.getEmaMs(), 0.01); | ||
|
|
||
| // Second: EMA = 0.3*200 + 0.7*100 = 60 + 70 = 130 | ||
| ema.update(200); | ||
| Assert.assertEquals(130.0, ema.getEmaMs(), 0.01); | ||
|
|
||
| // Third: EMA = 0.3*150 + 0.7*130 = 45 + 91 = 136 | ||
| ema.update(150); | ||
| Assert.assertEquals(136.0, ema.getEmaMs(), 0.01); | ||
|
|
||
| // Fourth: EMA = 0.3*300 + 0.7*136 = 90 + 95.2 = 185.2 | ||
| ema.update(300); | ||
| Assert.assertEquals(185.2, ema.getEmaMs(), 0.01); | ||
|
|
||
| // Fifth: EMA = 0.3*250 + 0.7*185.2 = 75 + 129.64 = 204.64 | ||
| ema.update(250); | ||
| Assert.assertEquals(204.64, ema.getEmaMs(), 0.01); | ||
| } | ||
|
|
||
| @Test | ||
| public void testAdaptiveTimeoutUsesGlobalWhenNoSamples() { | ||
| PingLatencyEma ema = new PingLatencyEma(0.3, 3); | ||
| // No samples yet → return global timeout | ||
| Assert.assertEquals(30, ema.computeAdaptiveTimeout(30)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testAdaptiveTimeoutNeverBelowGlobal() { | ||
| PingLatencyEma ema = new PingLatencyEma(0.3, 3); | ||
| // Low latency: EMA=50ms → ema*3/1000+1 = 1s, but global=30s wins | ||
| ema.update(50); | ||
| Assert.assertEquals(30, ema.computeAdaptiveTimeout(30)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testAdaptiveTimeoutIncreasesForHighLatency() { | ||
| PingLatencyEma ema = new PingLatencyEma(0.3, 3); | ||
| // High latency: EMA=15000ms → ema*3/1000+1 = 46s > global 30s | ||
| ema.update(15000); | ||
| long timeout = ema.computeAdaptiveTimeout(30); | ||
| Assert.assertEquals(46, timeout); | ||
| } | ||
|
|
||
| @Test | ||
| public void testAdaptiveTimeoutCappedAt3xGlobal() { | ||
| PingLatencyEma ema = new PingLatencyEma(0.3, 3); | ||
| // Very high latency: EMA=100000ms → ema*3/1000+1 = 301s, cap at 3*30=90s | ||
| ema.update(100000); | ||
| long timeout = ema.computeAdaptiveTimeout(30); | ||
| Assert.assertEquals(90, timeout); | ||
| } | ||
|
|
||
| @Test | ||
| public void testEmaConvergesAfterLatencyDrop() { | ||
| PingLatencyEma ema = new PingLatencyEma(0.3, 3); | ||
| // Spike then drop | ||
| ema.update(20000); // EMA = 20000 | ||
| ema.update(100); // EMA = 0.3*100 + 0.7*20000 = 14030 | ||
|
|
||
| // EMA still elevated — ema*3/1000+1 = 43s > global 30s | ||
| long timeout = ema.computeAdaptiveTimeout(30); | ||
| Assert.assertEquals(43, timeout); | ||
| Assert.assertTrue("timeout should be > global after spike", timeout > 30); | ||
|
|
||
| // After many low-latency pings, EMA converges down | ||
| for (int i = 0; i < 30; i++) { | ||
| ema.update(100); | ||
| } | ||
| // EMA should be close to 100ms now → timeout = max(30, 1) = 30 | ||
| Assert.assertEquals(30, ema.computeAdaptiveTimeout(30)); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| -- V6.0.0.1: Add scope and chassisUuid to GpuDeviceVO for unified GPU management | ||
|
|
||
DELIMITER $$
DROP PROCEDURE IF EXISTS add_gpu_scope_columns$$
CREATE PROCEDURE add_gpu_scope_columns()
BEGIN
    -- Add GpuDeviceVO.scope (defaults to 'VM') unless the column is already present.
    IF (SELECT COUNT(*) FROM information_schema.COLUMNS
         WHERE TABLE_SCHEMA = DATABASE()
           AND TABLE_NAME = 'GpuDeviceVO'
           AND COLUMN_NAME = 'scope') = 0 THEN
        ALTER TABLE `GpuDeviceVO` ADD COLUMN `scope` VARCHAR(32) NOT NULL DEFAULT 'VM';
    END IF;

    -- Add the nullable GpuDeviceVO.chassisUuid unless the column is already present.
    IF (SELECT COUNT(*) FROM information_schema.COLUMNS
         WHERE TABLE_SCHEMA = DATABASE()
           AND TABLE_NAME = 'GpuDeviceVO'
           AND COLUMN_NAME = 'chassisUuid') = 0 THEN
        ALTER TABLE `GpuDeviceVO` ADD COLUMN `chassisUuid` VARCHAR(32) DEFAULT NULL;
    END IF;

    -- GPUs whose PCI record is HAMI-virtualized belong to the CONTAINER scope.
    UPDATE `GpuDeviceVO` g
     INNER JOIN `PciDeviceVO` p ON p.`uuid` = g.`uuid`
       SET g.`scope` = 'CONTAINER'
     WHERE p.`virtStatus` = 'HAMI_VIRTUALIZED';

    -- Index that serves scope-based queries; created only once.
    IF (SELECT COUNT(*) FROM information_schema.STATISTICS
         WHERE TABLE_SCHEMA = DATABASE()
           AND TABLE_NAME = 'GpuDeviceVO'
           AND INDEX_NAME = 'idxGpuDeviceVOScope') = 0 THEN
        CREATE INDEX `idxGpuDeviceVOScope` ON `GpuDeviceVO` (`scope`);
    END IF;
END$$
DELIMITER ;

CALL add_gpu_scope_columns();
DROP PROCEDURE IF EXISTS add_gpu_scope_columns;
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,88 @@ | ||
| -- V6.0.0.2: Migrate BareMetal2 GPU devices to unified GpuDeviceVO via phantom hosts | ||
| -- Conditional: only runs if BareMetal2 tables exist (skipped in non-BM2 deployments) | ||
|
|
||
DELIMITER $$
DROP PROCEDURE IF EXISTS bm2_gpu_migrate$$
CREATE PROCEDURE bm2_gpu_migrate()
BEGIN
    -- The whole migration is skipped when the BareMetal2 GPU table does not exist.
    IF (SELECT COUNT(*) FROM information_schema.TABLES
        WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 'BareMetal2ChassisGpuDeviceVO') > 0 THEN

        -- Step 1: Create phantom hosts for each BM2 chassis that has GPU devices.
        -- The phantom host uuid is 'ph-' plus the first 29 characters of the chassis uuid,
        -- which keeps it at 32 characters. The NOT EXISTS guard makes the step re-runnable.
        INSERT INTO `HostEO` (
            `uuid`, `name`, `description`,
            `zoneUuid`, `clusterUuid`,
            `managementIp`, `hypervisorType`, `state`, `status`,
            `createDate`, `lastOpDate`
        )
        SELECT
            CONCAT('ph-', SUBSTRING(c.`uuid`, 1, 29)),
            CONCAT('[BM2] ', c.`name`),
            CONCAT('Phantom host for BareMetal2 chassis ', c.`uuid`),
            c.`zoneUuid`,
            c.`clusterUuid`,
            '',
            'BareMetal2',
            'Enabled',
            'Connected',
            c.`createDate`,
            NOW()
        FROM `BareMetal2ChassisVO` c
        WHERE c.`uuid` IN (
            SELECT DISTINCT bm.`chassisUuid`
            FROM `BareMetal2ChassisPciDeviceVO` bm
            WHERE bm.`uuid` IN (SELECT `uuid` FROM `BareMetal2ChassisGpuDeviceVO`)
        )
        AND NOT EXISTS (
            SELECT 1 FROM `HostEO` h WHERE h.`uuid` = CONCAT('ph-', SUBSTRING(c.`uuid`, 1, 29))
        );

        -- Step 2: Migrate PCI base data to PciDeviceVO, attached to the phantom host of the
        -- owning chassis. Rows already present in PciDeviceVO are left untouched.
        INSERT INTO `PciDeviceVO` (
            `uuid`, `name`, `description`,
            `hostUuid`,
            `type`, `state`, `status`, `virtStatus`,
            `vendorId`, `deviceId`, `subvendorId`, `subdeviceId`,
            `pciDeviceAddress`, `iommuGroup`,
            `vendor`, `device`,
            `createDate`, `lastOpDate`
        )
        SELECT
            bm.`uuid`, bm.`name`, bm.`description`,
            CONCAT('ph-', SUBSTRING(bm.`chassisUuid`, 1, 29)),
            bm.`type`, 'Enabled', 'Active', 'UNVIRTUALIZABLE',
            bm.`vendorId`, bm.`deviceId`, bm.`subvendorId`, bm.`subdeviceId`,
            bm.`pciDeviceAddress`, bm.`iommuGroup`,
            bm.`vendor`, bm.`device`,
            bm.`createDate`, bm.`lastOpDate`
        FROM `BareMetal2ChassisPciDeviceVO` bm
        WHERE bm.`uuid` IN (SELECT `uuid` FROM `BareMetal2ChassisGpuDeviceVO`)
        AND NOT EXISTS (
            SELECT 1 FROM `PciDeviceVO` p WHERE p.`uuid` = bm.`uuid`
        );

        -- Step 3: Migrate GPU extension data to GpuDeviceVO with scope BARE_METAL, recording
        -- the originating chassis uuid. Rows already present in GpuDeviceVO are left untouched.
        INSERT INTO `GpuDeviceVO` (
            `uuid`, `serialNumber`, `memory`, `power`, `isDriverLoaded`,
            `scope`, `chassisUuid`
        )
        SELECT
            bg.`uuid`, bg.`serialNumber`, bg.`memory`, bg.`power`, bg.`isDriverLoaded`,
            'BARE_METAL', bm.`chassisUuid`
        FROM `BareMetal2ChassisGpuDeviceVO` bg
        JOIN `BareMetal2ChassisPciDeviceVO` bm ON bg.`uuid` = bm.`uuid`
        WHERE NOT EXISTS (
            SELECT 1 FROM `GpuDeviceVO` g WHERE g.`uuid` = bg.`uuid`
        );

        -- Step 4: Mark old table as migrated (preserve for 1 version).
        -- ADD COLUMN IF NOT EXISTS is a MariaDB extension that MySQL rejects, so the column's
        -- presence is checked through information_schema instead.
        IF NOT EXISTS (SELECT 1 FROM information_schema.COLUMNS
                       WHERE TABLE_SCHEMA = DATABASE()
                         AND TABLE_NAME = 'BareMetal2ChassisPciDeviceVO'
                         AND COLUMN_NAME = '_migrated') THEN
            ALTER TABLE `BareMetal2ChassisPciDeviceVO` ADD COLUMN `_migrated` TINYINT(1) DEFAULT 0;
        END IF;

        UPDATE `BareMetal2ChassisPciDeviceVO` SET `_migrated` = 1
        WHERE `uuid` IN (SELECT `uuid` FROM `BareMetal2ChassisGpuDeviceVO`);

    END IF;
END$$
DELIMITER ;

CALL bm2_gpu_migrate();
DROP PROCEDURE IF EXISTS bm2_gpu_migrate;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
Repository: MatheMatrix/zstack
Length of output: 11166
🏁 Script executed:
Repository: MatheMatrix/zstack
Length of output: 131
🏁 Script executed:
Repository: MatheMatrix/zstack
Length of output: 230
🏁 Script executed:
Repository: MatheMatrix/zstack
Length of output: 4452
🏁 Script executed:
Repository: MatheMatrix/zstack
Length of output: 130
🏁 Script executed:
Repository: MatheMatrix/zstack
Length of output: 39079
🏁 Script executed:
Repository: MatheMatrix/zstack
Length of output: 776
🏁 Script executed:
Repository: MatheMatrix/zstack
Length of output: 10645
🏁 Script executed:
Repository: MatheMatrix/zstack
Length of output: 2596
框架已通过 `@GlobalConfigValidation(numberGreaterThan = 1)` 约束 `PING_HOST_INTERVAL` > 1,但在此代码路径中仍建议增加防御性检查。`PING_HOST_INTERVAL` 的定义已包含 `@GlobalConfigValidation(numberGreaterThan = 1)` 注解(HostGlobalConfig.java 第 26-27 行),框架层面会防止配置为 ≤ 1 的值。不过,若防御层失效或存在绕过机制,当前代码仍会在 `bound <= 0` 时因 `nextLong(bound)` 而抛出 `IllegalArgumentException`。建议保留一层防御性检查以提升代码鲁棒性:
🤖 Prompt for AI Agents