From cb60e15c9e98f09534fd33801d0cd5966dff248b Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 28 May 2026 08:25:42 -0700 Subject: [PATCH 1/5] thermal: intel: Fix dangling resources on thermal_throttle_online() failure The function thermal_throttle_add_dev() may fail and abort a CPU hotplug online operation. Since the failure occurs within the online callback, thermal_throttle_online(), the CPU hotplug framework does not invoke the corresponding offline callback. As a result, the hardware and software resources set up during the failed operation are not torn down. Since only thermal_throttle_add_dev() can fail, call it before setting up the rest of the resources. Fixes: f6656208f04e ("x86/mce/therm_throt: Optimize notifications of thermal throttle") Signed-off-by: Ricardo Neri Link: https://lore.kernel.org/linux-pm/20260528-rneri-directed-therm-intr-v2-1-8e2f9e0c1a36@linux.intel.com/ Signed-off-by: WangYuli --- drivers/thermal/intel/therm_throt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index debc94e2dc169..7c21483c46b91 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -528,8 +528,13 @@ static int thermal_throttle_online(unsigned int cpu) { struct thermal_state *state = &per_cpu(thermal_state, cpu); struct device *dev = get_cpu_device(cpu); + int err; u32 l; + err = thermal_throttle_add_dev(dev, cpu); + if (err) + return err; + state->package_throttle.level = PACKAGE_LEVEL; state->core_throttle.level = CORE_LEVEL; @@ -547,7 +552,7 @@ static int thermal_throttle_online(unsigned int cpu) l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - return thermal_throttle_add_dev(dev, cpu); + return err; } static int thermal_throttle_offline(unsigned int cpu) From d1629c02df14f95e5023bbd26ce8f8846ad8fbcc Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 28 May 2026 08:25:43 -0700 Subject: 
[PATCH 2/5] x86/thermal: Add bit definitions for Intel Directed Package Thermal Interrupt Add CPUID and MSR bit definitions required to support Intel Directed Package Thermal Interrupt. A CPU requests directed package-level thermal interrupts by setting bit 25 in IA32_THERM_INTERRUPT. Hardware acknowledges by setting bit 25 in IA32_PACKAGE_THERM_STATUS, indicating that only CPUs that opted in will receive the interrupt. If no CPU in the package requests it, delivery falls back to broadcast. Signed-off-by: Ricardo Neri [WangYuli: Fix conflicts] Link: https://lore.kernel.org/linux-pm/20260528-rneri-directed-therm-intr-v2-2-8e2f9e0c1a36@linux.intel.com/ Signed-off-by: WangYuli --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/include/asm/msr-index.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 9cf3725257530..f71704783d795 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -381,6 +381,8 @@ #define X86_FEATURE_HWP_HIGHEST_PERF_CHANGE (14*32+15) /* HWP Highest perf change */ #define X86_FEATURE_HFI (14*32+19) /* "hfi" Hardware Feedback Interface */ +#define X86_FEATURE_DPTI (14*32+24) /* Intel Directed Package Thermal Interrupt */ + /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ #define X86_FEATURE_NPT (15*32+ 0) /* "npt" Nested Page Table support */ #define X86_FEATURE_LBRV (15*32+ 1) /* "lbrv" LBR Virtualization support */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d0a0cc8e8bd99..f609d856ef55a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -966,6 +966,7 @@ #define THERM_INT_HIGH_ENABLE (1 << 0) #define THERM_INT_LOW_ENABLE (1 << 1) #define THERM_INT_PLN_ENABLE (1 << 24) +#define THERM_INT_DPTI_ENABLE (1 << 25) #define MSR_IA32_THERM_STATUS 0x0000019c @@ -995,6 +996,7 @@ #define PACKAGE_THERM_STATUS_PROCHOT (1 << 0) #define 
PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10) +#define PACKAGE_THERM_STATUS_DPTI_ACK (1 << 25) #define PACKAGE_THERM_STATUS_HFI_UPDATED (1 << 26) #define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2 From 4471db670c9bf47181b2f3275963ba362541e35f Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 28 May 2026 08:25:44 -0700 Subject: [PATCH 3/5] thermal: intel: Enable the Directed Package-level Thermal Interrupt Package-level thermal interrupts are broadcast to all online CPUs within a package, even though only one CPU needs to service them. This results in unnecessary wakeups, lock contention, and corresponding performance and power-efficiency penalties. When supported by hardware, a CPU requests to receive directed package-level thermal interrupts by setting a designated bit in IA32_THERM_INTERRUPT. The operating system must then verify that hardware has acknowledged this request by checking a designated bit in IA32_PACKAGE_THERM_STATUS. Enable directed package-level thermal interrupts on one CPU per package using the CPU hotplug infrastructure. Keep track of the CPUs handling package-level interrupts with an array. If the handling CPU goes offline, select a new CPU. Temporarily enable directed interrupts on both the current and new CPU until hardware acknowledges the new selection, then disable them on the outgoing CPU. Systems without directed-interrupt support continue to broadcast the package-level interrupt to all CPUs. Also, add a rollback mechanism in the CPU hotplug online callback to fall back to broadcast mode if the directed-interrupt acknowledgment fails in any package. This is most important during boot, when all CPUs in a package come online and would otherwise keep retrying on faulty hardware. A complete rollback is not needed in the CPU hotplug offline callback since at that point the hardware is known to work. While here, update an inline comment to point to the correct volume of the Intel Software Developer's Manual. 
Signed-off-by: Ricardo Neri Link: https://lore.kernel.org/linux-pm/20260528-rneri-directed-therm-intr-v2-3-8e2f9e0c1a36@linux.intel.com/ Signed-off-by: WangYuli --- drivers/thermal/intel/therm_throt.c | 220 +++++++++++++++++++++++++++- 1 file changed, 217 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index 7c21483c46b91..9fd3021aa051f 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -243,16 +244,23 @@ static void thermal_intr_init_pkg_clear_mask(void) * IA32_PACKAGE_THERM_STATUS. */ - /* All bits except BIT 26 depend on CPUID.06H: EAX[6] = 1 */ + /* All bits except BITs 25 and 26 depend on CPUID.06H: EAX[6] = 1 */ if (boot_cpu_has(X86_FEATURE_PTS)) therm_intr_pkg_clear_mask = (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)); /* - * Intel SDM Volume 2A: Thermal and Power Management Leaf + * Intel SDM Volume 1: Thermal and Power Management Leaf * Bit 26: CPUID.06H: EAX[19] = 1 */ if (boot_cpu_has(X86_FEATURE_HFI)) therm_intr_pkg_clear_mask |= BIT(26); + + /* + * Intel SDM Volume 1: Thermal and Power Management Leaf + * Bit 25: CPUID.06H: EAX[24] = 1 + */ + if (boot_cpu_has(X86_FEATURE_DPTI)) + therm_intr_pkg_clear_mask |= BIT(25); } /* @@ -523,6 +531,184 @@ static void thermal_throttle_remove_dev(struct device *dev) sysfs_remove_group(&dev->kobj, &thermal_attr_group); } +static int check_directed_thermal_pkg_intr_ack(void) +{ + unsigned int count = 15000; + u64 msr_val; + + /* + * Hardware acknowledges the directed interrupt setup in 10ms or less. + * Wait 15ms to be safe. 
+ */ + do { + rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); + udelay(1); + } while (!(msr_val & PACKAGE_THERM_STATUS_DPTI_ACK) && --count); + + if (!count) + return -ETIMEDOUT; + + thermal_clear_package_intr_status(PACKAGE_LEVEL, + PACKAGE_THERM_STATUS_DPTI_ACK); + + return 0; +} + +static void config_directed_thermal_pkg_intr(void *info) +{ + bool enable = *((bool *)info); + u64 msr_val; + + rdmsrl(MSR_IA32_THERM_INTERRUPT, msr_val); + + if (enable) + msr_val |= THERM_INT_DPTI_ENABLE; + else + msr_val &= ~THERM_INT_DPTI_ENABLE; + + wrmsrl(MSR_IA32_THERM_INTERRUPT, msr_val); +} + +/* Only accessed from CPU hotplug callbacks. No extra locking needed. */ +static unsigned int *directed_intr_handler_cpus; + +static bool directed_thermal_pkg_intr_supported(void) +{ + if (!boot_cpu_has(X86_FEATURE_DPTI)) + return false; + + if (!directed_intr_handler_cpus) + return false; + + return true; +} + +/* + * Must be called with cpu_hotplug_lock held to prevent CPUs from going offline + * while iterating through packages. Also, interrupts must be enabled to avoid + * deadlocks in SMP function calls. + */ +static void disable_all_directed_thermal_pkg_intr(void) +{ + bool enable = false; + int i; + + if (!directed_thermal_pkg_intr_supported()) + return; + + for (i = 0; i < topology_max_packages(); i++) { + if (directed_intr_handler_cpus[i] == nr_cpu_ids) + continue; + + smp_call_function_single(directed_intr_handler_cpus[i], + config_directed_thermal_pkg_intr, + &enable, true); + } + + kfree(directed_intr_handler_cpus); + directed_intr_handler_cpus = NULL; +} + +static void enable_directed_thermal_pkg_intr(unsigned int cpu) +{ + bool enable = true; + u16 pkg_id; + + if (!directed_thermal_pkg_intr_supported()) + return; + + pkg_id = topology_logical_package_id(cpu); + if (pkg_id >= topology_max_packages()) + return; + + /* Another CPU in this package already handles the directed interrupt. 
*/ + if (directed_intr_handler_cpus[pkg_id] != nr_cpu_ids) + return; + + thermal_clear_package_intr_status(PACKAGE_LEVEL, + PACKAGE_THERM_STATUS_DPTI_ACK); + + config_directed_thermal_pkg_intr(&enable); + if (!check_directed_thermal_pkg_intr_ack()) { + directed_intr_handler_cpus[pkg_id] = cpu; + return; + } + + /* + * A failure indicates faulty hardware. Roll back completely so that + * no other CPU tries. This is especially important during boot as all + * CPUs may come online and would otherwise keep trying. + */ + enable = false; + config_directed_thermal_pkg_intr(&enable); + + disable_all_directed_thermal_pkg_intr(); + + pr_info_once("Failed to direct package thermal interrupts. All CPUs will receive it.\n"); +} + +static void disable_directed_thermal_pkg_intr(unsigned int cpu) +{ + unsigned int new_cpu; + bool enable; + u16 pkg_id; + + if (!directed_thermal_pkg_intr_supported()) + return; + + pkg_id = topology_logical_package_id(cpu); + if (pkg_id >= topology_max_packages()) + return; + + /* Not the CPU handling the directed interrupt. */ + if (directed_intr_handler_cpus[pkg_id] != cpu) + return; + + /* + * The package-level interrupt must remain directed after this CPU goes + * offline. + */ + new_cpu = cpumask_any_but(topology_core_cpumask(cpu), cpu); + if (new_cpu < nr_cpu_ids) { + enable = true; + thermal_clear_package_intr_status(PACKAGE_LEVEL, + PACKAGE_THERM_STATUS_DPTI_ACK); + + /* + * We are here via CPU hotplug. Since we are holding the + * cpu_hotplug_lock, @new_cpu cannot go offline and interrupts + * are enabled, so the SMP function call is safe. + */ + smp_call_function_single(new_cpu, config_directed_thermal_pkg_intr, + &enable, true); + } + + /* + * If hardware does not acknowledge the directed interrupt setup on + * @new_cpu, disable the redirection. Since no other CPU is configured + * to receive the package-level interrupt, all CPUs in the package will + * receive it. 
+ */ + enable = false; + if (new_cpu < nr_cpu_ids && check_directed_thermal_pkg_intr_ack()) { + smp_call_function_single(new_cpu, config_directed_thermal_pkg_intr, + &enable, true); + + pr_warn_once("Failed to redirect package thermal interrupt from CPU%u to CPU%u; reverting to broadcast.\n", + cpu, new_cpu); + + new_cpu = nr_cpu_ids; + } + + /* + * Clear the directed interrupt on @cpu. Hardware acknowledgment can be + * ignored since @cpu is going offline. + */ + config_directed_thermal_pkg_intr(&enable); + + directed_intr_handler_cpus[pkg_id] = (new_cpu < nr_cpu_ids) ? new_cpu : nr_cpu_ids; +} + /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int thermal_throttle_online(unsigned int cpu) { @@ -548,6 +734,8 @@ static int thermal_throttle_online(unsigned int cpu) */ intel_hfi_online(cpu); + enable_directed_thermal_pkg_intr(cpu); + /* Unmask the thermal vector after the above workqueues are initialized. */ l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); @@ -565,6 +753,8 @@ static int thermal_throttle_offline(unsigned int cpu) l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED); + disable_directed_thermal_pkg_intr(cpu); + intel_hfi_offline(cpu); cancel_delayed_work_sync(&state->package_throttle.therm_work); @@ -577,6 +767,23 @@ static int thermal_throttle_offline(unsigned int cpu) return 0; } +static __init void init_directed_pkg_intr(void) +{ + int i; + + if (!boot_cpu_has(X86_FEATURE_DPTI)) + return; + + directed_intr_handler_cpus = kmalloc_array(topology_max_packages(), + sizeof(*directed_intr_handler_cpus), + GFP_KERNEL); + if (!directed_intr_handler_cpus) + return; + + for (i = 0; i < topology_max_packages(); i++) + directed_intr_handler_cpus[i] = nr_cpu_ids; +} + static __init int thermal_throttle_init_device(void) { int ret; @@ -584,12 +791,19 @@ static __init int thermal_throttle_init_device(void) if (!atomic_read(&therm_throt_en)) return 0; + init_directed_pkg_intr(); + 
intel_hfi_init(); ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online", thermal_throttle_online, thermal_throttle_offline); - return ret < 0 ? ret : 0; + if (ret >= 0) + return 0; + + disable_all_directed_thermal_pkg_intr(); + + return ret; } device_initcall(thermal_throttle_init_device); From f66a48516d4af482dde3e2beff4a10d213e06405 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 28 May 2026 08:25:45 -0700 Subject: [PATCH 4/5] thermal: intel: Add syscore callbacks for suspend and resume Directed package-level thermal interrupts are serviced by a single CPU per package. These handler CPUs are selected at boot through the CPU hotplug infrastructure. This mechanism is sufficient to restore the directed interrupt configuration when resuming from suspend for non-boot packages. It also keeps the handler-tracking array updated. For the boot package, CPU0 is chosen during boot because its CPU hotplug online callback runs first. However, this callback is not invoked on resume. The directed package-level interrupt configuration for the boot package is not restored. Add a syscore resume callback to re-enable directed package-level interrupts for this package. Disabling directed interrupts during suspend is required to keep the handler-tracking array in a consistent state for the boot package, allowing the correct configuration to be restored on resume. The resume callback must busy-wait for hardware acknowledgment of the directed interrupt setup. Otherwise, the handler-tracking array could be left in an inconsistent state. This implies running with interrupts disabled for up to 15ms, though in practice it takes less than 1ms. 
Signed-off-by: Ricardo Neri Link: https://lore.kernel.org/linux-pm/20260528-rneri-directed-therm-intr-v2-4-8e2f9e0c1a36@linux.intel.com/ Signed-off-by: WangYuli --- drivers/thermal/intel/therm_throt.c | 41 ++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index 9fd3021aa051f..854e795f01154 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -14,6 +14,7 @@ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. * Inspired by Ross Biro's and Al Borchers' counter code. */ +#include #include #include #include @@ -569,7 +570,10 @@ static void config_directed_thermal_pkg_intr(void *info) wrmsrl(MSR_IA32_THERM_INTERRUPT, msr_val); } -/* Only accessed from CPU hotplug callbacks. No extra locking needed. */ +/* + * Only accessed from CPU hotplug and syscore callbacks. No extra locking + * needed. + */ static unsigned int *directed_intr_handler_cpus; static bool directed_thermal_pkg_intr_supported(void) @@ -587,6 +591,10 @@ static bool directed_thermal_pkg_intr_supported(void) * Must be called with cpu_hotplug_lock held to prevent CPUs from going offline * while iterating through packages. Also, interrupts must be enabled to avoid * deadlocks in SMP function calls. + * + * The syscore resume callback may call this function but CPU hotplug is disabled + * in that context. It also runs with interrupts disabled, but no SMP function + * calls are issued because the directed interrupt was torn down before suspend. */ static void disable_all_directed_thermal_pkg_intr(void) { @@ -678,6 +686,10 @@ static void disable_directed_thermal_pkg_intr(unsigned int cpu) * We are here via CPU hotplug. Since we are holding the * cpu_hotplug_lock, @new_cpu cannot go offline and interrupts * are enabled, so the SMP function call is safe. 
+ * + * The syscore suspend callback runs with interrupts disabled, + * but it does not reach this path because all the secondary + * CPUs are offline. */ smp_call_function_single(new_cpu, config_directed_thermal_pkg_intr, &enable, true); @@ -767,6 +779,31 @@ static int thermal_throttle_offline(unsigned int cpu) return 0; } +/* + * CPU0 may be handling the directed interrupt, but the CPU hotplug callbacks + * are not called for CPU0 during suspend and resume. + */ +static void directed_pkg_intr_syscore_resume(void *data) +{ + enable_directed_thermal_pkg_intr(0); +} + +static int directed_pkg_intr_syscore_suspend(void *data) +{ + disable_directed_thermal_pkg_intr(0); + + return 0; +} + +static const struct syscore_ops directed_pkg_intr_pm_ops = { + .resume = directed_pkg_intr_syscore_resume, + .suspend = directed_pkg_intr_syscore_suspend, +}; + +static struct syscore directed_pkg_intr_pm = { + .ops = &directed_pkg_intr_pm_ops, +}; + static __init void init_directed_pkg_intr(void) { int i; @@ -782,6 +819,8 @@ static __init void init_directed_pkg_intr(void) for (i = 0; i < topology_max_packages(); i++) directed_intr_handler_cpus[i] = nr_cpu_ids; + + register_syscore(&directed_pkg_intr_pm); } static __init int thermal_throttle_init_device(void) From 87ef019528d7883ede4d61b4d4b2df2da5a2ff02 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 28 May 2026 08:25:46 -0700 Subject: [PATCH 5/5] thermal: intel: Add a syscore shutdown callback for kexec reboot A kexec reboot may load a kernel that does not support directed package-level thermal interrupts. Without a shutdown callback, the directed interrupt configuration remains enabled across kexec but will not be handled correctly. In particular, if the CPU designated to receive the directed interrupt goes offline, no other CPU in the package will receive it. Add a syscore shutdown callback to disable directed package-level thermal interrupts on all packages before a kexec reboot. 
If the post-kexec kernel does not enable directed interrupts, it falls back to broadcasting the interrupt to all CPUs. Signed-off-by: Ricardo Neri Link: https://lore.kernel.org/linux-pm/20260528-rneri-directed-therm-intr-v2-5-8e2f9e0c1a36@linux.intel.com/ Signed-off-by: WangYuli --- drivers/thermal/intel/therm_throt.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index 854e795f01154..ec50519971b1c 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -795,9 +795,15 @@ static int directed_pkg_intr_syscore_suspend(void *data) return 0; } +static void directed_pkg_intr_syscore_shutdown(void *data) +{ + disable_all_directed_thermal_pkg_intr(); +} + static const struct syscore_ops directed_pkg_intr_pm_ops = { .resume = directed_pkg_intr_syscore_resume, .suspend = directed_pkg_intr_syscore_suspend, + .shutdown = directed_pkg_intr_syscore_shutdown, }; static struct syscore directed_pkg_intr_pm = {