1 files changed, 106 insertions, 0 deletions
diff --git a/0065-x86-boot-Improve-the-boot-watchdog-determination-of-.patch b/0065-x86-boot-Improve-the-boot-watchdog-determination-of-.patch
new file mode 100644
index 0000000..4a46326
--- /dev/null
+++ b/0065-x86-boot-Improve-the-boot-watchdog-determination-of-.patch
@@ -0,0 +1,106 @@
+From 846fb984b506135917c2862d2e4607005d6afdeb Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Tue, 2 Apr 2024 16:20:09 +0200
+Subject: [PATCH 65/67] x86/boot: Improve the boot watchdog determination of
+ stuck cpus
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Right now, check_nmi_watchdog() has two processing loops over all online CPUs
+using prev_nmi_count as storage.
+
+Use a cpumask_t instead (1/32nd as much initdata) and have wait_for_nmis()
+determine whether its own CPU is stuck, rather than requiring both
+functions to agree on how many ticks mean stuck.
+
+More importantly though, it means we can use the standard cpumask
+infrastructure, including turning this:
+
+  (XEN) Brought up 512 CPUs
+  (XEN) Testing NMI watchdog on all CPUs: {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511} stuck
+
+into the rather more manageable:
+
+  (XEN) Brought up 512 CPUs
+  (XEN) Testing NMI watchdog on all CPUs: {0-511} stuck
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 9e18f339830c828798aef465556d4029d83476a0
+master date: 2024-03-19 18:29:37 +0000
+---
+ xen/arch/x86/nmi.c | 33 ++++++++++++++-------------------
+ 1 file changed, 14 insertions(+), 19 deletions(-)
+
+diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c
+index 7c9591b65e..dd31034ac8 100644
+--- a/xen/arch/x86/nmi.c
++++ b/xen/arch/x86/nmi.c
+@@ -150,6 +150,8 @@ int nmi_active;
+ 
+ static void __init cf_check wait_for_nmis(void *p)
+ {
++    cpumask_t *stuck_cpus = p;
++    unsigned int cpu = smp_processor_id();
+     unsigned int start_count = this_cpu(nmi_count);
+     unsigned long ticks = 10 * 1000 * cpu_khz / nmi_hz;
+     unsigned long s, e;
+@@ -158,42 +160,35 @@ static void __init cf_check wait_for_nmis(void *p)
+     do {
+         cpu_relax();
+         if ( this_cpu(nmi_count) >= start_count + 2 )
+-            break;
++            return;
++
+         e = rdtsc();
+-    } while( e - s < ticks );
++    } while ( e - s < ticks );
++
++    /* Timeout.  Mark ourselves as stuck. */
++    cpumask_set_cpu(cpu, stuck_cpus);
+ }
+ 
+ void __init check_nmi_watchdog(void)
+ {
+-    static unsigned int __initdata prev_nmi_count[NR_CPUS];
+-    int cpu;
+-    bool ok = true;
++    static cpumask_t __initdata stuck_cpus;
+ 
+     if ( nmi_watchdog == NMI_NONE )
+         return;
+ 
+     printk("Testing NMI watchdog on all CPUs:");
+ 
+-    for_each_online_cpu ( cpu )
+-        prev_nmi_count[cpu] = per_cpu(nmi_count, cpu);
+-
+     /*
+      * Wait at most 10 ticks for 2 watchdog NMIs on each CPU.
+      * Busy-wait on all CPUs: the LAPIC counter that the NMI watchdog
+      * uses only runs while the core's not halted
+      */
+-    on_selected_cpus(&cpu_online_map, wait_for_nmis, NULL, 1);
+-
+-    for_each_online_cpu ( cpu )
+-    {
+-        if ( per_cpu(nmi_count, cpu) - prev_nmi_count[cpu] < 2 )
+-        {
+-            printk(" %d", cpu);
+-            ok = false;
+-        }
+-    }
++    on_selected_cpus(&cpu_online_map, wait_for_nmis, &stuck_cpus, 1);
+ 
+-    printk(" %s\n", ok ? "ok" : "stuck");
++    if ( cpumask_empty(&stuck_cpus) )
++        printk("ok\n");
++    else
++        printk("{%*pbl} stuck\n", CPUMASK_PR(&stuck_cpus));
+ 
+     /*
+      * Now that we know it works we can reduce NMI frequency to
+-- 
+2.44.0
+