diff options
Diffstat (limited to '0065-x86-boot-Improve-the-boot-watchdog-determination-of-.patch')
-rw-r--r-- | 0065-x86-boot-Improve-the-boot-watchdog-determination-of-.patch | 106 |
1 file changed, 106 insertions, 0 deletions
diff --git a/0065-x86-boot-Improve-the-boot-watchdog-determination-of-.patch b/0065-x86-boot-Improve-the-boot-watchdog-determination-of-.patch new file mode 100644 index 0000000..4a46326 --- /dev/null +++ b/0065-x86-boot-Improve-the-boot-watchdog-determination-of-.patch @@ -0,0 +1,106 @@ +From 846fb984b506135917c2862d2e4607005d6afdeb Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Tue, 2 Apr 2024 16:20:09 +0200 +Subject: [PATCH 65/67] x86/boot: Improve the boot watchdog determination of + stuck cpus +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Right now, check_nmi_watchdog() has two processing loops over all online CPUs +using prev_nmi_count as storage. + +Use a cpumask_t instead (1/32nd as much initdata) and have wait_for_nmis() +make the determination of whether it is stuck, rather than having both +functions needing to agree on how many ticks mean stuck. + +More importantly though, it means we can use the standard cpumask +infrastructure, including turning this: + + (XEN) Brought up 512 CPUs + (XEN) Testing NMI watchdog on all CPUs: 
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511} stuck + +into the rather more manageable: + + (XEN) Brought 
up 512 CPUs + (XEN) Testing NMI watchdog on all CPUs: {0-511} stuck + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 9e18f339830c828798aef465556d4029d83476a0 +master date: 2024-03-19 18:29:37 +0000 +--- + xen/arch/x86/nmi.c | 33 ++++++++++++++------------------- + 1 file changed, 14 insertions(+), 19 deletions(-) + +diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c +index 7c9591b65e..dd31034ac8 100644 +--- a/xen/arch/x86/nmi.c ++++ b/xen/arch/x86/nmi.c +@@ -150,6 +150,8 @@ int nmi_active; + + static void __init cf_check wait_for_nmis(void *p) + { ++ cpumask_t *stuck_cpus = p; ++ unsigned int cpu = smp_processor_id(); + unsigned int start_count = this_cpu(nmi_count); + unsigned long ticks = 10 * 1000 * cpu_khz / nmi_hz; + unsigned long s, e; +@@ -158,42 +160,35 @@ static void __init cf_check wait_for_nmis(void *p) + do { + cpu_relax(); + if ( this_cpu(nmi_count) >= start_count + 2 ) +- break; ++ return; ++ + e = rdtsc(); +- } while( e - s < ticks ); ++ } while ( e - s < ticks ); ++ ++ /* Timeout. Mark ourselves as stuck. */ ++ cpumask_set_cpu(cpu, stuck_cpus); + } + + void __init check_nmi_watchdog(void) + { +- static unsigned int __initdata prev_nmi_count[NR_CPUS]; +- int cpu; +- bool ok = true; ++ static cpumask_t __initdata stuck_cpus; + + if ( nmi_watchdog == NMI_NONE ) + return; + + printk("Testing NMI watchdog on all CPUs:"); + +- for_each_online_cpu ( cpu ) +- prev_nmi_count[cpu] = per_cpu(nmi_count, cpu); +- + /* + * Wait at most 10 ticks for 2 watchdog NMIs on each CPU. 
+ * Busy-wait on all CPUs: the LAPIC counter that the NMI watchdog + * uses only runs while the core's not halted + */ +- on_selected_cpus(&cpu_online_map, wait_for_nmis, NULL, 1); +- +- for_each_online_cpu ( cpu ) +- { +- if ( per_cpu(nmi_count, cpu) - prev_nmi_count[cpu] < 2 ) +- { +- printk(" %d", cpu); +- ok = false; +- } +- } ++ on_selected_cpus(&cpu_online_map, wait_for_nmis, &stuck_cpus, 1); + +- printk(" %s\n", ok ? "ok" : "stuck"); ++ if ( cpumask_empty(&stuck_cpus) ) ++ printk("ok\n"); ++ else ++ printk("{%*pbl} stuck\n", CPUMASK_PR(&stuck_cpus)); + + /* + * Now that we know it works we can reduce NMI frequency to +-- +2.44.0 + |