author	Florian Schmaus <flow@gentoo.org>	2022-10-19 10:38:35 +0200
committer	Florian Schmaus <flow@gentoo.org>	2022-10-19 10:38:35 +0200
commit	4a9cd4d8a4efd8f2e52483117b4009122393c6a6 (patch)
tree	8a467d1618431f5353fa676ca58b32bb18ba21aa
parent	Xen 4.15.4-pre-patchset-0.1 (diff)
Xen 4.15.4-pre-patchset-1
Signed-off-by: Florian Schmaus <flow@gentoo.org>
68 files changed, 4858 insertions(+), 44 deletions(-)
diff --git a/0001-build-fix-exported-variable-name-CFLAGS_stack_bounda.patch b/0001-build-fix-exported-variable-name-CFLAGS_stack_bounda.patch
index 96eb282..32ff417 100644
--- a/0001-build-fix-exported-variable-name-CFLAGS_stack_bounda.patch
+++ b/0001-build-fix-exported-variable-name-CFLAGS_stack_bounda.patch
@@ -1,7 +1,7 @@
 From f6e26ce7d9317abc41130ead6dc2443a7e2dde00 Mon Sep 17 00:00:00 2001
 From: Anthony PERARD <anthony.perard@citrix.com>
 Date: Tue, 12 Jul 2022 11:20:46 +0200
-Subject: [PATCH 01/21] build: fix exported variable name CFLAGS_stack_boundary
+Subject: [PATCH 01/67] build: fix exported variable name CFLAGS_stack_boundary
 
 Exporting a variable with a dash doesn't work reliably, they may be
 striped from the environment when calling a sub-make or sub-shell.
@@ -63,5 +63,5 @@ index e857c0f2cc2c..a5b2041f9b96 100644
 obj-y := stub.o
 obj-$(XEN_BUILD_EFI) := $(filter-out %.init.o,$(EFIOBJ))
 --
-2.35.1
+2.37.3
diff --git a/0002-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch b/0002-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch
index 45e4cfd..9f2f8e4 100644
--- a/0002-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch
+++ b/0002-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch
@@ -1,7 +1,7 @@
 From b89b932cfe86556c5de4ad56702aed83142e22a3 Mon Sep 17 00:00:00 2001
 From: Jan Beulich <jbeulich@suse.com>
 Date: Tue, 12 Jul 2022 11:21:14 +0200
-Subject: [PATCH 02/21] IOMMU/x86: work around bogus gcc12 warning in
+Subject: [PATCH 02/67] IOMMU/x86: work around bogus gcc12 warning in
  hvm_gsi_eoi()
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
@@ -48,5 +48,5 @@ index 9544f3234e65..50865eec2c04 100644
 
 /*
 --
-2.35.1
+2.37.3
diff --git a/0003-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch b/0003-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch
index b79f4b3..777ef8a 100644
--- a/0003-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch
+++ b/0003-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch
@@ -2,7 +2,7 @@ From b53df5b4341fa97614ad064a7c8e781c88b6ed71 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
  <marmarek@invisiblethingslab.com>
 Date: Tue, 12 Jul 2022 11:22:09 +0200
-Subject: [PATCH 03/21] ehci-dbgp: fix selecting n-th ehci controller
+Subject: [PATCH 03/67] ehci-dbgp: fix selecting n-th ehci controller
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
@@ -32,5 +32,5 @@ index c893d246defa..66b4811af24a 100644
 dbgp->cap = find_dbgp(dbgp, num);
 if ( !dbgp->cap )
 --
-2.35.1
+2.37.3
diff --git a/0004-tools-xenstored-Harden-corrupt.patch b/0004-tools-xenstored-Harden-corrupt.patch
index 8b30166..62b7ec9 100644
--- a/0004-tools-xenstored-Harden-corrupt.patch
+++ b/0004-tools-xenstored-Harden-corrupt.patch
@@ -1,7 +1,7 @@
 From 7fe638c28fa693d8bb8f9419de1220d4359a1b2d Mon Sep 17 00:00:00 2001
 From: Julien Grall <jgrall@amazon.com>
 Date: Tue, 12 Jul 2022 11:23:01 +0200
-Subject: [PATCH 04/21] tools/xenstored: Harden corrupt()
+Subject: [PATCH 04/67] tools/xenstored: Harden corrupt()
 
 At the moment, corrupt() is neither checking for allocation failure
 nor freeing the allocated memory.
@@ -40,5 +40,5 @@ index 8033c1e0eb28..9172dd767140 100644
 check_store();
 }
 --
-2.35.1
+2.37.3
diff --git a/0005-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch b/0005-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch
index 158e2b0..7d79c2e 100644
--- a/0005-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch
+++ b/0005-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch
@@ -1,7 +1,7 @@
 From 799a8d49237a62ea0d33c3756a6a7f665b8389b2 Mon Sep 17 00:00:00 2001
 From: Andrew Cooper <andrew.cooper3@citrix.com>
 Date: Tue, 12 Jul 2022 11:23:32 +0200
-Subject: [PATCH 05/21] x86/spec-ctrl: Only adjust MSR_SPEC_CTRL for idle with
+Subject: [PATCH 05/67] x86/spec-ctrl: Only adjust MSR_SPEC_CTRL for idle with
  legacy IBRS
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
@@ -89,5 +89,5 @@ index 68f6c46c470c..12283573cdd5 100644
 * Disable shadowing before updating the MSR. There are no SMP issues
 * here; only local processor ordering concerns.
 --
-2.35.1
+2.37.3
diff --git a/0006-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch b/0006-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch
index 65670fd..965c965 100644
--- a/0006-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch
+++ b/0006-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch
@@ -1,7 +1,7 @@
 From cd5081e8c31651e623d86532306b4c56bbcb6e6d Mon Sep 17 00:00:00 2001
 From: Andrew Cooper <andrew.cooper3@citrix.com>
 Date: Tue, 12 Jul 2022 11:24:11 +0200
-Subject: [PATCH 06/21] x86/spec-ctrl: Knobs for STIBP and PSFD, and follow
+Subject: [PATCH 06/67] x86/spec-ctrl: Knobs for STIBP and PSFD, and follow
  hardware STIBP hint
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
@@ -230,5 +230,5 @@ index eb7fb70e86f9..8212227ee02a 100644
 /*
 * PV guests can poison the RSB to any virtual address from which
 --
-2.35.1
+2.37.3
diff --git a/0007-libxc-fix-compilation-error-with-gcc13.patch b/0007-libxc-fix-compilation-error-with-gcc13.patch
index b46552f..9a1ca92 100644
--- a/0007-libxc-fix-compilation-error-with-gcc13.patch
+++ b/0007-libxc-fix-compilation-error-with-gcc13.patch
@@ -1,7 +1,7 @@
 From 77deab4233b5d9ec5cf214fdc1652424fd4fc9d6 Mon Sep 17 00:00:00 2001
 From: Charles Arnold <carnold@suse.com>
 Date: Tue, 12 Jul 2022 11:24:39 +0200
-Subject: [PATCH 07/21] libxc: fix compilation error with gcc13
+Subject: [PATCH 07/67] libxc: fix compilation error with gcc13
 
 xc_psr.c:161:5: error: conflicting types for 'xc_psr_cmt_get_data'
 due to enum/integer mismatch;
@@ -29,5 +29,5 @@ index 318920166c5e..2013200b9eff 100644
 
 int xc_psr_cmt_enabled(xc_interface *xch);
 --
-2.35.1
+2.37.3
diff --git a/0008-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch b/0008-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch
index 94f729b..22a1ebe 100644
--- a/0008-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch
+++ b/0008-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch
@@ -1,7 +1,7 @@
 From 5be1f46f435f8b05608b1eae029cb17d8bd3a560 Mon Sep 17 00:00:00 2001
 From: Andrew Cooper <andrew.cooper3@citrix.com>
 Date: Tue, 12 Jul 2022 11:25:05 +0200
-Subject: [PATCH 08/21] x86/spec-ctrl: Honour spec-ctrl=0 for unpriv-mmio
+Subject: [PATCH 08/67] x86/spec-ctrl: Honour spec-ctrl=0 for unpriv-mmio
  sub-option
 
 This was an oversight from when unpriv-mmio was introduced.
@@ -28,5 +28,5 @@ index 8212227ee02a..06790897e496 100644
 else if ( val > 0 )
 rc = -EINVAL;
 --
-2.35.1
+2.37.3
diff --git a/0009-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch b/0009-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch
index 1b8787f..53a8b70 100644
--- a/0009-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch
+++ b/0009-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch
@@ -1,7 +1,7 @@
 From ae417706870333bb52ebcf33c527809cdd2d7265 Mon Sep 17 00:00:00 2001
 From: Andrew Cooper <andrew.cooper3@citrix.com>
 Date: Tue, 12 Jul 2022 11:25:40 +0200
-Subject: [PATCH 09/21] xen/cmdline: Extend parse_boolean() to signal a name
+Subject: [PATCH 09/67] xen/cmdline: Extend parse_boolean() to signal a name
  match
 
 This will help parsing a sub-option which has boolean and non-boolean options
@@ -83,5 +83,5 @@ index 1198c7c0b207..be7498135170 100644
 
 int parse_boolean(const char *name, const char *s, const char *e);
 --
-2.35.1
+2.37.3
diff --git a/0010-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch b/0010-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch
index a808523..36577d6 100644
--- a/0010-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch
+++ b/0010-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch
@@ -1,7 +1,7 @@
 From 08bfd4d01185e94fda1be9dd79a981d890a9085e Mon Sep 17 00:00:00 2001
 From: Andrew Cooper <andrew.cooper3@citrix.com>
 Date: Tue, 12 Jul 2022 11:26:14 +0200
-Subject: [PATCH 10/21] x86/spec-ctrl: Add fine-grained cmdline suboptions for
+Subject: [PATCH 10/67] x86/spec-ctrl: Add fine-grained cmdline suboptions for
  primitives
 
 Support controling the PV/HVM suboption of msr-sc/rsb/md-clear, which
@@ -133,5 +133,5 @@ index 06790897e496..225fe08259b3 100644
 
 /* Xen's speculative sidechannel mitigation settings. */
 --
-2.35.1
+2.37.3
diff --git a/0011-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch b/0011-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch
index b597673..dc468c8 100644
--- a/0011-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch
+++ b/0011-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch
@@ -1,7 +1,7 @@
 From f241cc48dabeef6cb0b381db62f2562b0a3970eb Mon Sep 17 00:00:00 2001
 From: Anthony PERARD <anthony.perard@citrix.com>
 Date: Tue, 12 Jul 2022 11:26:47 +0200
-Subject: [PATCH 11/21] tools/helpers: fix build of xen-init-dom0 with -Werror
+Subject: [PATCH 11/67] tools/helpers: fix build of xen-init-dom0 with -Werror
 
 Missing prototype of asprintf() without _GNU_SOURCE.
 
@@ -24,5 +24,5 @@ index c99224a4b607..b4861c9e8041 100644
 #include <stdint.h>
 #include <string.h>
 --
-2.35.1
+2.37.3
diff --git a/0012-libxl-check-return-value-of-libxl__xs_directory-in-n.patch b/0012-libxl-check-return-value-of-libxl__xs_directory-in-n.patch
index 898889b..74fee03 100644
--- a/0012-libxl-check-return-value-of-libxl__xs_directory-in-n.patch
+++ b/0012-libxl-check-return-value-of-libxl__xs_directory-in-n.patch
@@ -1,7 +1,7 @@
 From d470a54087e0fbd813dae4d773ad0b830eeec4a1 Mon Sep 17 00:00:00 2001
 From: Anthony PERARD <anthony.perard@citrix.com>
 Date: Tue, 12 Jul 2022 11:26:58 +0200
-Subject: [PATCH 12/21] libxl: check return value of libxl__xs_directory in
+Subject: [PATCH 12/67] libxl: check return value of libxl__xs_directory in
  name2bdf
 
 libxl__xs_directory() can potentially return NULL without setting `n`.
@@ -34,5 +34,5 @@ index 92bf86b2bebd..a5f5cdf62b80 100644
 
 for (i = 0; i < n; i++) {
 --
-2.35.1
+2.37.3
diff --git a/0013-update-Xen-version-to-4.15.4-pre.patch b/0013-update-Xen-version-to-4.15.4-pre.patch
index 664e9df..8626fdd 100644
--- a/0013-update-Xen-version-to-4.15.4-pre.patch
+++ b/0013-update-Xen-version-to-4.15.4-pre.patch
@@ -1,7 +1,7 @@
 From 505771bb1dffdf6f763fad18ee49a913b98abfea Mon Sep 17 00:00:00 2001
 From: Jan Beulich <jbeulich@suse.com>
 Date: Tue, 12 Jul 2022 11:28:33 +0200
-Subject: [PATCH 13/21] update Xen version to 4.15.4-pre
+Subject: [PATCH 13/67] update Xen version to 4.15.4-pre
 
 ---
 xen/Makefile | 2 +-
@@ -21,5 +21,5 @@ index e9a88325c467..cd66bb3b1c84 100644
 
 -include xen-version
 --
-2.35.1
+2.37.3
diff --git a/0014-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch b/0014-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch
index 681282e..a21b4d8 100644
--- a/0014-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch
+++ b/0014-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch
@@ -1,7 +1,7 @@
 From 156ab775769d39b2dfb048ccd34dee7e86ba83a2 Mon Sep 17 00:00:00 2001
 From: Andrew Cooper <andrew.cooper3@citrix.com>
 Date: Fri, 1 Jul 2022 15:59:40 +0100
-Subject: [PATCH 14/21] x86/spec-ctrl: Rework spec_ctrl_flags context switching
+Subject: [PATCH 14/67] x86/spec-ctrl: Rework spec_ctrl_flags context switching
 
 We are shortly going to need to context switch new bits in both the vcpu
 and S3 paths. Introduce SCF_IST_MASK and SCF_DOM_MASK, and rework d->arch.verw
@@ -163,5 +163,5 @@ index 5a590bac44aa..66b00d511fc6 100644
 .macro SPEC_CTRL_ENTRY_FROM_INTR_IST
 /*
 --
-2.35.1
+2.37.3
diff --git a/0015-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch b/0015-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch
index 553dbd2..49351ae 100644
--- a/0015-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch
+++ b/0015-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch
@@ -1,7 +1,7 @@
 From 2cfbca32b9dc3a8d6520549ff468a7f550daf1b1 Mon Sep 17 00:00:00 2001
 From: Andrew Cooper <andrew.cooper3@citrix.com>
 Date: Tue, 28 Jun 2022 14:36:56 +0100
-Subject: [PATCH 15/21] x86/spec-ctrl: Rename SCF_ist_wrmsr to SCF_ist_sc_msr
+Subject: [PATCH 15/67] x86/spec-ctrl: Rename SCF_ist_wrmsr to SCF_ist_sc_msr
 
 We are about to introduce SCF_ist_ibpb, at which point SCF_ist_wrmsr becomes
 ambiguous.
@@ -106,5 +106,5 @@ index 66b00d511fc6..0ff1b118f882 100644
 
 DO_SPEC_CTRL_EXIT_TO_XEN
 --
-2.35.1
+2.37.3
diff --git a/0016-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch b/0016-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch
index 9ed0093..f114f6d 100644
--- a/0016-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch
+++ b/0016-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch
@@ -1,7 +1,7 @@
 From c707015bf118df2c43e3a48b3774916322fca50a Mon Sep 17 00:00:00 2001
 From: Andrew Cooper <andrew.cooper3@citrix.com>
 Date: Mon, 4 Jul 2022 21:32:17 +0100
-Subject: [PATCH 16/21] x86/spec-ctrl: Rename opt_ibpb to opt_ibpb_ctxt_switch
+Subject: [PATCH 16/67] x86/spec-ctrl: Rename opt_ibpb to opt_ibpb_ctxt_switch
 
 We are about to introduce the use of IBPB at different points in Xen, making
 opt_ibpb ambiguous. Rename it to opt_ibpb_ctxt_switch.
@@ -93,5 +93,5 @@ index 6f8b0e09348e..fd8162ca9ab9 100644
 extern int8_t opt_eager_fpu;
 extern int8_t opt_l1d_flush;
 --
-2.35.1
+2.37.3
diff --git a/0017-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch b/0017-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch
index bae2818..e162148 100644
--- a/0017-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch
+++ b/0017-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch
@@ -1,7 +1,7 @@
 From d7f5fb1e2abd0d56cada9bfcf96ab530d214d9aa Mon Sep 17 00:00:00 2001
 From: Andrew Cooper <andrew.cooper3@citrix.com>
 Date: Fri, 1 Jul 2022 15:59:40 +0100
-Subject: [PATCH 17/21] x86/spec-ctrl: Rework SPEC_CTRL_ENTRY_FROM_INTR_IST
+Subject: [PATCH 17/67] x86/spec-ctrl: Rework SPEC_CTRL_ENTRY_FROM_INTR_IST
 
 We are shortly going to add a conditional IBPB in this path.
 
@@ -102,5 +102,5 @@ index 0ff1b118f882..15e24cde00d1 100644
 
 /* Opencoded UNLIKELY_START() with no condition. */
 --
-2.35.1
+2.37.3
diff --git a/0018-x86-spec-ctrl-Support-IBPB-on-entry.patch b/0018-x86-spec-ctrl-Support-IBPB-on-entry.patch
index 06efb27..1de9d4c 100644
--- a/0018-x86-spec-ctrl-Support-IBPB-on-entry.patch
+++ b/0018-x86-spec-ctrl-Support-IBPB-on-entry.patch
@@ -1,7 +1,7 @@
 From f0d78e0c11d3984c74f34a7325f862dee93a5835 Mon Sep 17 00:00:00 2001
 From: Andrew Cooper <andrew.cooper3@citrix.com>
 Date: Thu, 24 Feb 2022 13:44:33 +0000
-Subject: [PATCH 18/21] x86/spec-ctrl: Support IBPB-on-entry
+Subject: [PATCH 18/67] x86/spec-ctrl: Support IBPB-on-entry
 
 We are going to need this to mitigate Branch Type Confusion on AMD/Hygon CPUs,
 but as we've talked about using it in other cases too, arrange to support it
@@ -296,5 +296,5 @@ index 15e24cde00d1..9eb4ad9ab71d 100644
 
 jz .L\@_skip_rsb
 --
-2.35.1
+2.37.3
diff --git a/0019-x86-cpuid-Enumeration-for-BTC_NO.patch b/0019-x86-cpuid-Enumeration-for-BTC_NO.patch
index 91c38ee..a4444f4 100644
--- a/0019-x86-cpuid-Enumeration-for-BTC_NO.patch
+++ b/0019-x86-cpuid-Enumeration-for-BTC_NO.patch
@@ -1,7 +1,7 @@
 From 2b29ac476fa0c91655906fac3512202e514ecbed Mon Sep 17 00:00:00 2001
 From: Andrew Cooper <andrew.cooper3@citrix.com>
 Date: Mon, 16 May 2022 15:48:24 +0100
-Subject: [PATCH 19/21] x86/cpuid: Enumeration for BTC_NO
+Subject: [PATCH 19/67] x86/cpuid: Enumeration for BTC_NO
 
 BTC_NO indicates that hardware is not succeptable to Branch Type Confusion.
 
@@ -102,5 +102,5 @@ index 9686c82ed75c..1bbc7da4b53c 100644
 /* Intel-defined CPU features, CPUID level 0x00000007:0.edx, word 9 */
 XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions */
 --
-2.35.1
+2.37.3
diff --git a/0020-x86-spec-ctrl-Enable-Zen2-chickenbit.patch b/0020-x86-spec-ctrl-Enable-Zen2-chickenbit.patch
index 9fd2fe0..4d12421 100644
--- a/0020-x86-spec-ctrl-Enable-Zen2-chickenbit.patch
+++ b/0020-x86-spec-ctrl-Enable-Zen2-chickenbit.patch
@@ -1,7 +1,7 @@
 From 409976bed91f61fb7b053d536d2fc87cf3ad7018 Mon Sep 17 00:00:00 2001
 From: Andrew Cooper <andrew.cooper3@citrix.com>
 Date: Tue, 15 Mar 2022 18:30:25 +0000
-Subject: [PATCH 20/21] x86/spec-ctrl: Enable Zen2 chickenbit
+Subject: [PATCH 20/67] x86/spec-ctrl: Enable Zen2 chickenbit
 
 ... as instructed in the Branch Type Confusion whitepaper.
@@ -101,5 +101,5 @@ index 1e743461e91d..b4a360723b14 100644
 #define MSR_AMD64_DR0_ADDRESS_MASK 0xc0011027
 #define MSR_AMD64_DR1_ADDRESS_MASK 0xc0011019
 --
-2.35.1
+2.37.3
diff --git a/0021-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch b/0021-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch
index 12ecc5b..b676ba3 100644
--- a/0021-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch
+++ b/0021-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch
@@ -1,7 +1,7 @@
 From 35bf91d30f1a480dcf5bfd99b79384b2b283da7f Mon Sep 17 00:00:00 2001
 From: Andrew Cooper <andrew.cooper3@citrix.com>
 Date: Mon, 27 Jun 2022 19:29:40 +0100
-Subject: [PATCH 21/21] x86/spec-ctrl: Mitigate Branch Type Confusion when
+Subject: [PATCH 21/67] x86/spec-ctrl: Mitigate Branch Type Confusion when
  possible
 
 Branch Type Confusion affects AMD/Hygon CPUs on Zen2 and earlier. To
@@ -301,5 +301,5 @@ index 10cd0cd2518f..33e845991b0a 100644
 extern int8_t opt_eager_fpu;
 extern int8_t opt_l1d_flush;
 --
-2.35.1
+2.37.3
diff --git a/0022-x86-mm-correct-TLB-flush-condition-in-_get_page_type.patch b/0022-x86-mm-correct-TLB-flush-condition-in-_get_page_type.patch
new file mode 100644
index 0000000..81f5b9a
--- /dev/null
+++ b/0022-x86-mm-correct-TLB-flush-condition-in-_get_page_type.patch
@@ -0,0 +1,45 @@
+From 3859f3ee7e37323ae5e0014c07ba8d3a4d7890b2 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 26 Jul 2022 15:03:14 +0200
+Subject: [PATCH 22/67] x86/mm: correct TLB flush condition in _get_page_type()
+
+When this logic was moved, it was moved across the point where nx is
+updated to hold the new type for the page. IOW originally it was
+equivalent to using x (and perhaps x would better have been used), but
+now it isn't anymore. Switch to using x, which then brings things in
+line again with the slightly earlier comment there (now) talking about
+transitions _from_ writable.
+
+I have to confess though that I cannot make a direct connection between
+the reported observed behavior of guests leaving several pages around
+with pending general references and the change here. Repeated testing,
+nevertheless, confirms the reported issue is no longer there.
+
+This is CVE-2022-33745 / XSA-408.
+
+Reported-by: Charles Arnold <carnold@suse.com>
+Fixes: 8cc5036bc385 ("x86/pv: Fix ABAC cmpxchg() race in _get_page_type()")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: a9949efb288fd6e21bbaf9d5826207c7c41cda27
+master date: 2022-07-26 14:54:34 +0200
+---
+ xen/arch/x86/mm.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 7d0747017db5..c88dc749d431 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -2992,7 +2992,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+         if ( unlikely(!cpumask_empty(mask)) &&
+              /* Shadow mode: track only writable pages. */
+              (!shadow_mode_enabled(d) ||
+-              ((nx & PGT_type_mask) == PGT_writable_page)) )
++              ((x & PGT_type_mask) == PGT_writable_page)) )
+         {
+             perfc_incr(need_flush_tlb_flush);
+             /*
+--
+2.37.3
+
diff --git a/0023-xl-relax-freemem-s-retry-calculation.patch b/0023-xl-relax-freemem-s-retry-calculation.patch
new file mode 100644
index 0000000..d7dda30
--- /dev/null
+++ b/0023-xl-relax-freemem-s-retry-calculation.patch
@@ -0,0 +1,80 @@
+From 2173d9c8be28d5f33c0e299a363ac994867d111b Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Wed, 27 Jul 2022 09:28:46 +0200
+Subject: [PATCH 23/67] xl: relax freemem()'s retry calculation
+
+While in principle possible also under other conditions as long as other
+parallel operations potentially consuming memory aren't "locked out", in
+particular with IOMMU large page mappings used in Dom0 (for PV when in
+strict mode; for PVH when not sharing page tables with HAP) ballooning
+out of individual pages can actually lead to less free memory available
+afterwards. This is because to split a large page, one or more page
+table pages are necessary (one per level that is split).
+
+When rebooting a guest I've observed freemem() to fail: A single page
+was required to be ballooned out (presumably because of heap
+fragmentation in the hypervisor). This ballooning out of a single page
+of course went fast, but freemem() then found that it would require to
+balloon out another page. This repeating just another time leads to the
+function to signal failure to the caller - without having come anywhere
+near the designated 30s that the whole process is allowed to not make
+any progress at all.
+
+Convert from a simple retry count to actually calculating elapsed time,
+subtracting from an initial credit of 30s. Don't go as far as limiting
+the "wait_secs" value passed to libxl_wait_for_memory_target(), though.
+While this leads to the overall process now possibly taking longer (if
+the previous iteration ended very close to the intended 30s), this
+compensates to some degree for the value passed really meaning "allowed
+to run for this long without making progress".
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
+master commit: e58370df76eacf1f7ca0340e9b96430c77b41a79
+master date: 2022-07-12 15:25:00 +0200
+---
+ tools/xl/xl_vmcontrol.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+diff --git a/tools/xl/xl_vmcontrol.c b/tools/xl/xl_vmcontrol.c
+index 435155a03396..5dee7730ca76 100644
+--- a/tools/xl/xl_vmcontrol.c
++++ b/tools/xl/xl_vmcontrol.c
+@@ -321,7 +321,8 @@ static int domain_wait_event(uint32_t domid, libxl_event **event_r)
+  */
+ static bool freemem(uint32_t domid, libxl_domain_config *d_config)
+ {
+-    int rc, retries = 3;
++    int rc;
++    double credit = 30;
+     uint64_t need_memkb, free_memkb;
+ 
+     if (!autoballoon)
+@@ -332,6 +333,8 @@ static bool freemem(uint32_t domid, libxl_domain_config *d_config)
+         return false;
+ 
+     do {
++        time_t start;
++
+         rc = libxl_get_free_memory(ctx, &free_memkb);
+         if (rc < 0)
+             return false;
+@@ -345,12 +348,13 @@ static bool freemem(uint32_t domid, libxl_domain_config *d_config)
+ 
+         /* wait until dom0 reaches its target, as long as we are making
+          * progress */
++        start = time(NULL);
+         rc = libxl_wait_for_memory_target(ctx, 0, 10);
+         if (rc < 0)
+             return false;
+ 
+-        retries--;
+-    } while (retries > 0);
++        credit -= difftime(time(NULL), start);
++    } while (credit > 0);
+ 
+     return false;
+ }
+--
+2.37.3
+
diff --git a/0024-tools-init-xenstore-domain-fix-memory-map-for-PVH-st.patch b/0024-tools-init-xenstore-domain-fix-memory-map-for-PVH-st.patch
new file mode 100644
index 0000000..fbb1448
--- /dev/null
+++ b/0024-tools-init-xenstore-domain-fix-memory-map-for-PVH-st.patch
@@ -0,0 +1,59 @@
+From a2684d9cbbfb02b268be7e551674f709db0617a4 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Wed, 27 Jul 2022 09:29:08 +0200
+Subject: [PATCH 24/67] tools/init-xenstore-domain: fix memory map for PVH
+ stubdom
+
+In case of maxmem != memsize the E820 map of the PVH stubdom is wrong,
+as it is missing the RAM above memsize.
+
+Additionally the memory map should only specify the Xen special pages
+as reserved.
+
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
+master commit: 134d53f577076d4f26091e25762f27cc3c73bf58
+master date: 2022-07-12 15:25:20 +0200
+---
+ tools/helpers/init-xenstore-domain.c | 14 +++++++++-----
+ 1 file changed, 9 insertions(+), 5 deletions(-)
+
+diff --git a/tools/helpers/init-xenstore-domain.c b/tools/helpers/init-xenstore-domain.c
+index 6836002f0bad..32689abd7479 100644
+--- a/tools/helpers/init-xenstore-domain.c
++++ b/tools/helpers/init-xenstore-domain.c
+@@ -72,8 +72,9 @@ static int build(xc_interface *xch)
+     char cmdline[512];
+     int rv, xs_fd;
+     struct xc_dom_image *dom = NULL;
+-    int limit_kb = (maxmem ? : (memory + 1)) * 1024;
++    int limit_kb = (maxmem ? : memory) * 1024 + X86_HVM_NR_SPECIAL_PAGES * 4;
+     uint64_t mem_size = MB(memory);
++    uint64_t max_size = MB(maxmem ? : memory);
+     struct e820entry e820[3];
+     struct xen_domctl_createdomain config = {
+         .ssidref = SECINITSID_DOMU,
+@@ -157,13 +158,16 @@ static int build(xc_interface *xch)
+         dom->mmio_start = LAPIC_BASE_ADDRESS;
+         dom->max_vcpus = 1;
+         e820[0].addr = 0;
+-        e820[0].size = dom->lowmem_end;
++        e820[0].size = (max_size > LAPIC_BASE_ADDRESS) ?
++                       LAPIC_BASE_ADDRESS : max_size;
+         e820[0].type = E820_RAM;
+-        e820[1].addr = LAPIC_BASE_ADDRESS;
+-        e820[1].size = dom->mmio_size;
++        e820[1].addr = (X86_HVM_END_SPECIAL_REGION -
++                        X86_HVM_NR_SPECIAL_PAGES) << XC_PAGE_SHIFT;
++        e820[1].size = X86_HVM_NR_SPECIAL_PAGES << XC_PAGE_SHIFT;
+         e820[1].type = E820_RESERVED;
+         e820[2].addr = GB(4);
+-        e820[2].size = dom->highmem_end - GB(4);
++        e820[2].size = (max_size > LAPIC_BASE_ADDRESS) ?
++                       max_size - LAPIC_BASE_ADDRESS : 0;
+         e820[2].type = E820_RAM;
+     }
+ 
+--
+2.37.3
+
diff --git a/0025-xl-move-freemem-s-credit-expired-loop-exit.patch b/0025-xl-move-freemem-s-credit-expired-loop-exit.patch
new file mode 100644
index 0000000..c3a1965
--- /dev/null
+++ b/0025-xl-move-freemem-s-credit-expired-loop-exit.patch
@@ -0,0 +1,55 @@
+From c37099426ea678c1d5b6c99ae5ad6834f4edd2e6 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Wed, 27 Jul 2022 09:29:31 +0200
+Subject: [PATCH 25/67] xl: move freemem()'s "credit expired" loop exit
+
+Move the "credit expired" loop exit to the middle of the loop,
+immediately after "return true". This way having reached the goal on the
+last iteration would be reported as success to the caller, rather than
+as "timed out".
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
+master commit: d8f8cb8bdd02fad3b6986ae93511f750fa7f7e6a
+master date: 2022-07-18 17:48:18 +0200
+---
+ tools/xl/xl_vmcontrol.c | 9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+diff --git a/tools/xl/xl_vmcontrol.c b/tools/xl/xl_vmcontrol.c
+index 5dee7730ca76..d1c6f8aae67a 100644
+--- a/tools/xl/xl_vmcontrol.c
++++ b/tools/xl/xl_vmcontrol.c
+@@ -332,7 +332,7 @@ static bool freemem(uint32_t domid, libxl_domain_config *d_config)
+         if (rc < 0)
+             return false;
+ 
+-    do {
++    for (;;) {
+         time_t start;
+ 
+         rc = libxl_get_free_memory(ctx, &free_memkb);
+@@ -342,6 +342,9 @@ static bool freemem(uint32_t domid, libxl_domain_config *d_config)
+         if (free_memkb >= need_memkb)
+             return true;
+ 
++        if (credit <= 0)
++            return false;
++
+         rc = libxl_set_memory_target(ctx, 0, free_memkb - need_memkb, 1, 0);
+         if (rc < 0)
+             return false;
+@@ -354,9 +357,7 @@ static bool freemem(uint32_t domid, libxl_domain_config *d_config)
+             return false;
+ 
+         credit -= difftime(time(NULL), start);
+-    } while (credit > 0);
+-
+-    return false;
++    }
+ }
+ 
+ static void reload_domain_config(uint32_t domid,
+--
+2.37.3
+
diff --git a/0026-x86-spec-ctrl-correct-per-guest-type-reporting-of-MD.patch b/0026-x86-spec-ctrl-correct-per-guest-type-reporting-of-MD.patch
new file mode 100644
index 0000000..fbf3f41
--- /dev/null
+++ b/0026-x86-spec-ctrl-correct-per-guest-type-reporting-of-MD.patch
@@ -0,0 +1,56 @@
+From 5f1d0179e15d726622a49044a825894d5010df15 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Wed, 27 Jul 2022 09:29:54 +0200
+Subject: [PATCH 26/67] x86/spec-ctrl: correct per-guest-type reporting of
+ MD_CLEAR
+
+There are command line controls for this and the default also isn't "always
+enable when hardware supports it", which logging should take into account.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: fdbf8bdfebc2ed323c521848f642cc4f6b8cb662
+master date: 2022-07-19 08:36:53 +0200
+---
+ xen/arch/x86/spec_ctrl.c | 10 ++++------
+ 1 file changed, 4 insertions(+), 6 deletions(-)
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 563519ce0e31..f7b0251c42bc 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -511,13 +511,12 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+     printk("  Support for HVM VMs:%s%s%s%s%s%s\n",
+            (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ||
+             boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ||
+-            boot_cpu_has(X86_FEATURE_MD_CLEAR) ||
+             boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ||
+-            opt_eager_fpu) ? "" : " None",
++            opt_eager_fpu || opt_md_clear_hvm) ? "" : " None",
+            boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "",
+            boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "",
+            opt_eager_fpu ? " EAGER_FPU" : "",
+-           boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "",
++           opt_md_clear_hvm ? " MD_CLEAR" : "",
+            boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ? " IBPB-entry" : "");
+ 
+ #endif
+@@ -525,13 +524,12 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+     printk("  Support for PV VMs:%s%s%s%s%s%s\n",
+            (boot_cpu_has(X86_FEATURE_SC_MSR_PV) ||
+             boot_cpu_has(X86_FEATURE_SC_RSB_PV) ||
+-            boot_cpu_has(X86_FEATURE_MD_CLEAR) ||
+             boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ||
+-            opt_eager_fpu) ? "" : " None",
++            opt_eager_fpu || opt_md_clear_pv) ? "" : " None",
+            boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "",
+            boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "",
+            opt_eager_fpu ? " EAGER_FPU" : "",
+-           boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "",
++           opt_md_clear_pv ? " MD_CLEAR" : "",
+            boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ? " IBPB-entry" : "");
+ 
+     printk("  XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n",
+--
+2.37.3
+
diff --git a/0027-x86-deal-with-gcc12-release-build-issues.patch b/0027-x86-deal-with-gcc12-release-build-issues.patch
new file mode 100644
index 0000000..d26f6d3
--- /dev/null
+++ b/0027-x86-deal-with-gcc12-release-build-issues.patch
@@ -0,0 +1,65 @@
+From a095c6cde8a717325cc31bb393c547cad5e16e35 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Wed, 27 Jul 2022 09:30:24 +0200
+Subject: [PATCH 27/67] x86: deal with gcc12 release build issues
+
+While a number of issues we previously had with pre-release gcc12 were
+fixed in the final release, we continue to have one issue (with multiple
+instances) when doing release builds (i.e. at higher optimization
+levels): The compiler takes issue with subtracting (always 1 in our
+case) from artifical labels (expressed as array) marking the end of
+certain regions. This isn't an unreasonable position to take. Simply
+hide the "array-ness" by casting to an integer type. To keep things
+looking consistently, apply the same cast also on the respective
+expressions dealing with the starting addresses. (Note how
+efi_arch_memory_setup()'s l2_table_offset() invocations avoid a similar
+issue by already having the necessary casts.) In is_xen_fixed_mfn()
+further switch from __pa() to virt_to_maddr() to better match the left
+sides of the <= operators.
+
+Reported-by: Charles Arnold <carnold@suse.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 9723507daf2120131410c91980d4e4d9b0d0aa90
+master date: 2022-07-19 08:37:29 +0200
+---
+ xen/arch/x86/efi/efi-boot.h | 6 +++---
+ xen/include/asm-x86/mm.h    | 4 ++--
+ 2 files changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/xen/arch/x86/efi/efi-boot.h b/xen/arch/x86/efi/efi-boot.h
+index 2541ba1f320a..84fd77931456 100644
+--- a/xen/arch/x86/efi/efi-boot.h
++++ b/xen/arch/x86/efi/efi-boot.h
+@@ -624,10 +624,10 @@ static void __init efi_arch_memory_setup(void)
+      * appropriate l2 slots to map.
+      */
+ #define l2_4G_offset(a) \
+-    (((UINTN)(a) >> L2_PAGETABLE_SHIFT) & (4 * L2_PAGETABLE_ENTRIES - 1))
++    (((a) >> L2_PAGETABLE_SHIFT) & (4 * L2_PAGETABLE_ENTRIES - 1))
+ 
+-    for ( i = l2_4G_offset(_start);
+-          i <= l2_4G_offset(_end - 1); ++i )
++    for ( i = l2_4G_offset((UINTN)_start);
++          i <= l2_4G_offset((UINTN)_end - 1); ++i )
+     {
+         l2_pgentry_t pte = l2e_from_paddr(i << L2_PAGETABLE_SHIFT,
+                                           __PAGE_HYPERVISOR | _PAGE_PSE);
+diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
+index 5c19b71eca70..71dd28f126c3 100644
+--- a/xen/include/asm-x86/mm.h
++++ b/xen/include/asm-x86/mm.h
+@@ -309,8 +309,8 @@ struct page_info
+ #define is_xen_heap_mfn(mfn) \
+     (mfn_valid(mfn) && is_xen_heap_page(mfn_to_page(mfn)))
+ #define is_xen_fixed_mfn(mfn) \
+-    (((mfn_to_maddr(mfn)) >= __pa(_stext)) && \
+-     ((mfn_to_maddr(mfn)) <= __pa(__2M_rwdata_end - 1)))
++    (((mfn_to_maddr(mfn)) >= virt_to_maddr((unsigned long)_stext)) && \
++     ((mfn_to_maddr(mfn)) <= virt_to_maddr((unsigned long)__2M_rwdata_end - 1)))
+ 
+ #define PRtype_info "016lx"/* should only be used for printk's */
+ 
+--
+2.37.3
+
diff --git a/0028-x86emul-add-memory-operand-low-bits-checks-for-ENQCM.patch b/0028-x86emul-add-memory-operand-low-bits-checks-for-ENQCM.patch
new file mode 100644
index 0000000..26b959e
--- /dev/null
+++ b/0028-x86emul-add-memory-operand-low-bits-checks-for-ENQCM.patch
@@ -0,0 +1,45 @@
+From 4799a202a9017360708c18aa8cd699bd8d6be08b Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Wed, 27 Jul 2022 09:31:01 +0200
+Subject: [PATCH 28/67] x86emul: add memory operand low bits checks for
+ ENQCMD{,S}
+
+Already ISE rev 044 added text to this effect; rev 045 further dropped
+leftover earlier text indicating the contrary:
+- ENQCMD requires the low 32 bits of the memory operand to be clear,
+- ENDCMDS requires bits 20...30 of the memory operand to be clear.
+
+Fixes: d27385968741 ("x86emul: support ENQCMD insns")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: d620c66bdbe5510c3bae89be8cc7ca9a2a6cbaba
+master date: 2022-07-20 15:46:48 +0200
+---
+ xen/arch/x86/x86_emulate/x86_emulate.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
+index 5e297f797187..247c14dc4e68 100644
+--- a/xen/arch/x86/x86_emulate/x86_emulate.c
++++ b/xen/arch/x86/x86_emulate/x86_emulate.c
+@@ -10464,6 +10464,7 @@ x86_emulate(
+             goto done;
+         if ( vex.pfx == vex_f2 ) /* enqcmd */
+         {
++            generate_exception_if(mmvalp->data32[0], EXC_GP, 0);
+             fail_if(!ops->read_msr);
+             if ( (rc = ops->read_msr(MSR_PASID, &msr_val,
+                                      ctxt)) != X86EMUL_OKAY )
+@@ -10471,7 +10472,8 @@ x86_emulate(
+             generate_exception_if(!(msr_val & PASID_VALID), EXC_GP, 0);
+             mmvalp->data32[0] = MASK_EXTR(msr_val, PASID_PASID_MASK);
+         }
+-        mmvalp->data32[0] &= ~0x7ff00000;
++        else
++            generate_exception_if(mmvalp->data32[0] & 0x7ff00000, EXC_GP, 0);
+         state->blk = blk_enqcmd;
+         if ( (rc = ops->blk(x86_seg_es, src.val, mmvalp, 64, &_regs.eflags,
+                             state, ctxt)) != X86EMUL_OKAY )
+--
+2.37.3
+
diff --git a/0029-x86-also-suppress-use-of-MMX-insns.patch b/0029-x86-also-suppress-use-of-MMX-insns.patch
new file mode 100644
index 0000000..1298a47
--- /dev/null
+++ b/0029-x86-also-suppress-use-of-MMX-insns.patch
@@ -0,0 +1,39 @@
+From 30d3de4c61c297e12662df1fdb89af335947e59d Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Wed, 27 Jul 2022 09:31:31 +0200
+Subject: [PATCH 29/67] x86: also suppress use of MMX insns
+
+Passing -mno-sse alone is not enough: The compiler may still find
+(questionable) reasons to use MMX insns. In particular with gcc12 use
+of MOVD+PUNPCKLDQ+MOVQ was observed in an apparent attempt to auto-
+vectorize the storing of two adjacent zeroes, 32 bits each.
+
+Reported-by: ChrisD <chris@dalessio.org>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 6fe2e39a0243bddba60f83b77b972a5922d25eb8
+master date: 2022-07-20 15:48:49 +0200
+---
+ xen/arch/x86/arch.mk | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/arch.mk b/xen/arch/x86/arch.mk
+index 456e5d5c1ad7..c4337a1a118c 100644
+--- a/xen/arch/x86/arch.mk
++++ b/xen/arch/x86/arch.mk
+@@ -37,9 +37,9 @@ $(call as-option-add,CFLAGS,CC,\
+ 
+ CFLAGS += -mno-red-zone -fpic
+ 
+-# Xen doesn't use SSE interally. If the compiler supports it, also skip the
+-# SSE setup for variadic function calls.
+-CFLAGS += -mno-sse $(call cc-option,$(CC),-mskip-rax-setup)
++# Xen doesn't use MMX or SSE interally. If the compiler supports it, also skip
++# the SSE setup for variadic function calls.
++CFLAGS += -mno-mmx -mno-sse $(call cc-option,$(CC),-mskip-rax-setup)
+ 
+ # Compile with thunk-extern, indirect-branch-register if avaiable.
+ CFLAGS-$(CONFIG_INDIRECT_THUNK) += -mindirect-branch=thunk-extern
+--
+2.37.3
+
diff --git a/0030-common-memory-Fix-ifdefs-for-ptdom_max_order.patch b/0030-common-memory-Fix-ifdefs-for-ptdom_max_order.patch
new file mode 100644
index 0000000..a9bf845
--- /dev/null
+++ b/0030-common-memory-Fix-ifdefs-for-ptdom_max_order.patch
@@ -0,0 +1,52 @@
+From b64f1c9e3e3a2a416c7bb5aab77ba5d2cba98638 Mon Sep 17 00:00:00 2001
+From: Luca Fancellu <luca.fancellu@arm.com>
+Date: Wed, 27 Jul 2022 09:31:49 +0200
+Subject: [PATCH 30/67] common/memory: Fix ifdefs for ptdom_max_order
+
+In common/memory.c the ifdef code surrounding ptdom_max_order is
+using HAS_PASSTHROUGH instead of CONFIG_HAS_PASSTHROUGH, fix the
+problem using the correct macro.
+
+Fixes: e0d44c1f9461 ("build: convert HAS_PASSTHROUGH use to Kconfig")
+Signed-off-by: Luca Fancellu <luca.fancellu@arm.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 5707470bf3103ebae43697a7ac2faced6cd35f92
+master date: 2022-07-26 08:33:46 +0200
+---
+ xen/common/memory.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/xen/common/memory.c b/xen/common/memory.c
+index 297b98a562b2..95b2b934e4a2 100644
+--- a/xen/common/memory.c
++++ b/xen/common/memory.c
+@@ -58,7 +58,7 @@ struct memop_args {
+ static unsigned int __read_mostly domu_max_order = CONFIG_DOMU_MAX_ORDER;
+ static unsigned int __read_mostly ctldom_max_order = CONFIG_CTLDOM_MAX_ORDER;
+ static unsigned int __read_mostly hwdom_max_order = CONFIG_HWDOM_MAX_ORDER;
+-#ifdef HAS_PASSTHROUGH
++#ifdef CONFIG_HAS_PASSTHROUGH
+ static unsigned int __read_mostly ptdom_max_order = CONFIG_PTDOM_MAX_ORDER;
+ #endif
+ 
+@@ -70,7 +70,7 @@ static int __init parse_max_order(const char *s)
+         ctldom_max_order = simple_strtoul(s, &s, 0);
+     if ( *s == ',' && *++s != ',' )
+         hwdom_max_order = simple_strtoul(s, &s, 0);
+-#ifdef HAS_PASSTHROUGH
++#ifdef CONFIG_HAS_PASSTHROUGH
+     if ( *s == ',' && *++s != ',' )
+         ptdom_max_order = simple_strtoul(s, &s, 0);
+ #endif
+@@ -83,7 +83,7 @@ static unsigned int max_order(const struct domain *d)
+ {
+     unsigned int order = domu_max_order;
+ 
+-#ifdef HAS_PASSTHROUGH
++#ifdef CONFIG_HAS_PASSTHROUGH
+     if ( cache_flush_permitted(d) && order < ptdom_max_order )
+         order = ptdom_max_order;
+ #endif
+--
+2.37.3
+
diff --git a/0031-tools-libxl-env-variable-to-signal-whether-disk-nic-.patch b/0031-tools-libxl-env-variable-to-signal-whether-disk-nic-.patch
new file mode 100644
index 0000000..a52055a
--- /dev/null
+++ b/0031-tools-libxl-env-variable-to-signal-whether-disk-nic-.patch
@@ -0,0 +1,107 @@
+From 1b9845dcf959421db3a071a6bc0aa9d8edbffb50 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Wed, 3 Aug 2022 12:41:18 +0200
+Subject: [PATCH 31/67] tools/libxl: env variable to signal whether disk/nic
+ backend is trusted
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Introduce support in libxl for fetching the default backend trusted
+option for disk and nic devices.
+
+Users can set LIBXL_{DISK,NIC}_BACKEND_UNTRUSTED environment variable
+to notify libxl of whether the backends for disk and nic devices
+should be trusted. Such information is passed into the frontend so it
+can take the appropriate measures.
+
+This is part of XSA-403.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
+---
+ docs/man/xl.1.pod.in          | 18 ++++++++++++++++++
+ tools/libs/light/libxl_disk.c |  5 +++++
+ tools/libs/light/libxl_nic.c  |  7 +++++++
+ 3 files changed, 30 insertions(+)
+
+diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in
+index e2176bd696cb..45e1430aeb74 100644
+--- a/docs/man/xl.1.pod.in
++++ b/docs/man/xl.1.pod.in
+@@ -1946,6 +1946,24 @@ shows the decimal value. For non-linear mode, it shows hexadecimal value.
+ 
+ =back
+ 
++=head1 ENVIRONMENT
++
++=over 4
++
++=item B<LIBXL_DISK_BACKEND_UNTRUSTED>
++
++Set this environment variable to "1" to suggest to the guest that the disk
++backend shouldn't be trusted. If the variable is absent or set to "0", the
++backend will be trusted.
++
++=item B<LIBXL_NIC_BACKEND_UNTRUSTED>
++
++Set this environment variable to "1" to suggest to the guest that the network
++backend shouldn't be trusted. If the variable is absent or set to "0", the
++backend will be trusted.
++
++=back
++
+ =head1 IGNORED FOR COMPATIBILITY WITH XM
+ 
+ xl is mostly command-line compatible with the old xm utility used with
+diff --git a/tools/libs/light/libxl_disk.c b/tools/libs/light/libxl_disk.c
+index 93936d0dd0f8..67d1cc18578f 100644
+--- a/tools/libs/light/libxl_disk.c
++++ b/tools/libs/light/libxl_disk.c
+@@ -246,6 +246,7 @@ static void device_disk_add(libxl__egc *egc, uint32_t domid,
+     libxl_domain_config d_config;
+     libxl_device_disk disk_saved;
+     libxl__flock *lock = NULL;
++    const char *envvar;
+ 
+     libxl_domain_config_init(&d_config);
+     libxl_device_disk_init(&disk_saved);
+@@ -395,6 +396,10 @@ static void device_disk_add(libxl__egc *egc, uint32_t domid,
+     flexarray_append(front, GCSPRINTF("%d", device->devid));
+     flexarray_append(front, "device-type");
+     flexarray_append(front, disk->is_cdrom ? "cdrom" : "disk");
++    flexarray_append(front, "trusted");
++    envvar = getenv("LIBXL_DISK_BACKEND_UNTRUSTED");
++    /* Set "trusted=1" if envvar missing or is "0". */
++    flexarray_append(front, !envvar || !strcmp("0", envvar) ? "1" : "0");
+ 
+     /*
+      * Old PV kernel disk frontends before 2.6.26 rely on tool stack to
+diff --git a/tools/libs/light/libxl_nic.c b/tools/libs/light/libxl_nic.c
+index 0b9e70c9d13d..f87890d1d65f 100644
+--- a/tools/libs/light/libxl_nic.c
++++ b/tools/libs/light/libxl_nic.c
+@@ -132,6 +132,8 @@ static int libxl__set_xenstore_nic(libxl__gc *gc, uint32_t domid,
+                                    flexarray_t *back, flexarray_t *front,
+                                    flexarray_t *ro_front)
+ {
++    const char *envvar;
++
+     flexarray_grow(back, 2);
+ 
+     if (nic->script)
+@@ -255,6 +257,11 @@ static int libxl__set_xenstore_nic(libxl__gc *gc, uint32_t domid,
+     flexarray_append(back, "hotplug-status");
+     flexarray_append(back, "");
+ 
++    flexarray_append(front, "trusted");
++    envvar = getenv("LIBXL_NIC_BACKEND_UNTRUSTED");
++    /* Set "trusted=1" if envvar missing or is "0". */
++    flexarray_append(front, !envvar || !strcmp("0", envvar) ? "1" : "0");
++
+     return 0;
+ }
+ 
+--
+2.37.3
+
diff --git a/0032-x86-msr-fix-X2APIC_LAST.patch b/0032-x86-msr-fix-X2APIC_LAST.patch
new file mode 100644
index 0000000..ac42842
--- /dev/null
+++ b/0032-x86-msr-fix-X2APIC_LAST.patch
@@ -0,0 +1,66 @@
+From df3395f6b2d759aba39fb67a7bc0fe49147c8b39 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com>
+Date: Wed, 3 Aug 2022 12:41:49 +0200
+Subject: [PATCH 32/67] x86/msr: fix X2APIC_LAST
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The latest Intel manual now says the X2APIC reserved range is only
+0x800 to 0x8ff (NOT 0xbff).
+This changed between SDM 68 (Nov 2018) and SDM 69 (Jan 2019).
+The AMD manual documents 0x800-0x8ff too.
+
+There are non-X2APIC MSRs in the 0x900-0xbff range now:
+e.g. 0x981 is IA32_TME_CAPABILITY, an architectural MSR.
+
+The new MSR in this range appears to have been introduced in Icelake,
+so this commit should be backported to Xen versions supporting Icelake.
+
+Signed-off-by: Edwin Török <edvin.torok@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 13316827faadbb4f72ae6c625af9938d8f976f86
+master date: 2022-07-27 12:57:10 +0200
+---
+ xen/arch/x86/hvm/vmx/vmx.c      | 4 ++--
+ xen/include/asm-x86/msr-index.h | 2 +-
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index 868151a2e533..775b36433e24 100644
+--- a/xen/arch/x86/hvm/vmx/vmx.c
++++ b/xen/arch/x86/hvm/vmx/vmx.c
+@@ -3401,7 +3401,7 @@ void vmx_vlapic_msr_changed(struct vcpu *v)
+         if ( cpu_has_vmx_apic_reg_virt )
+         {
+             for ( msr = MSR_X2APIC_FIRST;
+-                  msr <= MSR_X2APIC_FIRST + 0xff; msr++ )
++                  msr <= MSR_X2APIC_LAST; msr++ )
+                 vmx_clear_msr_intercept(v, msr, VMX_MSR_R);
+ 
+             vmx_set_msr_intercept(v, MSR_X2APIC_PPR, VMX_MSR_R);
+@@ -3422,7 +3422,7 @@ void vmx_vlapic_msr_changed(struct vcpu *v)
+     if ( !(v->arch.hvm.vmx.secondary_exec_control &
+            SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE) )
+         for ( msr = MSR_X2APIC_FIRST;
+-              msr <= MSR_X2APIC_FIRST + 0xff; msr++ )
++              msr <= MSR_X2APIC_LAST; msr++ )
+             vmx_set_msr_intercept(v, msr, VMX_MSR_RW);
+ 
+     vmx_update_secondary_exec_control(v);
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index b4a360723b14..f1b2cf5460c1 100644
+--- a/xen/include/asm-x86/msr-index.h
++++ b/xen/include/asm-x86/msr-index.h
+@@ -459,7 +459,7 @@
+ #define MSR_IA32_TSC_ADJUST 0x0000003b
+ 
+ #define MSR_X2APIC_FIRST 0x00000800
+-#define MSR_X2APIC_LAST 0x00000bff
++#define MSR_X2APIC_LAST 0x000008ff
+ 
+ #define MSR_X2APIC_TPR 0x00000808
+ #define MSR_X2APIC_PPR 0x0000080a
+--
+2.37.3
+
diff --git a/0033-x86-spec-ctrl-Use-IST-RSB-protection-for-SVM-systems.patch b/0033-x86-spec-ctrl-Use-IST-RSB-protection-for-SVM-systems.patch
new file mode 100644
index 0000000..46780c4
--- /dev/null
+++ b/0033-x86-spec-ctrl-Use-IST-RSB-protection-for-SVM-systems.patch
@@ -0,0 +1,54 @@
+From 8ae0b4d1331c14fb9e30a42987c0152c9b00f530 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 15 Aug 2022 15:40:05 +0200
+Subject: [PATCH 33/67] x86/spec-ctrl: Use IST RSB protection for !SVM systems
+
+There is a corner case where a VT-x guest which manages to reliably trigger
+non-fatal #MC's could evade the rogue RSB speculation protections that were
+supposed to be in place.
+
+This is a lack of defence in depth; Xen does not architecturally execute more
+RET than CALL instructions, so an attacker would have to locate a different
+gadget (e.g. SpectreRSB) first to execute a transient path of excess RET
+instructions.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: e570e8d520ab542d8d35666b95cb3a0125b7b110
+master date: 2022-08-05 12:16:24 +0100
+---
+ xen/arch/x86/spec_ctrl.c | 16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index f7b0251c42bc..ac73806eacd8 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -1279,8 +1279,24 @@ void __init init_speculation_mitigations(void)
+      * mappings.
+      */
+     if ( opt_rsb_hvm )
++    {
+         setup_force_cpu_cap(X86_FEATURE_SC_RSB_HVM);
+ 
++        /*
++         * For SVM, Xen's RSB safety actions are performed before STGI, so
++         * behave atomically with respect to IST sources.
++         *
++         * For VT-x, NMIs are atomic with VMExit (the NMI gets queued but not
++         * delivered) whereas other IST sources are not atomic. Specifically,
++         * #MC can hit ahead the RSB safety action in the vmexit path.
++         *
++         * Therefore, it is necessary for the IST logic to protect Xen against
++         * possible rogue RSB speculation.
++         */
++        if ( !cpu_has_svm )
++            default_spec_ctrl_flags |= SCF_ist_rsb;
++    }
++
+     ibpb_calculations();
+ 
+     /* Check whether Eager FPU should be enabled by default. */
+--
+2.37.3
+
diff --git a/0034-x86-Expose-more-MSR_ARCH_CAPS-to-hwdom.patch b/0034-x86-Expose-more-MSR_ARCH_CAPS-to-hwdom.patch
new file mode 100644
index 0000000..6a73c21
--- /dev/null
+++ b/0034-x86-Expose-more-MSR_ARCH_CAPS-to-hwdom.patch
@@ -0,0 +1,68 @@
+From 5efcae1eb30ff24e100954e00889a568c1745ea1 Mon Sep 17 00:00:00 2001
+From: Jason Andryuk <jandryuk@gmail.com>
+Date: Mon, 15 Aug 2022 15:40:47 +0200
+Subject: [PATCH 34/67] x86: Expose more MSR_ARCH_CAPS to hwdom
+
+commit e46474278a0e ("x86/intel: Expose MSR_ARCH_CAPS to dom0") started
+exposing MSR_ARCH_CAPS to dom0. More bits in MSR_ARCH_CAPS have since
+been defined, but they haven't been exposed. Update the list to allow
+them through.
+
+As one example, this allows a Linux Dom0 to know that it has the
+appropriate microcode via FB_CLEAR. Notably, and with the updated
+microcode, this changes dom0's
+/sys/devices/system/cpu/vulnerabilities/mmio_stale_data changes from:
+
+ "Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown"
+
+to:
+
+ "Mitigation: Clear CPU buffers; SMT Host state unknown"
+
+This exposes the MMIO Stale Data and Intel Branch History Injection
+(BHI) controls as well as the page size change MCE issue bit.
+
+Fixes: commit 2ebe8fe9b7e0 ("x86/spec-ctrl: Enumeration for MMIO Stale Data controls")
+Fixes: commit cea9ae062295 ("x86/spec-ctrl: Enumeration for new Intel BHI controls")
+Fixes: commit 59e89cdabc71 ("x86/vtx: Disable executable EPT superpages to work around CVE-2018-12207")
+Signed-off-by: Jason Andryuk <jandryuk@gmail.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: e83cd54611fec5b7a539fa1281a14319143490e6
+master date: 2022-08-09 16:35:25 +0100
+---
+ xen/arch/x86/msr.c              | 5 ++++-
+ xen/include/asm-x86/msr-index.h | 2 ++
+ 2 files changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
+index 0739d00e74f1..aa9face9aad3 100644
+--- a/xen/arch/x86/msr.c
++++ b/xen/arch/x86/msr.c
+@@ -145,7 +145,10 @@ int init_domain_msr_policy(struct domain *d)
+ 
+         mp->arch_caps.raw = val &
+             (ARCH_CAPS_RDCL_NO | ARCH_CAPS_IBRS_ALL | ARCH_CAPS_RSBA |
+-             ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO);
++             ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO | ARCH_CAPS_IF_PSCHANGE_MC_NO |
++             ARCH_CAPS_TAA_NO | ARCH_CAPS_SBDR_SSDP_NO | ARCH_CAPS_FBSDP_NO |
++             ARCH_CAPS_PSDP_NO | ARCH_CAPS_FB_CLEAR | ARCH_CAPS_RRSBA |
++             ARCH_CAPS_BHI_NO);
+     }
+ 
+     d->arch.msr = mp;
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index f1b2cf5460c1..49ca1f1845e6 100644
+--- a/xen/include/asm-x86/msr-index.h
++++ b/xen/include/asm-x86/msr-index.h
+@@ -64,6 +64,8 @@
+ #define ARCH_CAPS_PSDP_NO (_AC(1, ULL) << 15)
+ #define ARCH_CAPS_FB_CLEAR (_AC(1, ULL) << 17)
+ #define ARCH_CAPS_FB_CLEAR_CTRL (_AC(1, ULL) << 18)
++#define ARCH_CAPS_RRSBA (_AC(1, ULL) << 19)
++#define ARCH_CAPS_BHI_NO (_AC(1, ULL) << 20)
+ 
+ #define MSR_FLUSH_CMD 0x0000010b
+ #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
+--
+2.37.3
+
diff --git a/0035-xen-sched-setup-dom0-vCPUs-affinity-only-once.patch b/0035-xen-sched-setup-dom0-vCPUs-affinity-only-once.patch
new file mode 100644
index 0000000..0dfb3b4
--- /dev/null
+++ b/0035-xen-sched-setup-dom0-vCPUs-affinity-only-once.patch
@@ -0,0 +1,123 @@
+From 1e31848cdd8d2ff3cb76f364f04f9771f9b3a8b1 Mon Sep 17 00:00:00 2001
+From: Dario Faggioli <dfaggioli@suse.com>
+Date: Mon, 15 Aug 2022 15:41:25 +0200
+Subject: [PATCH 35/67] xen/sched: setup dom0 vCPUs affinity only once
+
+Right now, affinity for dom0 vCPUs is setup in two steps. This is a
+problem as, at least in Credit2, unit_insert() sees and uses the
+"intermediate" affinity, and place the vCPUs on CPUs where they cannot
+be run. And this in turn results in boot hangs, if the "dom0_nodes"
+parameter is used.
+
+Fix this by setting up the affinity properly once and for all, in
+sched_init_vcpu() called by create_vcpu().
+
+Note that, unless a soft-affinity is explicitly specified for dom0 (by
+using the relaxed mode of "dom0_nodes") we set it to the default, which
+is all CPUs, instead of computing it basing on hard affinity (if any).
+This is because hard and soft affinity should be considered as
+independent user controlled properties. In fact, if we dor derive dom0's
+soft-affinity from its boot-time hard-affinity, such computed value will
+continue to be used even if later the user changes the hard-affinity.
+And this could result in the vCPUs behaving differently than what the
+user wanted and expects.
+
+Fixes: dafd936dddbd ("Make credit2 the default scheduler")
+Reported-by: Olaf Hering <ohering@suse.de>
+Signed-off-by: Dario Faggioli <dfaggioli@suse.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: c79e4d209be3ed2a6b8e97c35944786ed2a66b94
+master date: 2022-08-11 11:46:22 +0200
+---
+ xen/common/sched/core.c | 63 +++++++++++++++++++++++++----------------
+ 1 file changed, 39 insertions(+), 24 deletions(-)
+
+diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
+index 8f4b1ca10d1c..f07bd2681fcb 100644
+--- a/xen/common/sched/core.c
++++ b/xen/common/sched/core.c
+@@ -571,12 +571,46 @@ int sched_init_vcpu(struct vcpu *v)
+         return 1;
+     }
+ 
+-    /*
+-     * Initialize affinity settings. The idler, and potentially
+-     * domain-0 VCPUs, are pinned onto their respective physical CPUs.
+-     */
+-    if ( is_idle_domain(d) || (is_hardware_domain(d) && opt_dom0_vcpus_pin) )
++    if ( is_idle_domain(d) )
++    {
++        /* Idle vCPUs are always pinned onto their respective pCPUs */
+         sched_set_affinity(unit, cpumask_of(processor), &cpumask_all);
++    }
++    else if ( pv_shim && v->vcpu_id == 0 )
++    {
++        /*
++         * PV-shim: vcpus are pinned 1:1. Initially only 1 cpu is online,
++         * others will be dealt with when onlining them. This avoids pinning
++         * a vcpu to a not yet online cpu here.
++         */
++        sched_set_affinity(unit, cpumask_of(0), cpumask_of(0));
++    }
++    else if ( is_hardware_domain(d) && opt_dom0_vcpus_pin )
++    {
++        /*
++         * If dom0_vcpus_pin is specified, dom0 vCPUs are pinned 1:1 to
++         * their respective pCPUs too.
++         */
++        sched_set_affinity(unit, cpumask_of(processor), &cpumask_all);
++    }
++#ifdef CONFIG_X86
++    else if ( d->domain_id == 0 )
++    {
++        /*
++         * In absence of dom0_vcpus_pin instead, the hard and soft affinity of
++         * dom0 is controlled by the (x86 only) dom0_nodes parameter. At this
++         * point it has been parsed and decoded into the dom0_cpus mask.
++         *
++         * Note that we always honor what user explicitly requested, for both
++         * hard and soft affinity, without doing any dynamic computation of
++         * either of them.
++         */
++        if ( !dom0_affinity_relaxed )
++            sched_set_affinity(unit, &dom0_cpus, &cpumask_all);
++        else
++            sched_set_affinity(unit, &cpumask_all, &dom0_cpus);
++    }
++#endif
+     else
+         sched_set_affinity(unit, &cpumask_all, &cpumask_all);
+ 
+@@ -3386,29 +3420,10 @@ void wait(void)
+ void __init sched_setup_dom0_vcpus(struct domain *d)
+ {
+     unsigned int i;
+-    struct sched_unit *unit;
+ 
+     for ( i = 1; i < d->max_vcpus; i++ )
+         vcpu_create(d, i);
+ 
+-    /*
+-     * PV-shim: vcpus are pinned 1:1.
+-     * Initially only 1 cpu is online, others will be dealt with when
+-     * onlining them. This avoids pinning a vcpu to a not yet online cpu here.
+-     */
+-    if ( pv_shim )
+-        sched_set_affinity(d->vcpu[0]->sched_unit,
+-                           cpumask_of(0), cpumask_of(0));
+-    else
+-    {
+-        for_each_sched_unit ( d, unit )
+-        {
+-            if ( !opt_dom0_vcpus_pin && !dom0_affinity_relaxed )
+-                sched_set_affinity(unit, &dom0_cpus, NULL);
+-            sched_set_affinity(unit, NULL, &dom0_cpus);
+-        }
+-    }
+-
+     domain_update_node_affinity(d);
+ }
+ #endif
+--
+2.37.3
+
diff --git a/0036-tools-libxl-Replace-deprecated-sdl-option-on-QEMU-co.patch b/0036-tools-libxl-Replace-deprecated-sdl-option-on-QEMU-co.patch
new file mode 100644
index 0000000..1637236
--- /dev/null
+++ b/0036-tools-libxl-Replace-deprecated-sdl-option-on-QEMU-co.patch
@@ -0,0 +1,38 @@
+From c373ad3d084614a93c55e25dc20e70ffc7574971 Mon Sep 17 00:00:00 2001
+From: Anthony PERARD <anthony.perard@citrix.com>
+Date: Mon, 15 Aug 2022 15:42:09 +0200
+Subject: [PATCH 36/67] tools/libxl: Replace deprecated -sdl option on QEMU
+ command line
+
+"-sdl" is deprecated upstream since 6695e4c0fd9e ("softmmu/vl:
+Deprecate the -sdl and -curses option"), QEMU v6.2, and the option is
+removed by 707d93d4abc6 ("ui: Remove deprecated options "-sdl" and
+"-curses""), in upcoming QEMU v7.1.
+
+Instead, use "-display sdl", available since 1472a95bab1e ("Introduce
+-display argument"), before QEMU v1.0.
+
+Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
+Reviewed-by: Jason Andryuk <jandryuk@gmail.com>
+master commit: 41fcb3af8ad6d4c9f65a9d72798e6d18afec55ac
+master date: 2022-08-11 11:47:11 +0200
+---
+ tools/libs/light/libxl_dm.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c
+index 24f6e73b0a77..ae5f35e0c3fd 100644
+--- a/tools/libs/light/libxl_dm.c
++++ b/tools/libs/light/libxl_dm.c
+@@ -1349,7 +1349,7 @@ static int libxl__build_device_model_args_new(libxl__gc *gc,
+         flexarray_append_pair(dm_args, "-display", "none");
+ 
+         if (sdl && !is_stubdom) {
+-            flexarray_append(dm_args, "-sdl");
++            flexarray_append_pair(dm_args, "-display", "sdl");
+             if (sdl->display)
+                 flexarray_append_pair(dm_envs, "DISPLAY", sdl->display);
+             if (sdl->xauthority)
+--
+2.37.3
+
diff --git a/0037-x86-spec-ctrl-Enumeration-for-PBRSB_NO.patch b/0037-x86-spec-ctrl-Enumeration-for-PBRSB_NO.patch
new file mode 100644
index 0000000..d27766b
--- /dev/null
+++ b/0037-x86-spec-ctrl-Enumeration-for-PBRSB_NO.patch
@@ -0,0 +1,67 @@
+From fba0c22e79922085c46527eb1391123aadfb24d1 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 15 Aug 2022 15:42:31 +0200
+Subject: [PATCH 37/67] x86/spec-ctrl: Enumeration for PBRSB_NO
+
+The PBRSB_NO bit indicates that the CPU is not vulnerable to the Post-Barrier
+RSB speculative vulnerability.
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: b874e47eb13feb75be3ee7b5dc4ae9c97d80d774 +master date: 2022-08-11 16:19:50 +0100 +--- + xen/arch/x86/msr.c | 2 +- + xen/arch/x86/spec_ctrl.c | 3 ++- + xen/include/asm-x86/msr-index.h | 1 + + 3 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c +index aa9face9aad3..9bced8d36caa 100644 +--- a/xen/arch/x86/msr.c ++++ b/xen/arch/x86/msr.c +@@ -148,7 +148,7 @@ int init_domain_msr_policy(struct domain *d) + ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO | ARCH_CAPS_IF_PSCHANGE_MC_NO | + ARCH_CAPS_TAA_NO | ARCH_CAPS_SBDR_SSDP_NO | ARCH_CAPS_FBSDP_NO | + ARCH_CAPS_PSDP_NO | ARCH_CAPS_FB_CLEAR | ARCH_CAPS_RRSBA | +- ARCH_CAPS_BHI_NO); ++ ARCH_CAPS_BHI_NO | ARCH_CAPS_PBRSB_NO); + } + + d->arch.msr = mp; +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index ac73806eacd8..3ff602bd0281 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -419,7 +419,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + * Hardware read-only information, stating immunity to certain issues, or + * suggestions of which mitigation to use. + */ +- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", ++ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", + (caps & ARCH_CAPS_IBRS_ALL) ? " IBRS_ALL" : "", + (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", +@@ -431,6 +431,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + (caps & ARCH_CAPS_SBDR_SSDP_NO) ? " SBDR_SSDP_NO" : "", + (caps & ARCH_CAPS_FBSDP_NO) ? " FBSDP_NO" : "", + (caps & ARCH_CAPS_PSDP_NO) ? " PSDP_NO" : "", ++ (caps & ARCH_CAPS_PBRSB_NO) ? " PBRSB_NO" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "", + (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", +diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h +index 49ca1f1845e6..5a830f76a8d4 100644 +--- a/xen/include/asm-x86/msr-index.h ++++ b/xen/include/asm-x86/msr-index.h +@@ -66,6 +66,7 @@ + #define ARCH_CAPS_FB_CLEAR_CTRL (_AC(1, ULL) << 18) + #define ARCH_CAPS_RRSBA (_AC(1, ULL) << 19) + #define ARCH_CAPS_BHI_NO (_AC(1, ULL) << 20) ++#define ARCH_CAPS_PBRSB_NO (_AC(1, ULL) << 24) + + #define MSR_FLUSH_CMD 0x0000010b + #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) +-- +2.37.3 + diff --git a/0038-x86-amd-only-call-setup_force_cpu_cap-for-boot-CPU.patch b/0038-x86-amd-only-call-setup_force_cpu_cap-for-boot-CPU.patch new file mode 100644 index 0000000..e0e0f87 --- /dev/null +++ b/0038-x86-amd-only-call-setup_force_cpu_cap-for-boot-CPU.patch @@ -0,0 +1,33 @@ +From 104a54a307b08945365faf6d285cd5a02f94a80f Mon Sep 17 00:00:00 2001 +From: Ross Lagerwall <ross.lagerwall@citrix.com> +Date: Mon, 15 Aug 2022 15:43:08 +0200 +Subject: [PATCH 38/67] x86/amd: only call setup_force_cpu_cap for boot CPU + +This should only be called for the boot CPU to avoid calling _init code +after it has been unloaded. 
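+
+Condensed, the fix is a boot-CPU guard. The c_init hook runs on every
+CPU, including ones brought online after .init text has been freed,
+hence the check (a sketch of the resulting shape; the real one-line
+hunk follows):
+
+    static void init_amd(struct cpuinfo_x86 *c)
+    {
+        /* Only act while bringing up the boot processor; APs would
+         * otherwise reach setup_force_cpu_cap() in unloaded code. */
+        if ( c == &boot_cpu_data && !cpu_has_clflushopt )
+            setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE);
+    }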
+ +Fixes: 062868a5a8b4 ("x86/amd: Work around CLFLUSH ordering on older parts") +Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 31b41ce858c8bd5159212d40969f8e0b7124bbf0 +master date: 2022-08-11 17:44:26 +0200 +--- + xen/arch/x86/cpu/amd.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c +index 60dbe61a61ca..a8d2fb8a1590 100644 +--- a/xen/arch/x86/cpu/amd.c ++++ b/xen/arch/x86/cpu/amd.c +@@ -820,7 +820,7 @@ static void init_amd(struct cpuinfo_x86 *c) + * everything, including reads and writes to address, and + * LFENCE/SFENCE instructions. + */ +- if (!cpu_has_clflushopt) ++ if (c == &boot_cpu_data && !cpu_has_clflushopt) + setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE); + + switch(c->x86) +-- +2.37.3 + diff --git a/0039-build-x86-suppress-GNU-ld-2.39-warning-about-RWX-loa.patch b/0039-build-x86-suppress-GNU-ld-2.39-warning-about-RWX-loa.patch new file mode 100644 index 0000000..50d83b6 --- /dev/null +++ b/0039-build-x86-suppress-GNU-ld-2.39-warning-about-RWX-loa.patch @@ -0,0 +1,38 @@ +From a075900cf768fe45f270b6f1d09c4e99281da142 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Mon, 15 Aug 2022 15:43:56 +0200 +Subject: [PATCH 39/67] build/x86: suppress GNU ld 2.39 warning about RWX load + segments + +Commit 68f5aac012b9 ("build: suppress future GNU ld warning about RWX +load segments") didn't quite cover all the cases: Apparently I missed +ones in the building of 32-bit helper objects because of only looking at +incremental builds (where those wouldn't normally be re-built). Clone +the workaround there to the specific Makefile in question. + +Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 3eb1865ae305772b558757904d81951e31de43de +master date: 2022-08-11 17:45:12 +0200 +--- + xen/arch/x86/boot/build32.mk | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/xen/arch/x86/boot/build32.mk b/xen/arch/x86/boot/build32.mk +index e90680cd9f52..d2fae5cf9eee 100644 +--- a/xen/arch/x86/boot/build32.mk ++++ b/xen/arch/x86/boot/build32.mk +@@ -8,6 +8,9 @@ CFLAGS += -Werror -fno-builtin -g0 -msoft-float + CFLAGS += -I$(BASEDIR)/include + CFLAGS := $(filter-out -flto,$(CFLAGS)) + ++LDFLAGS_DIRECT-$(shell $(LD) -v --warn-rwx-segments >/dev/null 2>&1 && echo y) := --no-warn-rwx-segments ++LDFLAGS_DIRECT += $(LDFLAGS_DIRECT-y) ++ + # NB. awk invocation is a portable alternative to 'head -n -1' + %.S: %.bin + (od -v -t x $< | tr -s ' ' | awk 'NR > 1 {print s} {s=$$0}' | \ +-- +2.37.3 + diff --git a/0040-PCI-simplify-and-thus-correct-pci_get_pdev-_by_domai.patch b/0040-PCI-simplify-and-thus-correct-pci_get_pdev-_by_domai.patch new file mode 100644 index 0000000..c29e5ac --- /dev/null +++ b/0040-PCI-simplify-and-thus-correct-pci_get_pdev-_by_domai.patch @@ -0,0 +1,153 @@ +From 9acedc3c58c31930737edbe212f2ccf437a0b757 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Mon, 15 Aug 2022 15:44:23 +0200 +Subject: [PATCH 40/67] PCI: simplify (and thus correct) + pci_get_pdev{,_by_domain}() + +The last "wildcard" use of either function went away with f591755823a7 +("IOMMU/PCI: don't let domain cleanup continue when device de-assignment +failed"). Don't allow them to be called this way anymore. 
Besides +simplifying the code this also fixes two bugs: + +1) When seg != -1, the outer loops should have been terminated after the + first iteration, or else a device with the same BDF but on another + segment could be found / returned. + +Reported-by: Rahul Singh <rahul.singh@arm.com> + +2) When seg == -1 calling get_pseg() is bogus. The function (taking a + u16) would look for segment 0xffff, which might exist. If it exists, + we might then find / return a wrong device. + +In pci_get_pdev_by_domain() also switch from using the per-segment list +to using the per-domain one, with the exception of the hardware domain +(see the code comment there). + +While there also constify "pseg" and drop "pdev"'s already previously +unnecessary initializer. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Rahul Singh <rahul.singh@arm.com> +Tested-by: Rahul Singh <rahul.singh@arm.com> +master commit: 8cf6e0738906fc269af40135ed82a07815dd3b9c +master date: 2022-08-12 08:34:33 +0200 +--- + xen/drivers/passthrough/pci.c | 61 +++++++++++++++-------------------- + xen/include/xen/pci.h | 6 ++-- + 2 files changed, 29 insertions(+), 38 deletions(-) + +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index bbacbe41dac4..9b81b941c8bb 100644 +--- a/xen/drivers/passthrough/pci.c ++++ b/xen/drivers/passthrough/pci.c +@@ -528,30 +528,19 @@ int __init pci_ro_device(int seg, int bus, int devfn) + return 0; + } + +-struct pci_dev *pci_get_pdev(int seg, int bus, int devfn) ++struct pci_dev *pci_get_pdev(uint16_t seg, uint8_t bus, uint8_t devfn) + { +- struct pci_seg *pseg = get_pseg(seg); +- struct pci_dev *pdev = NULL; ++ const struct pci_seg *pseg = get_pseg(seg); ++ struct pci_dev *pdev; + + ASSERT(pcidevs_locked()); +- ASSERT(seg != -1 || bus == -1); +- ASSERT(bus != -1 || devfn == -1); + + if ( !pseg ) +- { +- if ( seg == -1 ) +- radix_tree_gang_lookup(&pci_segments, (void **)&pseg, 0, 1); +- if ( !pseg ) +- return NULL; +- } ++ return NULL; + +- do { +- list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) +- if ( (pdev->bus == bus || bus == -1) && +- (pdev->devfn == devfn || devfn == -1) ) +- return pdev; +- } while ( radix_tree_gang_lookup(&pci_segments, (void **)&pseg, +- pseg->nr + 1, 1) ); ++ list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) ++ if ( pdev->bus == bus && pdev->devfn == devfn ) ++ return pdev; + + return NULL; + } +@@ -577,31 +566,33 @@ struct pci_dev *pci_get_real_pdev(int seg, int bus, int devfn) + return pdev; + } + +-struct pci_dev *pci_get_pdev_by_domain(const struct domain *d, int seg, +- int bus, int devfn) ++struct pci_dev *pci_get_pdev_by_domain(const struct domain *d, uint16_t seg, ++ uint8_t bus, uint8_t devfn) + { +- struct pci_seg *pseg = get_pseg(seg); +- struct pci_dev *pdev = NULL; ++ struct pci_dev *pdev; + +- ASSERT(seg != -1 || bus == -1); +- ASSERT(bus != -1 || devfn == -1); +- +- if ( !pseg ) ++ /* ++ * The hardware domain owns the majority of the devices in the system. ++ * When there are multiple segments, traversing the per-segment list is ++ * likely going to be faster, whereas for a single segment the difference ++ * shouldn't be that large. 
++ */
++ if ( is_hardware_domain(d) )
+ {
+- if ( seg == -1 )
+- radix_tree_gang_lookup(&pci_segments, (void **)&pseg, 0, 1);
++ const struct pci_seg *pseg = get_pseg(seg);
++
+ if ( !pseg )
+ return NULL;
+- }
+
+- do {
+ list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
+- if ( (pdev->bus == bus || bus == -1) &&
+- (pdev->devfn == devfn || devfn == -1) &&
+- (pdev->domain == d) )
++ if ( pdev->bus == bus && pdev->devfn == devfn &&
++ pdev->domain == d )
++ return pdev;
++ }
++ else
++ list_for_each_entry ( pdev, &d->pdev_list, domain_list )
++ if ( pdev->bus == bus && pdev->devfn == devfn )
+ return pdev;
+- } while ( radix_tree_gang_lookup(&pci_segments, (void **)&pseg,
+- pseg->nr + 1, 1) );
+
+ return NULL;
+ }
+diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
+index 8e3d4d94543a..cd238ae852b0 100644
+--- a/xen/include/xen/pci.h
++++ b/xen/include/xen/pci.h
+@@ -166,10 +166,10 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
+ int pci_remove_device(u16 seg, u8 bus, u8 devfn);
+ int pci_ro_device(int seg, int bus, int devfn);
+ int pci_hide_device(unsigned int seg, unsigned int bus, unsigned int devfn);
+-struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);
++struct pci_dev *pci_get_pdev(uint16_t seg, uint8_t bus, uint8_t devfn);
+ struct pci_dev *pci_get_real_pdev(int seg, int bus, int devfn);
+-struct pci_dev *pci_get_pdev_by_domain(const struct domain *, int seg,
+- int bus, int devfn);
++struct pci_dev *pci_get_pdev_by_domain(const struct domain *, uint16_t seg,
++ uint8_t bus, uint8_t devfn);
+ void pci_check_disable_device(u16 seg, u8 bus, u8 devfn);
+
+ uint8_t pci_conf_read8(pci_sbdf_t sbdf, unsigned int reg);
+--
+2.37.3
+
diff --git a/0041-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch b/0041-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch
new file mode 100644
index 0000000..3fa0e43
--- /dev/null
+++ b/0041-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch
@@ -0,0 +1,62 @@
+From 09fc590c15773c2471946a78740c6b02e8c34a45 Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Tue, 11 Oct 2022 15:05:53 +0200
+Subject: [PATCH 41/67] xen/arm: p2m: Prevent adding mapping when domain is
+ dying
+
+During the domain destroy process, the domain will still be accessible
+until it is fully destroyed. The same is true for the P2M, because we
+don't bail out early if is_dying is non-zero. If a domain has permission
+to modify the other domain's P2M (i.e. dom0, or a stubdomain), then
+foreign mappings can be added past relinquish_p2m_mapping().
+
+Therefore, we need to prevent mappings from being added while the
+domain is dying. This commit does so by adding a d->is_dying check to
+p2m_set_entry(). It also enhances the check in relinquish_p2m_mapping()
+to make sure that no mappings can be added in the P2M after the P2M
+lock is released.
+
+This is part of CVE-2022-33746 / XSA-410.
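+
+Distilled, the added guard is (sketch; the complete hunks follow):
+
+    /* p2m_set_entry(): refuse new (e.g. foreign) mappings once the
+     * domain has started dying, as any references they take would
+     * not be dropped by relinquish_p2m_mapping() any more. */
+    if ( unlikely(p2m->domain->is_dying) )
+        return -ENOMEM;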
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Tested-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
+master commit: 3ebe773293e3b945460a3d6f54f3b91915397bab
+master date: 2022-10-11 14:20:18 +0200
+---
+ xen/arch/arm/p2m.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
+index 2ddd06801a82..8398251c518b 100644
+--- a/xen/arch/arm/p2m.c
++++ b/xen/arch/arm/p2m.c
+@@ -1093,6 +1093,15 @@ int p2m_set_entry(struct p2m_domain *p2m,
+ {
+ int rc = 0;
+
++ /*
++ * Any reference taken by the P2M mappings (e.g. foreign mapping) will
++ * be dropped in relinquish_p2m_mapping(). As the P2M will still
++ * be accessible after, we need to prevent mapping to be added when the
++ * domain is dying.
++ */
++ if ( unlikely(p2m->domain->is_dying) )
++ return -ENOMEM;
++
+ while ( nr )
+ {
+ unsigned long mask;
+@@ -1613,6 +1622,8 @@ int relinquish_p2m_mapping(struct domain *d)
+ unsigned int order;
+ gfn_t start, end;
+
++ BUG_ON(!d->is_dying);
++ /* No mappings can be added in the P2M after the P2M lock is released. */
+ p2m_write_lock(p2m);
+
+ start = p2m->lowest_mapped_gfn;
+--
+2.37.3
+
diff --git a/0042-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch b/0042-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch
new file mode 100644
index 0000000..8217a06
--- /dev/null
+++ b/0042-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch
@@ -0,0 +1,167 @@
+From 0d805f9fba4bc155d15047685024f7d842e925e4 Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Tue, 11 Oct 2022 15:06:36 +0200
+Subject: [PATCH 42/67] xen/arm: p2m: Handle preemption when freeing
+ intermediate page tables
+
+At the moment the P2M page tables will be freed when the domain structure
+is freed without any preemption. As the P2M is quite large, iterating
+through this may take more time than is reasonable without intermediate
+preemption (to run softirqs and perhaps the scheduler).
+
+Split p2m_teardown() in two parts: one preemptible and called when
+relinquishing the resources, the other one non-preemptible and called
+when freeing the domain structure.
+
+As we are now freeing the P2M pages early, we also need to prevent
+further allocation if someone calls p2m_set_entry() past p2m_teardown()
+(I wasn't able to prove this will never happen). This is done by the
+domain->is_dying check added to p2m_set_entry() by the previous patch.
+
+Similarly, we want to make sure that no-one can access the freed
+pages. Therefore the root is cleared before freeing pages.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Tested-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
+master commit: 3202084566bba0ef0c45caf8c24302f83d92f9c8
+master date: 2022-10-11 14:20:56 +0200
+---
+ xen/arch/arm/domain.c | 10 +++++++--
+ xen/arch/arm/p2m.c | 47 ++++++++++++++++++++++++++++++++++++---
+ xen/include/asm-arm/p2m.h | 13 +++++++++--
+ 3 files changed, 63 insertions(+), 7 deletions(-)
+
+diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
+index 5eaf4c718ec3..223ec9694df1 100644
+--- a/xen/arch/arm/domain.c
++++ b/xen/arch/arm/domain.c
+@@ -779,10 +779,10 @@ fail:
+ void arch_domain_destroy(struct domain *d)
+ {
+ /* IOMMU page table is shared with P2M, always call
+- * iommu_domain_destroy() before p2m_teardown().
++ * iommu_domain_destroy() before p2m_final_teardown(). + */ + iommu_domain_destroy(d); +- p2m_teardown(d); ++ p2m_final_teardown(d); + domain_vgic_free(d); + domain_vuart_free(d); + free_xenheap_page(d->shared_info); +@@ -984,6 +984,7 @@ enum { + PROG_xen, + PROG_page, + PROG_mapping, ++ PROG_p2m, + PROG_done, + }; + +@@ -1038,6 +1039,11 @@ int domain_relinquish_resources(struct domain *d) + if ( ret ) + return ret; + ++ PROGRESS(p2m): ++ ret = p2m_teardown(d); ++ if ( ret ) ++ return ret; ++ + PROGRESS(done): + break; + +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index 8398251c518b..4ad3e0606e9c 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -1530,17 +1530,58 @@ static void p2m_free_vmid(struct domain *d) + spin_unlock(&vmid_alloc_lock); + } + +-void p2m_teardown(struct domain *d) ++int p2m_teardown(struct domain *d) + { + struct p2m_domain *p2m = p2m_get_hostp2m(d); ++ unsigned long count = 0; + struct page_info *pg; ++ unsigned int i; ++ int rc = 0; ++ ++ p2m_write_lock(p2m); ++ ++ /* ++ * We are about to free the intermediate page-tables, so clear the ++ * root to prevent any walk to use them. ++ */ ++ for ( i = 0; i < P2M_ROOT_PAGES; i++ ) ++ clear_and_clean_page(p2m->root + i); ++ ++ /* ++ * The domain will not be scheduled anymore, so in theory we should ++ * not need to flush the TLBs. Do it for safety purpose. ++ * ++ * Note that all the devices have already been de-assigned. So we don't ++ * need to flush the IOMMU TLB here. ++ */ ++ p2m_force_tlb_flush_sync(p2m); ++ ++ while ( (pg = page_list_remove_head(&p2m->pages)) ) ++ { ++ free_domheap_page(pg); ++ count++; ++ /* Arbitrarily preempt every 512 iterations */ ++ if ( !(count % 512) && hypercall_preempt_check() ) ++ { ++ rc = -ERESTART; ++ break; ++ } ++ } ++ ++ p2m_write_unlock(p2m); ++ ++ return rc; ++} ++ ++void p2m_final_teardown(struct domain *d) ++{ ++ struct p2m_domain *p2m = p2m_get_hostp2m(d); + + /* p2m not actually initialized */ + if ( !p2m->domain ) + return; + +- while ( (pg = page_list_remove_head(&p2m->pages)) ) +- free_domheap_page(pg); ++ ASSERT(page_list_empty(&p2m->pages)); + + if ( p2m->root ) + free_domheap_pages(p2m->root, P2M_ROOT_ORDER); +diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h +index 6a2108398fd7..3a2d51b35d71 100644 +--- a/xen/include/asm-arm/p2m.h ++++ b/xen/include/asm-arm/p2m.h +@@ -192,8 +192,17 @@ void setup_virt_paging(void); + /* Init the datastructures for later use by the p2m code */ + int p2m_init(struct domain *d); + +-/* Return all the p2m resources to Xen. */ +-void p2m_teardown(struct domain *d); ++/* ++ * The P2M resources are freed in two parts: ++ * - p2m_teardown() will be called when relinquish the resources. It ++ * will free large resources (e.g. intermediate page-tables) that ++ * requires preemption. ++ * - p2m_final_teardown() will be called when domain struct is been ++ * freed. This *cannot* be preempted and therefore one small ++ * resources should be freed here. 
++ */ ++int p2m_teardown(struct domain *d); ++void p2m_final_teardown(struct domain *d); + + /* + * Remove mapping refcount on each mapping page in the p2m +-- +2.37.3 + diff --git a/0043-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch b/0043-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch new file mode 100644 index 0000000..f3f7e3a --- /dev/null +++ b/0043-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch @@ -0,0 +1,138 @@ +From 0f3eab90f327210d91e8e31a769376f286e8819a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Tue, 11 Oct 2022 15:07:25 +0200 +Subject: [PATCH 43/67] x86/p2m: add option to skip root pagetable removal in + p2m_teardown() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add a new parameter to p2m_teardown() in order to select whether the +root page table should also be freed. Note that all users are +adjusted to pass the parameter to remove the root page tables, so +behavior is not modified. + +No functional change intended. + +This is part of CVE-2022-33746 / XSA-410. + +Suggested-by: Julien Grall <julien@xen.org> +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> +master commit: 1df52a270225527ae27bfa2fc40347bf93b78357 +master date: 2022-10-11 14:21:23 +0200 +--- + xen/arch/x86/mm/hap/hap.c | 6 +++--- + xen/arch/x86/mm/p2m.c | 20 ++++++++++++++++---- + xen/arch/x86/mm/shadow/common.c | 4 ++-- + xen/include/asm-x86/p2m.h | 2 +- + 4 files changed, 22 insertions(+), 10 deletions(-) + +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index 47a7487fa7a3..a8f5a19da917 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -541,18 +541,18 @@ void hap_final_teardown(struct domain *d) + } + + for ( i = 0; i < MAX_ALTP2M; i++ ) +- p2m_teardown(d->arch.altp2m_p2m[i]); ++ p2m_teardown(d->arch.altp2m_p2m[i], true); + } + + /* Destroy nestedp2m's first */ + for (i = 0; i < MAX_NESTEDP2M; i++) { +- p2m_teardown(d->arch.nested_p2m[i]); ++ p2m_teardown(d->arch.nested_p2m[i], true); + } + + if ( d->arch.paging.hap.total_pages != 0 ) + hap_teardown(d, NULL); + +- p2m_teardown(p2m_get_hostp2m(d)); ++ p2m_teardown(p2m_get_hostp2m(d), true); + /* Free any memory that the p2m teardown released */ + paging_lock(d); + hap_set_allocation(d, 0, NULL); +diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c +index 85681dee2623..8ba73082c1bf 100644 +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -741,11 +741,11 @@ int p2m_alloc_table(struct p2m_domain *p2m) + * hvm fixme: when adding support for pvh non-hardware domains, this path must + * cleanup any foreign p2m types (release refcnts on them). + */ +-void p2m_teardown(struct p2m_domain *p2m) ++void p2m_teardown(struct p2m_domain *p2m, bool remove_root) + /* Return all the p2m pages to Xen. 
+ * We know we don't have any extra mappings to these pages */ + { +- struct page_info *pg; ++ struct page_info *pg, *root_pg = NULL; + struct domain *d; + + if (p2m == NULL) +@@ -755,10 +755,22 @@ void p2m_teardown(struct p2m_domain *p2m) + + p2m_lock(p2m); + ASSERT(atomic_read(&d->shr_pages) == 0); +- p2m->phys_table = pagetable_null(); ++ ++ if ( remove_root ) ++ p2m->phys_table = pagetable_null(); ++ else if ( !pagetable_is_null(p2m->phys_table) ) ++ { ++ root_pg = pagetable_get_page(p2m->phys_table); ++ clear_domain_page(pagetable_get_mfn(p2m->phys_table)); ++ } + + while ( (pg = page_list_remove_head(&p2m->pages)) ) +- d->arch.paging.free_page(d, pg); ++ if ( pg != root_pg ) ++ d->arch.paging.free_page(d, pg); ++ ++ if ( root_pg ) ++ page_list_add(root_pg, &p2m->pages); ++ + p2m_unlock(p2m); + } + +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index 4a8882430b3f..abe6d4334382 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -2768,7 +2768,7 @@ int shadow_enable(struct domain *d, u32 mode) + paging_unlock(d); + out_unlocked: + if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) +- p2m_teardown(p2m); ++ p2m_teardown(p2m, true); + if ( rv != 0 && pg != NULL ) + { + pg->count_info &= ~PGC_count_mask; +@@ -2933,7 +2933,7 @@ void shadow_final_teardown(struct domain *d) + shadow_teardown(d, NULL); + + /* It is now safe to pull down the p2m map. */ +- p2m_teardown(p2m_get_hostp2m(d)); ++ p2m_teardown(p2m_get_hostp2m(d), true); + /* Free any shadow memory that the p2m teardown released */ + paging_lock(d); + shadow_set_allocation(d, 0, NULL); +diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h +index 46e8b94a49df..46eb51d44cf5 100644 +--- a/xen/include/asm-x86/p2m.h ++++ b/xen/include/asm-x86/p2m.h +@@ -619,7 +619,7 @@ int p2m_init(struct domain *d); + int p2m_alloc_table(struct p2m_domain *p2m); + + /* Return all the p2m resources to Xen. */ +-void p2m_teardown(struct p2m_domain *p2m); ++void p2m_teardown(struct p2m_domain *p2m, bool remove_root); + void p2m_final_teardown(struct domain *d); + + /* Add a page to a domain's p2m table */ +-- +2.37.3 + diff --git a/0044-x86-HAP-adjust-monitor-table-related-error-handling.patch b/0044-x86-HAP-adjust-monitor-table-related-error-handling.patch new file mode 100644 index 0000000..39db626 --- /dev/null +++ b/0044-x86-HAP-adjust-monitor-table-related-error-handling.patch @@ -0,0 +1,77 @@ +From d24a10a91d46a56e1d406239643ec651a31033d4 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 11 Oct 2022 15:07:42 +0200 +Subject: [PATCH 44/67] x86/HAP: adjust monitor table related error handling +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +hap_make_monitor_table() will return INVALID_MFN if it encounters an +error condition, but hap_update_paging_modes() wasn’t handling this +value, resulting in an inappropriate value being stored in +monitor_table. This would subsequently misguide at least +hap_vcpu_teardown(). Avoid this by bailing early. + +Further, when a domain has/was already crashed or (perhaps less +important as there's no such path known to lead here) is already dying, +avoid calling domain_crash() on it again - that's at best confusing. + +This is part of CVE-2022-33746 / XSA-410. 
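+
+The "crash only once" guard used on the error path, in isolation
+(sketch; SHUTDOWN_crash comes from public/sched.h):
+
+    if ( !d->is_dying &&
+         (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
+    {
+        /* First failure for this domain: report and crash it. A
+         * repeat report on an already crashed domain is just noise. */
+        printk(XENLOG_G_ERR "%pd: out of memory building monitor pagetable\n",
+               d);
+        domain_crash(d);
+    }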
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 5b44a61180f4f2e4f490a28400c884dd357ff45d +master date: 2022-10-11 14:21:56 +0200 +--- + xen/arch/x86/mm/hap/hap.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index a8f5a19da917..d75dc2b9ed3d 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -39,6 +39,7 @@ + #include <asm/domain.h> + #include <xen/numa.h> + #include <asm/hvm/nestedhvm.h> ++#include <public/sched.h> + + #include "private.h" + +@@ -405,8 +406,13 @@ static mfn_t hap_make_monitor_table(struct vcpu *v) + return m4mfn; + + oom: +- printk(XENLOG_G_ERR "out of memory building monitor pagetable\n"); +- domain_crash(d); ++ if ( !d->is_dying && ++ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) ++ { ++ printk(XENLOG_G_ERR "%pd: out of memory building monitor pagetable\n", ++ d); ++ domain_crash(d); ++ } + return INVALID_MFN; + } + +@@ -766,6 +772,9 @@ static void hap_update_paging_modes(struct vcpu *v) + if ( pagetable_is_null(v->arch.hvm.monitor_table) ) + { + mfn_t mmfn = hap_make_monitor_table(v); ++ ++ if ( mfn_eq(mmfn, INVALID_MFN) ) ++ goto unlock; + v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn); + make_cr3(v, mmfn); + hvm_update_host_cr3(v); +@@ -774,6 +783,7 @@ static void hap_update_paging_modes(struct vcpu *v) + /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */ + hap_update_cr3(v, 0, false); + ++ unlock: + paging_unlock(d); + put_gfn(d, cr3_gfn); + } +-- +2.37.3 + diff --git a/0045-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch b/0045-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch new file mode 100644 index 0000000..7cf356d --- /dev/null +++ b/0045-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch @@ -0,0 +1,76 @@ +From 95f6d555ec84383f7daaf3374f65bec5ff4351f5 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 11 Oct 2022 15:07:57 +0200 +Subject: [PATCH 45/67] x86/shadow: tolerate failure of + sh_set_toplevel_shadow() + +Subsequently sh_set_toplevel_shadow() will be adjusted to install a +blank entry in case prealloc fails. There are, in fact, pre-existing +error paths which would put in place a blank entry. The 4- and 2-level +code in sh_update_cr3(), however, assume the top level entry to be +valid. + +Hence bail from the function in the unlikely event that it's not. Note +that 3-level logic works differently: In particular a guest is free to +supply a PDPTR pointing at 4 non-present (or otherwise deemed invalid) +entries. The guest will crash, but we already cope with that. + +Really mfn_valid() is likely wrong to use in sh_set_toplevel_shadow(), +and it should instead be !mfn_eq(gmfn, INVALID_MFN). Avoid such a change +in security context, but add a respective assertion. + +This is part of CVE-2022-33746 / XSA-410. 
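+
+The caller-side pattern this introduces, in brief (sketch; see the
+sh_update_cr3() hunks below for the real context):
+
+    sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow);
+    if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
+    {
+        /* A blank top-level entry is only installed while the domain
+         * is going down; bail rather than dereference it. */
+        ASSERT(d->is_dying || d->is_shutting_down);
+        return;
+    }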
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Tim Deegan <tim@xen.org>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: eac000978c1feb5a9ee3236ab0c0da9a477e5336
+master date: 2022-10-11 14:22:24 +0200
+---
+ xen/arch/x86/mm/shadow/common.c | 1 +
+ xen/arch/x86/mm/shadow/multi.c | 10 ++++++++++
+ 2 files changed, 11 insertions(+)
+
+diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
+index abe6d4334382..0ab2ac6b7a3c 100644
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -2583,6 +2583,7 @@ void sh_set_toplevel_shadow(struct vcpu *v,
+ /* Now figure out the new contents: is this a valid guest MFN? */
+ if ( !mfn_valid(gmfn) )
+ {
++ ASSERT(mfn_eq(gmfn, INVALID_MFN));
+ new_entry = pagetable_null();
+ goto install_new_entry;
+ }
+diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
+index 9b43cb116c47..7e0494cf7faa 100644
+--- a/xen/arch/x86/mm/shadow/multi.c
++++ b/xen/arch/x86/mm/shadow/multi.c
+@@ -3697,6 +3697,11 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush)
+ if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 )
+ guest_flush_tlb_mask(d, d->dirty_cpumask);
+ sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow);
++ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
++ {
++ ASSERT(d->is_dying || d->is_shutting_down);
++ return;
++ }
+ if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) )
+ {
+ mfn_t smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]);
+@@ -3757,6 +3762,11 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush)
+ if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 )
+ guest_flush_tlb_mask(d, d->dirty_cpumask);
+ sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow);
++ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
++ {
++ ASSERT(d->is_dying || d->is_shutting_down);
++ return;
++ }
+ #else
+ #error This should never happen
+ #endif
+--
+2.37.3
+
diff --git a/0046-x86-shadow-tolerate-failure-in-shadow_prealloc.patch b/0046-x86-shadow-tolerate-failure-in-shadow_prealloc.patch
new file mode 100644
index 0000000..62be72a
--- /dev/null
+++ b/0046-x86-shadow-tolerate-failure-in-shadow_prealloc.patch
@@ -0,0 +1,279 @@
+From 1e26afa846fb9a00b9155280eeae3b8cb8375dd6 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 11 Oct 2022 15:08:14 +0200
+Subject: [PATCH 46/67] x86/shadow: tolerate failure in shadow_prealloc()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Prevent _shadow_prealloc() from calling BUG() when unable to fulfill
+the pre-allocation and instead return true/false. Modify
+shadow_prealloc() to crash the domain on allocation failure (if the
+domain is not already dying), as shadow cannot operate normally after
+that. Modify callers to also gracefully handle {_,}shadow_prealloc()
+failing to fulfill the request.
+
+Note this in turn requires adjusting the callers of
+sh_make_monitor_table() also to handle it returning INVALID_MFN.
+sh_update_paging_modes() is also modified to add additional error
+paths in case of allocation failure; some of those will return with
+null monitor page tables (and the domain likely crashed). This is no
+different from current error paths, but the newly introduced ones are
+more likely to trigger.
+ +The now added failure points in sh_update_paging_modes() also require +that on some error return paths the previous structures are cleared, +and thus monitor table is null. + +While there adjust the 'type' parameter type of shadow_prealloc() to +unsigned int rather than u32. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> +master commit: b7f93c6afb12b6061e2d19de2f39ea09b569ac68 +master date: 2022-10-11 14:22:53 +0200 +--- + xen/arch/x86/mm/shadow/common.c | 69 ++++++++++++++++++++++++-------- + xen/arch/x86/mm/shadow/hvm.c | 4 +- + xen/arch/x86/mm/shadow/multi.c | 11 +++-- + xen/arch/x86/mm/shadow/private.h | 3 +- + 4 files changed, 66 insertions(+), 21 deletions(-) + +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index 0ab2ac6b7a3c..fc4f7f78ce43 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -36,6 +36,7 @@ + #include <asm/flushtlb.h> + #include <asm/shadow.h> + #include <xen/numa.h> ++#include <public/sched.h> + #include "private.h" + + DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags); +@@ -927,14 +928,15 @@ static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn) + + /* Make sure there are at least count order-sized pages + * available in the shadow page pool. */ +-static void _shadow_prealloc(struct domain *d, unsigned int pages) ++static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) + { + struct vcpu *v; + struct page_info *sp, *t; + mfn_t smfn; + int i; + +- if ( d->arch.paging.shadow.free_pages >= pages ) return; ++ if ( d->arch.paging.shadow.free_pages >= pages ) ++ return true; + + /* Shouldn't have enabled shadows if we've no vcpus. */ + ASSERT(d->vcpu && d->vcpu[0]); +@@ -950,7 +952,8 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) + sh_unpin(d, smfn); + + /* See if that freed up enough space */ +- if ( d->arch.paging.shadow.free_pages >= pages ) return; ++ if ( d->arch.paging.shadow.free_pages >= pages ) ++ return true; + } + + /* Stage two: all shadow pages are in use in hierarchies that are +@@ -973,7 +976,7 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) + if ( d->arch.paging.shadow.free_pages >= pages ) + { + guest_flush_tlb_mask(d, d->dirty_cpumask); +- return; ++ return true; + } + } + } +@@ -986,7 +989,12 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) + d->arch.paging.shadow.total_pages, + d->arch.paging.shadow.free_pages, + d->arch.paging.shadow.p2m_pages); +- BUG(); ++ ++ ASSERT(d->is_dying); ++ ++ guest_flush_tlb_mask(d, d->dirty_cpumask); ++ ++ return false; + } + + /* Make sure there are at least count pages of the order according to +@@ -994,9 +1002,19 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) + * This must be called before any calls to shadow_alloc(). Since this + * will free existing shadows to make room, it must be called early enough + * to avoid freeing shadows that the caller is currently working on. 
*/ +-void shadow_prealloc(struct domain *d, u32 type, unsigned int count) ++bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) + { +- return _shadow_prealloc(d, shadow_size(type) * count); ++ bool ret = _shadow_prealloc(d, shadow_size(type) * count); ++ ++ if ( !ret && !d->is_dying && ++ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) ++ /* ++ * Failing to allocate memory required for shadow usage can only result in ++ * a domain crash, do it here rather that relying on every caller to do it. ++ */ ++ domain_crash(d); ++ ++ return ret; + } + + /* Deliberately free all the memory we can: this will tear down all of +@@ -1215,7 +1233,7 @@ void shadow_free(struct domain *d, mfn_t smfn) + static struct page_info * + shadow_alloc_p2m_page(struct domain *d) + { +- struct page_info *pg; ++ struct page_info *pg = NULL; + + /* This is called both from the p2m code (which never holds the + * paging lock) and the log-dirty code (which always does). */ +@@ -1233,16 +1251,18 @@ shadow_alloc_p2m_page(struct domain *d) + d->arch.paging.shadow.p2m_pages, + shadow_min_acceptable_pages(d)); + } +- paging_unlock(d); +- return NULL; ++ goto out; + } + +- shadow_prealloc(d, SH_type_p2m_table, 1); ++ if ( !shadow_prealloc(d, SH_type_p2m_table, 1) ) ++ goto out; ++ + pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0)); + d->arch.paging.shadow.p2m_pages++; + d->arch.paging.shadow.total_pages--; + ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); + ++ out: + paging_unlock(d); + + return pg; +@@ -1333,7 +1353,9 @@ int shadow_set_allocation(struct domain *d, unsigned int pages, bool *preempted) + else if ( d->arch.paging.shadow.total_pages > pages ) + { + /* Need to return memory to domheap */ +- _shadow_prealloc(d, 1); ++ if ( !_shadow_prealloc(d, 1) ) ++ return -ENOMEM; ++ + sp = page_list_remove_head(&d->arch.paging.shadow.freelist); + ASSERT(sp); + /* +@@ -2401,12 +2423,13 @@ static void sh_update_paging_modes(struct vcpu *v) + if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], INVALID_MFN) ) + { + int i; ++ ++ if ( !shadow_prealloc(d, SH_type_oos_snapshot, SHADOW_OOS_PAGES) ) ++ return; ++ + for(i = 0; i < SHADOW_OOS_PAGES; i++) +- { +- shadow_prealloc(d, SH_type_oos_snapshot, 1); + v->arch.paging.shadow.oos_snapshot[i] = + shadow_alloc(d, SH_type_oos_snapshot, 0); +- } + } + #endif /* OOS */ + +@@ -2470,6 +2493,9 @@ static void sh_update_paging_modes(struct vcpu *v) + mfn_t mmfn = sh_make_monitor_table( + v, v->arch.paging.mode->shadow.shadow_levels); + ++ if ( mfn_eq(mmfn, INVALID_MFN) ) ++ return; ++ + v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn); + make_cr3(v, mmfn); + hvm_update_host_cr3(v); +@@ -2508,6 +2534,12 @@ static void sh_update_paging_modes(struct vcpu *v) + v->arch.hvm.monitor_table = pagetable_null(); + new_mfn = sh_make_monitor_table( + v, v->arch.paging.mode->shadow.shadow_levels); ++ if ( mfn_eq(new_mfn, INVALID_MFN) ) ++ { ++ sh_destroy_monitor_table(v, old_mfn, ++ old_mode->shadow.shadow_levels); ++ return; ++ } + v->arch.hvm.monitor_table = pagetable_from_mfn(new_mfn); + SHADOW_PRINTK("new monitor table %"PRI_mfn "\n", + mfn_x(new_mfn)); +@@ -2593,7 +2625,12 @@ void sh_set_toplevel_shadow(struct vcpu *v, + if ( !mfn_valid(smfn) ) + { + /* Make sure there's enough free shadow memory. */ +- shadow_prealloc(d, root_type, 1); ++ if ( !shadow_prealloc(d, root_type, 1) ) ++ { ++ new_entry = pagetable_null(); ++ goto install_new_entry; ++ } ++ + /* Shadow the page. 
*/ + smfn = make_shadow(v, gmfn, root_type); + } +diff --git a/xen/arch/x86/mm/shadow/hvm.c b/xen/arch/x86/mm/shadow/hvm.c +index 87fc57704f25..d68796c495b7 100644 +--- a/xen/arch/x86/mm/shadow/hvm.c ++++ b/xen/arch/x86/mm/shadow/hvm.c +@@ -700,7 +700,9 @@ mfn_t sh_make_monitor_table(const struct vcpu *v, unsigned int shadow_levels) + ASSERT(!pagetable_get_pfn(v->arch.hvm.monitor_table)); + + /* Guarantee we can get the memory we need */ +- shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS); ++ if ( !shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS) ) ++ return INVALID_MFN; ++ + m4mfn = shadow_alloc(d, SH_type_monitor_table, 0); + mfn_to_page(m4mfn)->shadow_flags = 4; + +diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c +index 7e0494cf7faa..6a9f82d39ce6 100644 +--- a/xen/arch/x86/mm/shadow/multi.c ++++ b/xen/arch/x86/mm/shadow/multi.c +@@ -2825,9 +2825,14 @@ static int sh_page_fault(struct vcpu *v, + * Preallocate shadow pages *before* removing writable accesses + * otherwhise an OOS L1 might be demoted and promoted again with + * writable mappings. */ +- shadow_prealloc(d, +- SH_type_l1_shadow, +- GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1); ++ if ( !shadow_prealloc(d, SH_type_l1_shadow, ++ GUEST_PAGING_LEVELS < 4 ++ ? 1 : GUEST_PAGING_LEVELS - 1) ) ++ { ++ paging_unlock(d); ++ put_gfn(d, gfn_x(gfn)); ++ return 0; ++ } + + rc = gw_remove_write_accesses(v, va, &gw); + +diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h +index 911db46e7399..3fe0388e7c4f 100644 +--- a/xen/arch/x86/mm/shadow/private.h ++++ b/xen/arch/x86/mm/shadow/private.h +@@ -351,7 +351,8 @@ void shadow_promote(struct domain *d, mfn_t gmfn, u32 type); + void shadow_demote(struct domain *d, mfn_t gmfn, u32 type); + + /* Shadow page allocation functions */ +-void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count); ++bool __must_check shadow_prealloc(struct domain *d, unsigned int shadow_type, ++ unsigned int count); + mfn_t shadow_alloc(struct domain *d, + u32 shadow_type, + unsigned long backpointer); +-- +2.37.3 + diff --git a/0047-x86-p2m-refuse-new-allocations-for-dying-domains.patch b/0047-x86-p2m-refuse-new-allocations-for-dying-domains.patch new file mode 100644 index 0000000..c81cfab --- /dev/null +++ b/0047-x86-p2m-refuse-new-allocations-for-dying-domains.patch @@ -0,0 +1,100 @@ +From 4f9b535194f70582863f2a78f113547d8822b2b9 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Tue, 11 Oct 2022 15:08:28 +0200 +Subject: [PATCH 47/67] x86/p2m: refuse new allocations for dying domains +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This will in particular prevent any attempts to add entries to the p2m, +once - in a subsequent change - non-root entries have been removed. + +This is part of CVE-2022-33746 / XSA-410. 
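+
+Reduced to its essence, both pool allocators gain the same early exit
+(sketch; actual hunks below):
+
+    if ( unlikely(d->is_dying) )
+        return NULL; /* teardown owns the pool now; hand out nothing */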
+ +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> +master commit: ff600a8cf8e36f8ecbffecf96a035952e022ab87 +master date: 2022-10-11 14:23:22 +0200 +--- + xen/arch/x86/mm/hap/hap.c | 5 ++++- + xen/arch/x86/mm/shadow/common.c | 18 ++++++++++++++---- + 2 files changed, 18 insertions(+), 5 deletions(-) + +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index d75dc2b9ed3d..787991233e53 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -245,6 +245,9 @@ static struct page_info *hap_alloc(struct domain *d) + + ASSERT(paging_locked_by_me(d)); + ++ if ( unlikely(d->is_dying) ) ++ return NULL; ++ + pg = page_list_remove_head(&d->arch.paging.hap.freelist); + if ( unlikely(!pg) ) + return NULL; +@@ -281,7 +284,7 @@ static struct page_info *hap_alloc_p2m_page(struct domain *d) + d->arch.paging.hap.p2m_pages++; + ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); + } +- else if ( !d->arch.paging.p2m_alloc_failed ) ++ else if ( !d->arch.paging.p2m_alloc_failed && !d->is_dying ) + { + d->arch.paging.p2m_alloc_failed = 1; + dprintk(XENLOG_ERR, "d%i failed to allocate from HAP pool\n", +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index fc4f7f78ce43..9ad7e5a88650 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -938,6 +938,10 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) + if ( d->arch.paging.shadow.free_pages >= pages ) + return true; + ++ if ( unlikely(d->is_dying) ) ++ /* No reclaim when the domain is dying, teardown will take care of it. */ ++ return false; ++ + /* Shouldn't have enabled shadows if we've no vcpus. */ + ASSERT(d->vcpu && d->vcpu[0]); + +@@ -990,7 +994,7 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) + d->arch.paging.shadow.free_pages, + d->arch.paging.shadow.p2m_pages); + +- ASSERT(d->is_dying); ++ ASSERT_UNREACHABLE(); + + guest_flush_tlb_mask(d, d->dirty_cpumask); + +@@ -1004,10 +1008,13 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) + * to avoid freeing shadows that the caller is currently working on. */ + bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) + { +- bool ret = _shadow_prealloc(d, shadow_size(type) * count); ++ bool ret; + +- if ( !ret && !d->is_dying && +- (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) ++ if ( unlikely(d->is_dying) ) ++ return false; ++ ++ ret = _shadow_prealloc(d, shadow_size(type) * count); ++ if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) + /* + * Failing to allocate memory required for shadow usage can only result in + * a domain crash, do it here rather that relying on every caller to do it. +@@ -1235,6 +1242,9 @@ shadow_alloc_p2m_page(struct domain *d) + { + struct page_info *pg = NULL; + ++ if ( unlikely(d->is_dying) ) ++ return NULL; ++ + /* This is called both from the p2m code (which never holds the + * paging lock) and the log-dirty code (which always does). 
*/ + paging_lock_recursive(d); +-- +2.37.3 + diff --git a/0048-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch b/0048-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch new file mode 100644 index 0000000..c3d5a2c --- /dev/null +++ b/0048-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch @@ -0,0 +1,115 @@ +From 7f055b011a657f8f16b0df242301efb312058eea Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Tue, 11 Oct 2022 15:08:42 +0200 +Subject: [PATCH 48/67] x86/p2m: truly free paging pool memory for dying + domains +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Modify {hap,shadow}_free to free the page immediately if the domain is +dying, so that pages don't accumulate in the pool when +{shadow,hap}_final_teardown() get called. This is to limit the amount of +work which needs to be done there (in a non-preemptable manner). + +Note the call to shadow_free() in shadow_free_p2m_page() is moved after +increasing total_pages, so that the decrease done in shadow_free() in +case the domain is dying doesn't underflow the counter, even if just for +a short interval. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> +master commit: f50a2c0e1d057c00d6061f40ae24d068226052ad +master date: 2022-10-11 14:23:51 +0200 +--- + xen/arch/x86/mm/hap/hap.c | 12 ++++++++++++ + xen/arch/x86/mm/shadow/common.c | 28 +++++++++++++++++++++++++--- + 2 files changed, 37 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index 787991233e53..aef2297450e1 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -265,6 +265,18 @@ static void hap_free(struct domain *d, mfn_t mfn) + + ASSERT(paging_locked_by_me(d)); + ++ /* ++ * For dying domains, actually free the memory here. This way less work is ++ * left to hap_final_teardown(), which cannot easily have preemption checks ++ * added. ++ */ ++ if ( unlikely(d->is_dying) ) ++ { ++ free_domheap_page(pg); ++ d->arch.paging.hap.total_pages--; ++ return; ++ } ++ + d->arch.paging.hap.free_pages++; + page_list_add_tail(pg, &d->arch.paging.hap.freelist); + } +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index 9ad7e5a88650..366956c146aa 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -1184,6 +1184,7 @@ mfn_t shadow_alloc(struct domain *d, + void shadow_free(struct domain *d, mfn_t smfn) + { + struct page_info *next = NULL, *sp = mfn_to_page(smfn); ++ bool dying = ACCESS_ONCE(d->is_dying); + struct page_list_head *pin_list; + unsigned int pages; + u32 shadow_type; +@@ -1226,11 +1227,32 @@ void shadow_free(struct domain *d, mfn_t smfn) + * just before the allocator hands the page out again. */ + page_set_tlbflush_timestamp(sp); + perfc_decr(shadow_alloc_count); +- page_list_add_tail(sp, &d->arch.paging.shadow.freelist); ++ ++ /* ++ * For dying domains, actually free the memory here. This way less ++ * work is left to shadow_final_teardown(), which cannot easily have ++ * preemption checks added. ++ */ ++ if ( unlikely(dying) ) ++ { ++ /* ++ * The backpointer field (sh.back) used by shadow code aliases the ++ * domain owner field, unconditionally clear it here to avoid ++ * free_domheap_page() attempting to parse it. 
++ */ ++ page_set_owner(sp, NULL); ++ free_domheap_page(sp); ++ } ++ else ++ page_list_add_tail(sp, &d->arch.paging.shadow.freelist); ++ + sp = next; + } + +- d->arch.paging.shadow.free_pages += pages; ++ if ( unlikely(dying) ) ++ d->arch.paging.shadow.total_pages -= pages; ++ else ++ d->arch.paging.shadow.free_pages += pages; + } + + /* Divert a page from the pool to be used by the p2m mapping. +@@ -1300,9 +1322,9 @@ shadow_free_p2m_page(struct domain *d, struct page_info *pg) + * paging lock) and the log-dirty code (which always does). */ + paging_lock_recursive(d); + +- shadow_free(d, page_to_mfn(pg)); + d->arch.paging.shadow.p2m_pages--; + d->arch.paging.shadow.total_pages++; ++ shadow_free(d, page_to_mfn(pg)); + + paging_unlock(d); + } +-- +2.37.3 + diff --git a/0049-x86-p2m-free-the-paging-memory-pool-preemptively.patch b/0049-x86-p2m-free-the-paging-memory-pool-preemptively.patch new file mode 100644 index 0000000..83502a6 --- /dev/null +++ b/0049-x86-p2m-free-the-paging-memory-pool-preemptively.patch @@ -0,0 +1,181 @@ +From 686c920fa9389fe2b6b619643024ed98b4b7d51f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Tue, 11 Oct 2022 15:08:58 +0200 +Subject: [PATCH 49/67] x86/p2m: free the paging memory pool preemptively +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The paging memory pool is currently freed in two different places: +from {shadow,hap}_teardown() via domain_relinquish_resources() and +from {shadow,hap}_final_teardown() via complete_domain_destroy(). +While the former does handle preemption, the later doesn't. + +Attempt to move as much p2m related freeing as possible to happen +before the call to {shadow,hap}_teardown(), so that most memory can be +freed in a preemptive way. In order to avoid causing issues to +existing callers leave the root p2m page tables set and free them in +{hap,shadow}_final_teardown(). Also modify {hap,shadow}_free to free +the page immediately if the domain is dying, so that pages don't +accumulate in the pool when {shadow,hap}_final_teardown() get called. + +Move altp2m_vcpu_disable_ve() to be done in hap_teardown(), as that's +the place where altp2m_active gets disabled now. + +This is part of CVE-2022-33746 / XSA-410. 
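+
+The common shape of the change in hap_free() and shadow_free(),
+condensed (sketch; the HAP variant is shown):
+
+    if ( unlikely(d->is_dying) )
+    {
+        /* Hand the page back to the domheap at once instead of
+         * parking it on the freelist, so the final, non-preemptible
+         * teardown finds an (almost) empty pool. */
+        free_domheap_page(pg);
+        d->arch.paging.hap.total_pages--;
+        return;
+    }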
+ +Reported-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> +master commit: e7aa55c0aab36d994bf627c92bd5386ae167e16e +master date: 2022-10-11 14:24:21 +0200 +--- + xen/arch/x86/domain.c | 7 ------ + xen/arch/x86/mm/hap/hap.c | 42 ++++++++++++++++++++------------- + xen/arch/x86/mm/shadow/common.c | 12 ++++++++++ + 3 files changed, 38 insertions(+), 23 deletions(-) + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 2838f976d729..ce6ddcf31397 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -38,7 +38,6 @@ + #include <xen/livepatch.h> + #include <public/sysctl.h> + #include <public/hvm/hvm_vcpu.h> +-#include <asm/altp2m.h> + #include <asm/regs.h> + #include <asm/mc146818rtc.h> + #include <asm/system.h> +@@ -2358,12 +2357,6 @@ int domain_relinquish_resources(struct domain *d) + vpmu_destroy(v); + } + +- if ( altp2m_active(d) ) +- { +- for_each_vcpu ( d, v ) +- altp2m_vcpu_disable_ve(v); +- } +- + if ( is_pv_domain(d) ) + { + for_each_vcpu ( d, v ) +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index aef2297450e1..a44fcfd95e1e 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -28,6 +28,7 @@ + #include <xen/domain_page.h> + #include <xen/guest_access.h> + #include <xen/keyhandler.h> ++#include <asm/altp2m.h> + #include <asm/event.h> + #include <asm/page.h> + #include <asm/current.h> +@@ -546,24 +547,8 @@ void hap_final_teardown(struct domain *d) + unsigned int i; + + if ( hvm_altp2m_supported() ) +- { +- d->arch.altp2m_active = 0; +- +- if ( d->arch.altp2m_eptp ) +- { +- free_xenheap_page(d->arch.altp2m_eptp); +- d->arch.altp2m_eptp = NULL; +- } +- +- if ( d->arch.altp2m_visible_eptp ) +- { +- free_xenheap_page(d->arch.altp2m_visible_eptp); +- d->arch.altp2m_visible_eptp = NULL; +- } +- + for ( i = 0; i < MAX_ALTP2M; i++ ) + p2m_teardown(d->arch.altp2m_p2m[i], true); +- } + + /* Destroy nestedp2m's first */ + for (i = 0; i < MAX_NESTEDP2M; i++) { +@@ -578,6 +563,8 @@ void hap_final_teardown(struct domain *d) + paging_lock(d); + hap_set_allocation(d, 0, NULL); + ASSERT(d->arch.paging.hap.p2m_pages == 0); ++ ASSERT(d->arch.paging.hap.free_pages == 0); ++ ASSERT(d->arch.paging.hap.total_pages == 0); + paging_unlock(d); + } + +@@ -603,6 +590,7 @@ void hap_vcpu_teardown(struct vcpu *v) + void hap_teardown(struct domain *d, bool *preempted) + { + struct vcpu *v; ++ unsigned int i; + + ASSERT(d->is_dying); + ASSERT(d != current->domain); +@@ -611,6 +599,28 @@ void hap_teardown(struct domain *d, bool *preempted) + for_each_vcpu ( d, v ) + hap_vcpu_teardown(v); + ++ /* Leave the root pt in case we get further attempts to modify the p2m. */ ++ if ( hvm_altp2m_supported() ) ++ { ++ if ( altp2m_active(d) ) ++ for_each_vcpu ( d, v ) ++ altp2m_vcpu_disable_ve(v); ++ ++ d->arch.altp2m_active = 0; ++ ++ FREE_XENHEAP_PAGE(d->arch.altp2m_eptp); ++ FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp); ++ ++ for ( i = 0; i < MAX_ALTP2M; i++ ) ++ p2m_teardown(d->arch.altp2m_p2m[i], false); ++ } ++ ++ /* Destroy nestedp2m's after altp2m. 
*/
++ for ( i = 0; i < MAX_NESTEDP2M; i++ )
++ p2m_teardown(d->arch.nested_p2m[i], false);
++
++ p2m_teardown(p2m_get_hostp2m(d), false);
++
+ paging_lock(d); /* Keep various asserts happy */
+
+ if ( d->arch.paging.hap.total_pages != 0 )
+diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
+index 366956c146aa..680766fd5170 100644
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -2891,8 +2891,17 @@ void shadow_teardown(struct domain *d, bool *preempted)
+ for_each_vcpu ( d, v )
+ shadow_vcpu_teardown(v);
+
++ p2m_teardown(p2m_get_hostp2m(d), false);
++
+ paging_lock(d);
+
++ /*
++ * Reclaim all shadow memory so that shadow_set_allocation() doesn't find
++ * in-use pages, as _shadow_prealloc() will no longer try to reclaim pages
++ * because the domain is dying.
++ */
++ shadow_blow_tables(d);
++
+ #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
+ /* Free the virtual-TLB array attached to each vcpu */
+ for_each_vcpu(d, v)
+@@ -3013,6 +3022,9 @@ void shadow_final_teardown(struct domain *d)
+ d->arch.paging.shadow.total_pages,
+ d->arch.paging.shadow.free_pages,
+ d->arch.paging.shadow.p2m_pages);
++ ASSERT(!d->arch.paging.shadow.total_pages);
++ ASSERT(!d->arch.paging.shadow.free_pages);
++ ASSERT(!d->arch.paging.shadow.p2m_pages);
+ paging_unlock(d);
+ }
+
+--
+2.37.3
+
diff --git a/0050-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch b/0050-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch
new file mode 100644
index 0000000..23e10ba
--- /dev/null
+++ b/0050-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch
@@ -0,0 +1,197 @@
+From b03074bb47d10c9373688b3661c7c31da01c21a3 Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Tue, 11 Oct 2022 15:09:12 +0200
+Subject: [PATCH 50/67] xen/x86: p2m: Add preemption in p2m_teardown()
+
+The list p2m->pages contains all the pages used by the P2M. On a large
+instance this can be quite long, and the time spent calling
+d->arch.paging.free_page() can exceed 1ms for an 80GB guest on a Xen
+running in a nested environment on a c5.metal.
+
+By extrapolation, it would take > 100ms for an 8TB guest (what we
+currently security support). So add some preemption in p2m_teardown()
+and propagate it to the callers. Note there are 3 places where
+the preemption is not enabled:
+ - hap_final_teardown()/shadow_final_teardown(): We prevent updates
+ to the P2M once the domain is dying (so no more pages can be
+ allocated), and most of the P2M pages will be freed in a preemptive
+ manner when relinquishing the resources. So it is fine to disable
+ preemption there.
+ - shadow_enable(): This is fine because it will undo the allocation
+ that may have been made by p2m_alloc_table() (so only the root
+ page table).
+
+The preemption is arbitrarily checked every 1024 iterations.
+
+We now need to include <xen/event.h> in p2m-basic in order to
+import the definition of local_events_need_delivery() used by
+general_preempt_check(). Ideally, the inclusion should happen in
+xen/sched.h but it opened a can of worms.
+
+Note that with the current approach, Xen doesn't keep track of whether
+the alt/nested P2Ms have been cleared. So there is some redundant work.
+However, this is not expected to incur too much overhead (the P2M lock
+shouldn't be contended during teardown). So this optimization is
+left outside of the security event.
+
+This is part of CVE-2022-33746 / XSA-410.
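+
+The preemption idiom, in isolation (sketch; general_preempt_check()
+ultimately consults local_events_need_delivery(), hence the include
+mentioned above):
+
+    unsigned int count = 0;
+
+    while ( (pg = page_list_remove_head(&p2m->pages)) )
+    {
+        d->arch.paging.free_page(d, pg);
+
+        /* Bound the time spent in this loop: every 1024 pages, offer
+         * to stop so the caller can schedule a continuation. */
+        if ( preempted && !(++count % 1024) && general_preempt_check() )
+        {
+            *preempted = true;
+            break;
+        }
+    }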
+ +Signed-off-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +master commit: 8a2111250b424edc49c65c4d41b276766d30635c +master date: 2022-10-11 14:24:48 +0200 +--- + xen/arch/x86/mm/hap/hap.c | 22 ++++++++++++++++------ + xen/arch/x86/mm/p2m.c | 18 +++++++++++++++--- + xen/arch/x86/mm/shadow/common.c | 12 +++++++++--- + xen/include/asm-x86/p2m.h | 2 +- + 4 files changed, 41 insertions(+), 13 deletions(-) + +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index a44fcfd95e1e..1f9a157a0c34 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -548,17 +548,17 @@ void hap_final_teardown(struct domain *d) + + if ( hvm_altp2m_supported() ) + for ( i = 0; i < MAX_ALTP2M; i++ ) +- p2m_teardown(d->arch.altp2m_p2m[i], true); ++ p2m_teardown(d->arch.altp2m_p2m[i], true, NULL); + + /* Destroy nestedp2m's first */ + for (i = 0; i < MAX_NESTEDP2M; i++) { +- p2m_teardown(d->arch.nested_p2m[i], true); ++ p2m_teardown(d->arch.nested_p2m[i], true, NULL); + } + + if ( d->arch.paging.hap.total_pages != 0 ) + hap_teardown(d, NULL); + +- p2m_teardown(p2m_get_hostp2m(d), true); ++ p2m_teardown(p2m_get_hostp2m(d), true, NULL); + /* Free any memory that the p2m teardown released */ + paging_lock(d); + hap_set_allocation(d, 0, NULL); +@@ -612,14 +612,24 @@ void hap_teardown(struct domain *d, bool *preempted) + FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp); + + for ( i = 0; i < MAX_ALTP2M; i++ ) +- p2m_teardown(d->arch.altp2m_p2m[i], false); ++ { ++ p2m_teardown(d->arch.altp2m_p2m[i], false, preempted); ++ if ( preempted && *preempted ) ++ return; ++ } + } + + /* Destroy nestedp2m's after altp2m. */ + for ( i = 0; i < MAX_NESTEDP2M; i++ ) +- p2m_teardown(d->arch.nested_p2m[i], false); ++ { ++ p2m_teardown(d->arch.nested_p2m[i], false, preempted); ++ if ( preempted && *preempted ) ++ return; ++ } + +- p2m_teardown(p2m_get_hostp2m(d), false); ++ p2m_teardown(p2m_get_hostp2m(d), false, preempted); ++ if ( preempted && *preempted ) ++ return; + + paging_lock(d); /* Keep various asserts happy */ + +diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c +index 8ba73082c1bf..107f6778a6e1 100644 +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -741,12 +741,13 @@ int p2m_alloc_table(struct p2m_domain *p2m) + * hvm fixme: when adding support for pvh non-hardware domains, this path must + * cleanup any foreign p2m types (release refcnts on them). + */ +-void p2m_teardown(struct p2m_domain *p2m, bool remove_root) ++void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted) + /* Return all the p2m pages to Xen. 
+ * We know we don't have any extra mappings to these pages */
+ {
+ struct page_info *pg, *root_pg = NULL;
+ struct domain *d;
++ unsigned int i = 0;
+ 
+ if (p2m == NULL)
+ return;
+@@ -765,8 +766,19 @@ void p2m_teardown(struct p2m_domain *p2m, bool remove_root)
+ }
+ 
+ while ( (pg = page_list_remove_head(&p2m->pages)) )
+- if ( pg != root_pg )
+- d->arch.paging.free_page(d, pg);
++ {
++ if ( pg == root_pg )
++ continue;
++
++ d->arch.paging.free_page(d, pg);
++
++ /* Arbitrarily check preemption every 1024 iterations */
++ if ( preempted && !(++i % 1024) && general_preempt_check() )
++ {
++ *preempted = true;
++ break;
++ }
++ }
+ 
+ if ( root_pg )
+ page_list_add(root_pg, &p2m->pages);
+diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
+index 680766fd5170..8f7fddcee1e5 100644
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -2837,8 +2837,12 @@ int shadow_enable(struct domain *d, u32 mode)
+ out_locked:
+ paging_unlock(d);
+ out_unlocked:
++ /*
++ * This is fine to ignore the preemption here because only the root
++ * will be allocated by p2m_alloc_table().
++ */
+ if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) )
+- p2m_teardown(p2m, true);
++ p2m_teardown(p2m, true, NULL);
+ if ( rv != 0 && pg != NULL )
+ {
+ pg->count_info &= ~PGC_count_mask;
+@@ -2891,7 +2895,9 @@ void shadow_teardown(struct domain *d, bool *preempted)
+ for_each_vcpu ( d, v )
+ shadow_vcpu_teardown(v);
+ 
+- p2m_teardown(p2m_get_hostp2m(d), false);
++ p2m_teardown(p2m_get_hostp2m(d), false, preempted);
++ if ( preempted && *preempted )
++ return;
+ 
+ paging_lock(d);
+ 
+@@ -3012,7 +3018,7 @@ void shadow_final_teardown(struct domain *d)
+ shadow_teardown(d, NULL);
+ 
+ /* It is now safe to pull down the p2m map. */
+- p2m_teardown(p2m_get_hostp2m(d), true);
++ p2m_teardown(p2m_get_hostp2m(d), true, NULL);
+ /* Free any shadow memory that the p2m teardown released */
+ paging_lock(d);
+ shadow_set_allocation(d, 0, NULL);
+diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
+index 46eb51d44cf5..edbe4cee2717 100644
+--- a/xen/include/asm-x86/p2m.h
++++ b/xen/include/asm-x86/p2m.h
+@@ -619,7 +619,7 @@ int p2m_init(struct domain *d);
+ int p2m_alloc_table(struct p2m_domain *p2m);
+ 
+ /* Return all the p2m resources to Xen. */
+-void p2m_teardown(struct p2m_domain *p2m, bool remove_root);
++void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted);
+ void p2m_final_teardown(struct domain *d);
+ 
+ /* Add a page to a domain's p2m table */
+-- 
+2.37.3
+
diff --git a/0051-libxl-docs-Use-arch-specific-default-paging-memory.patch b/0051-libxl-docs-Use-arch-specific-default-paging-memory.patch
new file mode 100644
index 0000000..f3bded4
--- /dev/null
+++ b/0051-libxl-docs-Use-arch-specific-default-paging-memory.patch
@@ -0,0 +1,147 @@
+From 0c0680d6e7953ca4c91699e60060c732f9ead5c1 Mon Sep 17 00:00:00 2001
+From: Henry Wang <Henry.Wang@arm.com>
+Date: Tue, 11 Oct 2022 15:09:32 +0200
+Subject: [PATCH 51/67] libxl, docs: Use arch-specific default paging memory
+
+The default paging memory (described in the `shadow_memory` entry in xl
+config) in libxl is used to determine the memory pool size for xl
+guests. Currently this size is only used for x86, and contains a part
+of RAM to shadow the resident processes. Since there are no shadow
+mode guests on Arm, the part of RAM to shadow the resident processes
+is not necessary. 
Therefore, this commit splits the function
+`libxl_get_required_shadow_memory()` into arch-specific helpers and
+renames the helper to `libxl__arch_get_required_paging_memory()`.
+
+On x86, this helper returns the original value from
+`libxl_get_required_shadow_memory()`, so no functional change is intended.
+
+On Arm, this helper returns 1MB per vcpu plus 4KB per MiB of RAM
+for the P2M map and an additional 512KB.
+
+Also update the xl.cfg documentation to add Arm documentation
+according to the code changes and correct the comment style following
+Xen coding style.
+
+This is part of CVE-2022-33747 / XSA-409.
+
+Suggested-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
+master commit: 156a239ea288972425f967ac807b3cb5b5e14874
+master date: 2022-10-11 14:28:37 +0200
+---
+ docs/man/xl.cfg.5.pod.in | 5 +++++
+ tools/libs/light/libxl_arch.h | 4 ++++
+ tools/libs/light/libxl_arm.c | 12 ++++++++++++
+ tools/libs/light/libxl_utils.c | 9 ++-------
+ tools/libs/light/libxl_x86.c | 13 +++++++++++++
+ 5 files changed, 36 insertions(+), 7 deletions(-)
+
+diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in
+index 56370a37dbb1..af7fae7c52f9 100644
+--- a/docs/man/xl.cfg.5.pod.in
++++ b/docs/man/xl.cfg.5.pod.in
+@@ -1746,6 +1746,11 @@ are not using hardware assisted paging (i.e. you are using shadow
+ mode) and your guest workload consists of a very large number of
+ similar processes then increasing this value may improve performance.
+ 
++On Arm, this field is used to determine the size of the guest P2M pages
++pool, and the default value is 1MB per vCPU plus 4KB per MB of RAM for
++the P2M map. Users should adjust this value if a bigger P2M pool size
++is needed.
++
+ =back
+ 
+ =head3 Processor and Platform Features
+diff --git a/tools/libs/light/libxl_arch.h b/tools/libs/light/libxl_arch.h
+index 8527fc5c6c23..6741b7f6f457 100644
+--- a/tools/libs/light/libxl_arch.h
++++ b/tools/libs/light/libxl_arch.h
+@@ -90,6 +90,10 @@ void libxl__arch_update_domain_config(libxl__gc *gc,
+ libxl_domain_config *dst,
+ const libxl_domain_config *src);
+ 
++_hidden
++unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
++ unsigned int smp_cpus);
++
+ #if defined(__i386__) || defined(__x86_64__)
+ 
+ #define LAPIC_BASE_ADDRESS 0xfee00000
+diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c
+index e2901f13b724..d59b464192c2 100644
+--- a/tools/libs/light/libxl_arm.c
++++ b/tools/libs/light/libxl_arm.c
+@@ -154,6 +154,18 @@ out:
+ return rc;
+ }
+ 
++unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
++ unsigned int smp_cpus)
++{
++ /*
++ * 256 pages (1MB) per vcpu,
++ * plus 1 page per MiB of RAM for the P2M map,
++ * This is higher than the minimum that Xen would allocate if no value
++ * were given (but the Xen minimum is for safety, not performance). 
++ */
++ return 4 * (256 * smp_cpus + maxmem_kb / 1024);
++}
++
+ static struct arch_info {
+ const char *guest_type;
+ const char *timer_compat;
+diff --git a/tools/libs/light/libxl_utils.c b/tools/libs/light/libxl_utils.c
+index 4699c4a0a36f..e276c0ee9cc3 100644
+--- a/tools/libs/light/libxl_utils.c
++++ b/tools/libs/light/libxl_utils.c
+@@ -18,6 +18,7 @@
+ #include <ctype.h>
+ 
+ #include "libxl_internal.h"
++#include "libxl_arch.h"
+ #include "_paths.h"
+ 
+ #ifndef LIBXL_HAVE_NONCONST_LIBXL_BASENAME_RETURN_VALUE
+@@ -39,13 +40,7 @@ char *libxl_basename(const char *name)
+ 
+ unsigned long libxl_get_required_shadow_memory(unsigned long maxmem_kb, unsigned int smp_cpus)
+ {
+- /* 256 pages (1MB) per vcpu,
+- plus 1 page per MiB of RAM for the P2M map,
+- plus 1 page per MiB of RAM to shadow the resident processes.
+- This is higher than the minimum that Xen would allocate if no value
+- were given (but the Xen minimum is for safety, not performance).
+- */
+- return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024));
++ return libxl__arch_get_required_paging_memory(maxmem_kb, smp_cpus);
+ }
+ 
+ char *libxl_domid_to_name(libxl_ctx *ctx, uint32_t domid)
+diff --git a/tools/libs/light/libxl_x86.c b/tools/libs/light/libxl_x86.c
+index 18c3c77ccde3..4d66478fe9dd 100644
+--- a/tools/libs/light/libxl_x86.c
++++ b/tools/libs/light/libxl_x86.c
+@@ -882,6 +882,19 @@ void libxl__arch_update_domain_config(libxl__gc *gc,
+ libxl_defbool_val(src->b_info.arch_x86.msr_relaxed));
+ }
+ 
++unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
++ unsigned int smp_cpus)
++{
++ /*
++ * 256 pages (1MB) per vcpu,
++ * plus 1 page per MiB of RAM for the P2M map,
++ * plus 1 page per MiB of RAM to shadow the resident processes.
++ * This is higher than the minimum that Xen would allocate if no value
++ * were given (but the Xen minimum is for safety, not performance).
++ */
++ return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024));
++}
++
+ /*
+ * Local variables:
+ * mode: C
+-- 
+2.37.3
+
diff --git a/0052-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch b/0052-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch
new file mode 100644
index 0000000..77093a7
--- /dev/null
+++ b/0052-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch
@@ -0,0 +1,189 @@
+From 45336d8f88725aec65ee177b1b09abf6eef1dc8d Mon Sep 17 00:00:00 2001
+From: Henry Wang <Henry.Wang@arm.com>
+Date: Tue, 11 Oct 2022 15:09:58 +0200
+Subject: [PATCH 52/67] xen/arm: Construct the P2M pages pool for guests
+
+This commit constructs the p2m pages pool for guests from the
+data structure and helper perspective.
+
+This is implemented by:
+
+- Adding a `struct paging_domain` which contains a freelist, a
+counter variable and a spinlock to `struct arch_domain` to
+indicate the free p2m pages and the total number of p2m pages in
+the p2m pages pool.
+
+- Adding a helper `p2m_get_allocation` to get the p2m pool size.
+
+- Adding a helper `p2m_set_allocation` to set the p2m pages pool
+size. This helper should be called before allocating memory for
+a guest.
+
+- Adding a helper `p2m_teardown_allocation` to free the p2m pages
+pool. This helper should be called during the xl domain destroy path.
+
+This is part of CVE-2022-33747 / XSA-409.
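A rough caller-side sketch of the intended protocol for these helpers (resize_p2m_pool() is a hypothetical wrapper for illustration; the locking and the -ERESTART handling mirror the domctl wiring added later in this series):

    /* Hypothetical wrapper showing the locking/preemption protocol. */
    static int resize_p2m_pool(struct domain *d, unsigned long pages)
    {
        bool preempted = false;
        int rc;

        spin_lock(&d->arch.paging.lock);  /* p2m_set_allocation() asserts this */
        rc = p2m_set_allocation(d, pages, &preempted);
        spin_unlock(&d->arch.paging.lock);

        /*
         * rc is -ERESTART when preempted; the caller is expected to set up
         * a hypercall continuation and retry.
         */
        return rc;
    }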
+ +Signed-off-by: Henry Wang <Henry.Wang@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +master commit: 55914f7fc91a468649b8a3ec3f53ae1c4aca6670 +master date: 2022-10-11 14:28:39 +0200 +--- + xen/arch/arm/p2m.c | 88 ++++++++++++++++++++++++++++++++++++ + xen/include/asm-arm/domain.h | 10 ++++ + xen/include/asm-arm/p2m.h | 4 ++ + 3 files changed, 102 insertions(+) + +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index 4ad3e0606e9c..6883d8627702 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -50,6 +50,92 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn) + return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48)); + } + ++/* Return the size of the pool, rounded up to the nearest MB */ ++unsigned int p2m_get_allocation(struct domain *d) ++{ ++ unsigned long nr_pages = ACCESS_ONCE(d->arch.paging.p2m_total_pages); ++ ++ return ROUNDUP(nr_pages, 1 << (20 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT); ++} ++ ++/* ++ * Set the pool of pages to the required number of pages. ++ * Returns 0 for success, non-zero for failure. ++ * Call with d->arch.paging.lock held. ++ */ ++int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted) ++{ ++ struct page_info *pg; ++ ++ ASSERT(spin_is_locked(&d->arch.paging.lock)); ++ ++ for ( ; ; ) ++ { ++ if ( d->arch.paging.p2m_total_pages < pages ) ++ { ++ /* Need to allocate more memory from domheap */ ++ pg = alloc_domheap_page(NULL, 0); ++ if ( pg == NULL ) ++ { ++ printk(XENLOG_ERR "Failed to allocate P2M pages.\n"); ++ return -ENOMEM; ++ } ++ ACCESS_ONCE(d->arch.paging.p2m_total_pages) = ++ d->arch.paging.p2m_total_pages + 1; ++ page_list_add_tail(pg, &d->arch.paging.p2m_freelist); ++ } ++ else if ( d->arch.paging.p2m_total_pages > pages ) ++ { ++ /* Need to return memory to domheap */ ++ pg = page_list_remove_head(&d->arch.paging.p2m_freelist); ++ if( pg ) ++ { ++ ACCESS_ONCE(d->arch.paging.p2m_total_pages) = ++ d->arch.paging.p2m_total_pages - 1; ++ free_domheap_page(pg); ++ } ++ else ++ { ++ printk(XENLOG_ERR ++ "Failed to free P2M pages, P2M freelist is empty.\n"); ++ return -ENOMEM; ++ } ++ } ++ else ++ break; ++ ++ /* Check to see if we need to yield and try again */ ++ if ( preempted && general_preempt_check() ) ++ { ++ *preempted = true; ++ return -ERESTART; ++ } ++ } ++ ++ return 0; ++} ++ ++int p2m_teardown_allocation(struct domain *d) ++{ ++ int ret = 0; ++ bool preempted = false; ++ ++ spin_lock(&d->arch.paging.lock); ++ if ( d->arch.paging.p2m_total_pages != 0 ) ++ { ++ ret = p2m_set_allocation(d, 0, &preempted); ++ if ( preempted ) ++ { ++ spin_unlock(&d->arch.paging.lock); ++ return -ERESTART; ++ } ++ ASSERT(d->arch.paging.p2m_total_pages == 0); ++ } ++ spin_unlock(&d->arch.paging.lock); ++ ++ return ret; ++} ++ + /* Unlock the flush and do a P2M TLB flush if necessary */ + void p2m_write_unlock(struct p2m_domain *p2m) + { +@@ -1602,7 +1688,9 @@ int p2m_init(struct domain *d) + unsigned int cpu; + + rwlock_init(&p2m->lock); ++ spin_lock_init(&d->arch.paging.lock); + INIT_PAGE_LIST_HEAD(&p2m->pages); ++ INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist); + + p2m->vmid = INVALID_VMID; + +diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h +index bb0a6adbe00b..1d8935778f3b 100644 +--- a/xen/include/asm-arm/domain.h ++++ b/xen/include/asm-arm/domain.h +@@ -40,6 +40,14 @@ struct vtimer { + uint64_t cval; + }; + ++struct paging_domain { ++ spinlock_t lock; ++ /* Free P2M pages from the pre-allocated P2M pool */ ++ struct page_list_head p2m_freelist; ++ /* Number of 
pages from the pre-allocated P2M pool */
++ unsigned long p2m_total_pages;
++};
++
+ struct arch_domain
+ {
+ #ifdef CONFIG_ARM_64
+@@ -51,6 +59,8 @@ struct arch_domain
+ 
+ struct hvm_domain hvm;
+ 
++ struct paging_domain paging;
++
+ struct vmmio vmmio;
+ 
+ /* Continuable domain_relinquish_resources(). */
+diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
+index 3a2d51b35d71..18675b234570 100644
+--- a/xen/include/asm-arm/p2m.h
++++ b/xen/include/asm-arm/p2m.h
+@@ -218,6 +218,10 @@ void p2m_restore_state(struct vcpu *n);
+ /* Print debugging/statistial info about a domain's p2m */
+ void p2m_dump_info(struct domain *d);
+ 
++unsigned int p2m_get_allocation(struct domain *d);
++int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted);
++int p2m_teardown_allocation(struct domain *d);
++
+ static inline void p2m_write_lock(struct p2m_domain *p2m)
+ {
+ write_lock(&p2m->lock);
+-- 
+2.37.3
+
diff --git a/0053-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch b/0053-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch
new file mode 100644
index 0000000..52ce67c
--- /dev/null
+++ b/0053-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch
@@ -0,0 +1,108 @@
+From c5215044578e88b401a1296ed6302df05c113c5f Mon Sep 17 00:00:00 2001
+From: Henry Wang <Henry.Wang@arm.com>
+Date: Tue, 11 Oct 2022 15:10:16 +0200
+Subject: [PATCH 53/67] xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for Arm
+
+This commit implements the `XEN_DOMCTL_shadow_op` support in Xen
+for Arm. The p2m pages pool size for xl guests is supposed to be
+determined by `XEN_DOMCTL_shadow_op`. Hence, this commit:
+
+- Introduces a function `p2m_domctl` and implements the subops
+`XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` and
+`XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION` of `XEN_DOMCTL_shadow_op`.
+
+- Adds the `XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` support in libxl.
+
+This enables setting the shadow memory pool size when creating a
+guest from xl and getting the shadow memory pool size from Xen.
+
+Note that the `XEN_DOMCTL_shadow_op` added in this commit is only
+a dummy op, and the functionality of setting/getting the p2m memory
+pool size for xl guests will be added in following commits.
+
+This is part of CVE-2022-33747 / XSA-409.
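From the toolstack side the new subops are reached via xc_shadow_control(); a minimal sketch of a GET call, assuming the same five-argument signature as the SET call in the libxl hunk below:

    unsigned int mb = 0;

    /* Query the current P2M pool size in MiB (still a dummy op here). */
    int r = xc_shadow_control(ctx->xch, domid,
                              XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION, &mb, 0);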
+
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
+master commit: cf2a68d2ffbc3ce95e01449d46180bddb10d24a0
+master date: 2022-10-11 14:28:42 +0200
+---
+ tools/libs/light/libxl_arm.c | 12 ++++++++++++
+ xen/arch/arm/domctl.c | 32 ++++++++++++++++++++++++++++++++
+ 2 files changed, 44 insertions(+)
+
+diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c
+index d59b464192c2..d21f614ed788 100644
+--- a/tools/libs/light/libxl_arm.c
++++ b/tools/libs/light/libxl_arm.c
+@@ -131,6 +131,18 @@ int libxl__arch_domain_create(libxl__gc *gc,
+ libxl__domain_build_state *state,
+ uint32_t domid)
+ {
++ libxl_ctx *ctx = libxl__gc_owner(gc);
++ unsigned int shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024);
++
++ int r = xc_shadow_control(ctx->xch, domid,
++ XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION,
++ &shadow_mb, 0);
++ if (r) {
++ LOGED(ERROR, domid,
++ "Failed to set %u MiB shadow allocation", shadow_mb);
++ return ERROR_FAIL;
++ }
++
+ return 0;
+ }
+ 
+diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
+index a8c48b0beaab..a049bc7f3e52 100644
+--- a/xen/arch/arm/domctl.c
++++ b/xen/arch/arm/domctl.c
+@@ -45,11 +45,43 @@ static int handle_vuart_init(struct domain *d,
+ return rc;
+ }
+ 
++static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
++ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
++{
++ if ( unlikely(d == current->domain) )
++ {
++ printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
++ return -EINVAL;
++ }
++
++ if ( unlikely(d->is_dying) )
++ {
++ printk(XENLOG_ERR "Tried to do a p2m domctl op on dying domain %u\n",
++ d->domain_id);
++ return -EINVAL;
++ }
++
++ switch ( sc->op )
++ {
++ case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
++ return 0;
++ case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
++ return 0;
++ default:
++ {
++ printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
++ return -EINVAL;
++ }
++ }
++}
++
+ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
+ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+ {
+ switch ( domctl->cmd )
+ {
++ case XEN_DOMCTL_shadow_op:
++ return p2m_domctl(d, &domctl->u.shadow_op, u_domctl);
+ case XEN_DOMCTL_cacheflush:
+ {
+ gfn_t s = _gfn(domctl->u.cacheflush.start_pfn);
+-- 
+2.37.3
+
diff --git a/0054-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch b/0054-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch
new file mode 100644
index 0000000..3ef7019
--- /dev/null
+++ b/0054-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch
@@ -0,0 +1,289 @@
+From 7ad38a39f08aadc1578bdb46ccabaad79ed0faee Mon Sep 17 00:00:00 2001
+From: Henry Wang <Henry.Wang@arm.com>
+Date: Tue, 11 Oct 2022 15:10:34 +0200
+Subject: [PATCH 54/67] xen/arm: Allocate and free P2M pages from the P2M pool
+
+This commit sets up and tears down the p2m pages pool for
+non-privileged Arm guests by calling `p2m_set_allocation` and
+`p2m_teardown_allocation`.
+
+- For dom0, P2M pages should come from the heap directly instead of
+the p2m pool, so that the kernel may take advantage of the extended
+regions.
+
+- For xl guests, the setting of the p2m pool is done in
+`XEN_DOMCTL_shadow_op` and the p2m pool is destroyed in
+`domain_relinquish_resources`. Note that domctl->u.shadow_op.mb is
+updated with the new size when setting the p2m pool.
+
+- For dom0less domUs, the setting of the p2m pool is done before
+allocating memory during domain creation. Users can specify the p2m
+pool size via the `xen,domain-p2m-mem-mb` dts property; a sketch of
+such a node follows below. 
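For illustration, a dom0less domU node carrying the property might look like this (a sketch only; the node name and sizes are made up, and the booting.txt hunk below has the authoritative description):

    domU1 {
        compatible = "xen,domain";
        #address-cells = <1>;
        #size-cells = <1>;
        memory = <0 0x20000>;        /* 128MB of guest RAM, value in KB */
        cpus = <2>;
        xen,domain-p2m-mem-mb = <8>; /* 8MB P2M pool instead of the default */
    };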
+
+To actually allocate/free pages from the p2m pool, this commit adds
+two helper functions, namely `p2m_alloc_page` and `p2m_free_page`, to
+`struct p2m_domain`. By replacing `alloc_domheap_page` and
+`free_domheap_page` with these two helper functions, p2m pages can
+be added/removed from the list of the p2m pool rather than the heap.
+
+Since the page returned by `p2m_alloc_page` is cleaned, take the
+opportunity to remove the redundant `clean_page` in `p2m_create_table`.
+
+This is part of CVE-2022-33747 / XSA-409.
+
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
+master commit: cbea5a1149ca7fd4b7cdbfa3ec2e4f109b601ff7
+master date: 2022-10-11 14:28:44 +0200
+---
+ docs/misc/arm/device-tree/booting.txt | 8 ++++
+ xen/arch/arm/domain.c | 6 +++
+ xen/arch/arm/domain_build.c | 29 ++++++++++++++
+ xen/arch/arm/domctl.c | 23 ++++++++++-
+ xen/arch/arm/p2m.c | 57 +++++++++++++++++++++++++--
+ 5 files changed, 118 insertions(+), 5 deletions(-)
+
+diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
+index 5243bc7fd344..470c9491a781 100644
+--- a/docs/misc/arm/device-tree/booting.txt
++++ b/docs/misc/arm/device-tree/booting.txt
+@@ -164,6 +164,14 @@ with the following properties:
+ Both #address-cells and #size-cells need to be specified because
+ both sub-nodes (described shortly) have reg properties.
+ 
++- xen,domain-p2m-mem-mb
++
++ Optional. A 32-bit integer specifying the number of megabytes of RAM
++ used for the domain P2M pool. This is in-sync with the shadow_memory
++ option in xl.cfg. Leaving this field empty in the device tree will
++ lead to the default size of the domain P2M pool, i.e. 1MB per guest
++ vCPU plus 4KB per MB of guest RAM plus 512KB for guest extended regions.
++
+ Under the "xen,domain" compatible node, one or more sub-nodes are present
+ for the DomU kernel and ramdisk.
+ 
+diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
+index 223ec9694df1..a5ffd952ecd0 100644
+--- a/xen/arch/arm/domain.c
++++ b/xen/arch/arm/domain.c
+@@ -985,6 +985,7 @@ enum {
+ PROG_page,
+ PROG_mapping,
+ PROG_p2m,
++ PROG_p2m_pool,
+ PROG_done,
+ };
+ 
+@@ -1044,6 +1045,11 @@ int domain_relinquish_resources(struct domain *d)
+ if ( ret )
+ return ret;
+ 
++ PROGRESS(p2m_pool):
++ ret = p2m_teardown_allocation(d);
++ if( ret )
++ return ret;
++
+ PROGRESS(done):
+ break;
+ 
+diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
+index 26c13429488d..df0ec84f034c 100644
+--- a/xen/arch/arm/domain_build.c
++++ b/xen/arch/arm/domain_build.c
+@@ -2333,6 +2333,21 @@ static void __init find_gnttab_region(struct domain *d,
+ kinfo->gnttab_start, kinfo->gnttab_start + kinfo->gnttab_size);
+ }
+ 
++static unsigned long __init domain_p2m_pages(unsigned long maxmem_kb,
++ unsigned int smp_cpus)
++{
++ /*
++ * Keep in sync with libxl__get_required_paging_memory().
++ * 256 pages (1MB) per vcpu, plus 1 page per MiB of RAM for the P2M map,
++ * plus 128 pages to cover extended regions. 
++ */ ++ unsigned long memkb = 4 * (256 * smp_cpus + (maxmem_kb / 1024) + 128); ++ ++ BUILD_BUG_ON(PAGE_SIZE != SZ_4K); ++ ++ return DIV_ROUND_UP(memkb, 1024) << (20 - PAGE_SHIFT); ++} ++ + static int __init construct_domain(struct domain *d, struct kernel_info *kinfo) + { + unsigned int i; +@@ -2424,6 +2439,8 @@ static int __init construct_domU(struct domain *d, + struct kernel_info kinfo = {}; + int rc; + u64 mem; ++ u32 p2m_mem_mb; ++ unsigned long p2m_pages; + + rc = dt_property_read_u64(node, "memory", &mem); + if ( !rc ) +@@ -2433,6 +2450,18 @@ static int __init construct_domU(struct domain *d, + } + kinfo.unassigned_mem = (paddr_t)mem * SZ_1K; + ++ rc = dt_property_read_u32(node, "xen,domain-p2m-mem-mb", &p2m_mem_mb); ++ /* If xen,domain-p2m-mem-mb is not specified, use the default value. */ ++ p2m_pages = rc ? ++ p2m_mem_mb << (20 - PAGE_SHIFT) : ++ domain_p2m_pages(mem, d->max_vcpus); ++ ++ spin_lock(&d->arch.paging.lock); ++ rc = p2m_set_allocation(d, p2m_pages, NULL); ++ spin_unlock(&d->arch.paging.lock); ++ if ( rc != 0 ) ++ return rc; ++ + printk("*** LOADING DOMU cpus=%u memory=%"PRIx64"KB ***\n", d->max_vcpus, mem); + + kinfo.vpl011 = dt_property_read_bool(node, "vpl011"); +diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c +index a049bc7f3e52..4ab5ed4ab24d 100644 +--- a/xen/arch/arm/domctl.c ++++ b/xen/arch/arm/domctl.c +@@ -48,6 +48,9 @@ static int handle_vuart_init(struct domain *d, + static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, + XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) + { ++ long rc; ++ bool preempted = false; ++ + if ( unlikely(d == current->domain) ) + { + printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n"); +@@ -64,9 +67,27 @@ static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, + switch ( sc->op ) + { + case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: +- return 0; ++ { ++ /* Allow and handle preemption */ ++ spin_lock(&d->arch.paging.lock); ++ rc = p2m_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted); ++ spin_unlock(&d->arch.paging.lock); ++ ++ if ( preempted ) ++ /* Not finished. Set up to re-run the call. */ ++ rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h", ++ u_domctl); ++ else ++ /* Finished. Return the new allocation. */ ++ sc->mb = p2m_get_allocation(d); ++ ++ return rc; ++ } + case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: ++ { ++ sc->mb = p2m_get_allocation(d); + return 0; ++ } + default: + { + printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op); +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index 6883d8627702..c1055ff2a745 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -50,6 +50,54 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn) + return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48)); + } + ++static struct page_info *p2m_alloc_page(struct domain *d) ++{ ++ struct page_info *pg; ++ ++ spin_lock(&d->arch.paging.lock); ++ /* ++ * For hardware domain, there should be no limit in the number of pages that ++ * can be allocated, so that the kernel may take advantage of the extended ++ * regions. Hence, allocate p2m pages for hardware domains from heap. 
++ */ ++ if ( is_hardware_domain(d) ) ++ { ++ pg = alloc_domheap_page(NULL, 0); ++ if ( pg == NULL ) ++ { ++ printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n"); ++ spin_unlock(&d->arch.paging.lock); ++ return NULL; ++ } ++ } ++ else ++ { ++ pg = page_list_remove_head(&d->arch.paging.p2m_freelist); ++ if ( unlikely(!pg) ) ++ { ++ spin_unlock(&d->arch.paging.lock); ++ return NULL; ++ } ++ d->arch.paging.p2m_total_pages--; ++ } ++ spin_unlock(&d->arch.paging.lock); ++ ++ return pg; ++} ++ ++static void p2m_free_page(struct domain *d, struct page_info *pg) ++{ ++ spin_lock(&d->arch.paging.lock); ++ if ( is_hardware_domain(d) ) ++ free_domheap_page(pg); ++ else ++ { ++ d->arch.paging.p2m_total_pages++; ++ page_list_add_tail(pg, &d->arch.paging.p2m_freelist); ++ } ++ spin_unlock(&d->arch.paging.lock); ++} ++ + /* Return the size of the pool, rounded up to the nearest MB */ + unsigned int p2m_get_allocation(struct domain *d) + { +@@ -751,7 +799,7 @@ static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry) + + ASSERT(!p2m_is_valid(*entry)); + +- page = alloc_domheap_page(NULL, 0); ++ page = p2m_alloc_page(p2m->domain); + if ( page == NULL ) + return -ENOMEM; + +@@ -878,7 +926,7 @@ static void p2m_free_entry(struct p2m_domain *p2m, + pg = mfn_to_page(mfn); + + page_list_del(pg, &p2m->pages); +- free_domheap_page(pg); ++ p2m_free_page(p2m->domain, pg); + } + + static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry, +@@ -902,7 +950,7 @@ static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry, + ASSERT(level < target); + ASSERT(p2m_is_superpage(*entry, level)); + +- page = alloc_domheap_page(NULL, 0); ++ page = p2m_alloc_page(p2m->domain); + if ( !page ) + return false; + +@@ -1644,7 +1692,7 @@ int p2m_teardown(struct domain *d) + + while ( (pg = page_list_remove_head(&p2m->pages)) ) + { +- free_domheap_page(pg); ++ p2m_free_page(p2m->domain, pg); + count++; + /* Arbitrarily preempt every 512 iterations */ + if ( !(count % 512) && hypercall_preempt_check() ) +@@ -1668,6 +1716,7 @@ void p2m_final_teardown(struct domain *d) + return; + + ASSERT(page_list_empty(&p2m->pages)); ++ ASSERT(page_list_empty(&d->arch.paging.p2m_freelist)); + + if ( p2m->root ) + free_domheap_pages(p2m->root, P2M_ROOT_ORDER); +-- +2.37.3 + diff --git a/0055-gnttab-correct-locking-on-transitive-grant-copy-erro.patch b/0055-gnttab-correct-locking-on-transitive-grant-copy-erro.patch new file mode 100644 index 0000000..be83ce5 --- /dev/null +++ b/0055-gnttab-correct-locking-on-transitive-grant-copy-erro.patch @@ -0,0 +1,66 @@ +From bb43a10fefe494ab747b020fef3e823b63fc566d Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 11 Oct 2022 15:11:01 +0200 +Subject: [PATCH 55/67] gnttab: correct locking on transitive grant copy error + path + +While the comment next to the lock dropping in preparation of +recursively calling acquire_grant_for_copy() mistakenly talks about the +rd == td case (excluded a few lines further up), the same concerns apply +to the calling of release_grant_for_copy() on a subsequent error path. + +This is CVE-2022-33748 / XSA-411. 
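Reduced to its locking skeleton, the fix applies the same drop-and-reacquire pattern already used around the recursive acquire (a sketch; the full context is in the hunk below):

    /* Drop the locks before calling code that may take them again... */
    active_entry_release(act);
    grant_read_unlock(rgt);

    release_grant_for_copy(td, trans_gref, readonly);
    rcu_unlock_domain(td);

    /* ...then reacquire them before touching the active entry again. */
    grant_read_lock(rgt);
    act = active_entry_acquire(rgt, gref);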
+
+Fixes: ad48fb963dbf ("gnttab: fix transitive grant handling")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+master commit: 6e3aab858eef614a21a782a3b73acc88e74690ea
+master date: 2022-10-11 14:29:30 +0200
+---
+ xen/common/grant_table.c | 19 ++++++++++++++++---
+ 1 file changed, 16 insertions(+), 3 deletions(-)
+
+diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
+index 77bba9806937..0523beb9b734 100644
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -2608,9 +2608,8 @@ acquire_grant_for_copy(
+ trans_domid);
+ 
+ /*
+- * acquire_grant_for_copy() could take the lock on the
+- * remote table (if rd == td), so we have to drop the lock
+- * here and reacquire.
++ * acquire_grant_for_copy() will take the lock on the remote table,
++ * so we have to drop the lock here and reacquire.
+ */
+ active_entry_release(act);
+ grant_read_unlock(rgt);
+@@ -2647,11 +2646,25 @@ acquire_grant_for_copy(
+ act->trans_gref != trans_gref ||
+ !act->is_sub_page)) )
+ {
++ /*
++ * Like above for acquire_grant_for_copy() we need to drop and then
++ * re-acquire the locks here to prevent lock order inversion issues.
++ * Unlike for acquire_grant_for_copy() we don't need to re-check
++ * anything, as release_grant_for_copy() doesn't depend on the grant
++ * table entry: It only updates internal state and the status flags.
++ */
++ active_entry_release(act);
++ grant_read_unlock(rgt);
++
+ release_grant_for_copy(td, trans_gref, readonly);
+ rcu_unlock_domain(td);
++
++ grant_read_lock(rgt);
++ act = active_entry_acquire(rgt, gref);
+ reduce_status_for_pin(rd, act, status, readonly);
+ active_entry_release(act);
+ grant_read_unlock(rgt);
++
+ put_page(*page);
+ *page = NULL;
+ return ERESTART;
+-- 
+2.37.3
+
diff --git a/0056-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch b/0056-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch
new file mode 100644
index 0000000..c5d2c9c
--- /dev/null
+++ b/0056-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch
@@ -0,0 +1,112 @@
+From d65ebacb78901b695bc5e8a075ad1ad865a78928 Mon Sep 17 00:00:00 2001
+From: Anthony PERARD <anthony.perard@citrix.com>
+Date: Tue, 11 Oct 2022 15:13:15 +0200
+Subject: [PATCH 56/67] tools/libxl: Replace deprecated -soundhw on QEMU
+ command line
+
+-soundhw is deprecated since 825ff02911c9 ("audio: add soundhw
+deprecation notice"), QEMU v5.1, and has been removed for the upcoming
+v7.1 by 039a68373c45 ("introduce -audio as a replacement for -soundhw").
+
+Instead we can just add the sound card with "-device", for most options
+that "-soundhw" could handle. "-device" is an option that existed
+before QEMU 1.0, and could already be used to add audio hardware.
+
+The list of possible options for libxl's "soundhw" is taken from the
+list in QEMU 7.0.
+
+The options for "soundhw" are listed in order of preference in the
+manual. The first three (hda, ac97, es1370) are PCI devices and easy
+to test on Linux, and the last four are ISA devices which don't seem
+to work out of the box on Linux.
+
+The sound card 'pcspk' isn't listed even though it used to be accepted
+by '-soundhw', because QEMU crashes when trying to add it to a Xen
+domain. Also, it wouldn't work with "-device"; it might need to be
+"-machine pcspk-audiodev=default" instead. 
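Concretely, for the soundhw="hda" case the device model invocation changes roughly as follows (illustrative command lines, not verbatim libxl output; hda is the one card needing two devices, the others map 1:1 onto their QEMU device alias):

    before: qemu-system-i386 ... -soundhw hda
    after:  qemu-system-i386 ... -device intel-hda -device hda-duplex
    (e.g. soundhw="ac97" simply becomes -device ac97)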
+
+Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
+Reviewed-by: Jason Andryuk <jandryuk@gmail.com>
+master commit: 62ca138c2c052187783aca3957d3f47c4dcfd683
+master date: 2022-08-18 09:25:50 +0200
+---
+ docs/man/xl.cfg.5.pod.in | 6 +++---
+ tools/libs/light/libxl_dm.c | 19 ++++++++++++++++++-
+ tools/libs/light/libxl_types_internal.idl | 10 ++++++++++
+ 3 files changed, 31 insertions(+), 4 deletions(-)
+
+diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in
+index af7fae7c52f9..ef9505f91341 100644
+--- a/docs/man/xl.cfg.5.pod.in
++++ b/docs/man/xl.cfg.5.pod.in
+@@ -2523,9 +2523,9 @@ The form serial=DEVICE is also accepted for backwards compatibility.
+ 
+ =item B<soundhw="DEVICE">
+ 
+-Select the virtual sound card to expose to the guest. The valid
+-devices are defined by the device model configuration, please see the
+-B<qemu(1)> manpage for details. The default is not to export any sound
++Select the virtual sound card to expose to the guest. The valid devices are
++B<hda>, B<ac97>, B<es1370>, B<adlib>, B<cs4231a>, B<gus>, B<sb16> if they are
++available with the device model QEMU. The default is not to export any sound
+ device.
+ 
+ =item B<vkb_device=BOOLEAN>
+diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c
+index ae5f35e0c3fd..b86e8ccc858f 100644
+--- a/tools/libs/light/libxl_dm.c
++++ b/tools/libs/light/libxl_dm.c
+@@ -1204,6 +1204,7 @@ static int libxl__build_device_model_args_new(libxl__gc *gc,
+ uint64_t ram_size;
+ const char *path, *chardev;
+ bool is_stubdom = libxl_defbool_val(b_info->device_model_stubdomain);
++ int rc;
+ 
+ dm_args = flexarray_make(gc, 16, 1);
+ dm_envs = flexarray_make(gc, 16, 1);
+@@ -1531,7 +1532,23 @@ static int libxl__build_device_model_args_new(libxl__gc *gc,
+ }
+ }
+ if (b_info->u.hvm.soundhw) {
+- flexarray_vappend(dm_args, "-soundhw", b_info->u.hvm.soundhw, NULL);
++ libxl__qemu_soundhw soundhw;
++
++ rc = libxl__qemu_soundhw_from_string(b_info->u.hvm.soundhw, &soundhw);
++ if (rc) {
++ LOGD(ERROR, guest_domid, "Unknown soundhw option '%s'", b_info->u.hvm.soundhw);
++ return ERROR_INVAL;
++ }
++
++ switch (soundhw) {
++ case LIBXL__QEMU_SOUNDHW_HDA:
++ flexarray_vappend(dm_args, "-device", "intel-hda",
++ "-device", "hda-duplex", NULL);
++ break;
++ default:
++ flexarray_append_pair(dm_args, "-device",
++ (char*)libxl__qemu_soundhw_to_string(soundhw));
++ }
+ }
+ if (!libxl__acpi_defbool_val(b_info)) {
+ flexarray_append(dm_args, "-no-acpi");
+diff --git a/tools/libs/light/libxl_types_internal.idl b/tools/libs/light/libxl_types_internal.idl
+index 3593e21dbb64..caa08d3229cd 100644
+--- a/tools/libs/light/libxl_types_internal.idl
++++ b/tools/libs/light/libxl_types_internal.idl
+@@ -55,3 +55,13 @@ libxl__device_action = Enumeration("device_action", [
+ (1, "ADD"),
+ (2, "REMOVE"),
+ ])
++
++libxl__qemu_soundhw = Enumeration("qemu_soundhw", [
++ (1, "ac97"),
++ (2, "adlib"),
++ (3, "cs4231a"),
++ (4, "es1370"),
++ (5, "gus"),
++ (6, "hda"),
++ (7, "sb16"),
++ ])
+-- 
+2.37.3
+
diff --git a/0057-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch b/0057-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch
new file mode 100644
index 0000000..9b1cce8
--- /dev/null
+++ b/0057-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch
@@ -0,0 +1,44 @@
+From 7923ea47e578bca30a6e45951a9da09e827ff028 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 11 Oct 2022 15:14:05 +0200
+Subject: [PATCH 57/67] x86/CPUID: surface suitable value in EBX of XSTATE
+ subleaf 1
+
+While the 
SDM isn't very clear about this, our present behavior makes
+Linux 5.19 unhappy. As of commit 8ad7e8f69695 ("x86/fpu/xsave: Support
+XSAVEC in the kernel") they're using this CPUID output also to size
+the compacted area used by XSAVEC. Getting back zero there isn't really
+liked, yet for PV that's the default on capable hardware: XSAVES isn't
+exposed to PV domains.
+
+Considering that the size reported is that of the compacted save area,
+I view Linux's assumption as appropriate (short of the SDM properly
+considering the case). Therefore we need to populate the field also when
+only XSAVEC is supported for a guest.
+
+Fixes: 460b9a4b3630 ("x86/xsaves: enable xsaves/xrstors for hvm guest")
+Fixes: 8d050ed1097c ("x86: don't expose XSAVES capability to PV guests")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: c3bd0b83ea5b7c0da6542687436042eeea1e7909
+master date: 2022-08-24 14:23:59 +0200
+---
+ xen/arch/x86/cpuid.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
+index ee2c4ea03a89..11c95178f110 100644
+--- a/xen/arch/x86/cpuid.c
++++ b/xen/arch/x86/cpuid.c
+@@ -1052,7 +1052,7 @@ void guest_cpuid(const struct vcpu *v, uint32_t leaf,
+ switch ( subleaf )
+ {
+ case 1:
+- if ( p->xstate.xsaves )
++ if ( p->xstate.xsavec || p->xstate.xsaves )
+ {
+ /*
+ * TODO: Figure out what to do for XSS state. VT-x manages
+-- 
+2.37.3
+
diff --git a/0058-xen-sched-introduce-cpupool_update_node_affinity.patch b/0058-xen-sched-introduce-cpupool_update_node_affinity.patch
new file mode 100644
index 0000000..c15edb8
--- /dev/null
+++ b/0058-xen-sched-introduce-cpupool_update_node_affinity.patch
@@ -0,0 +1,257 @@
+From 735b10844489babf52d3193193285a7311cf2c39 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 11 Oct 2022 15:14:22 +0200
+Subject: [PATCH 58/67] xen/sched: introduce cpupool_update_node_affinity()
+
+For updating the node affinities of all domains in a cpupool, add a new
+function cpupool_update_node_affinity().
+
+In order to avoid multiple allocations of cpumasks, carve out memory
+allocation and freeing from domain_update_node_affinity() into new
+helpers, which can be used by cpupool_update_node_affinity().
+
+Modify domain_update_node_affinity() to take an additional parameter
+for passing the allocated memory in and to allocate and free the memory
+via the new helpers in case NULL was passed.
+
+This will help later to pre-allocate the cpumasks in order to avoid
+allocations in stop-machine context. 
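The resulting call pattern, condensed from the cpupool_update_node_affinity() body in the hunks below (allocate the scratch masks once, reuse them for every domain in the pool, free them at the end):

    struct affinity_masks masks;
    struct domain *d;

    if ( !alloc_affinity_masks(&masks) )   /* one allocation... */
        return;

    rcu_read_lock(&domlist_read_lock);
    for_each_domain_in_cpupool(d, c)
        domain_update_node_aff(d, &masks); /* ...reused per domain */
    rcu_read_unlock(&domlist_read_lock);

    free_affinity_masks(&masks);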
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +Tested-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: a83fa1e2b96ace65b45dde6954d67012633a082b +master date: 2022-09-05 11:42:30 +0100 +--- + xen/common/sched/core.c | 54 ++++++++++++++++++++++++++------------ + xen/common/sched/cpupool.c | 39 +++++++++++++++------------ + xen/common/sched/private.h | 7 +++++ + xen/include/xen/sched.h | 9 ++++++- + 4 files changed, 74 insertions(+), 35 deletions(-) + +diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c +index f07bd2681fcb..065a83eca912 100644 +--- a/xen/common/sched/core.c ++++ b/xen/common/sched/core.c +@@ -1824,9 +1824,28 @@ int vcpu_affinity_domctl(struct domain *d, uint32_t cmd, + return ret; + } + +-void domain_update_node_affinity(struct domain *d) ++bool alloc_affinity_masks(struct affinity_masks *affinity) + { +- cpumask_var_t dom_cpumask, dom_cpumask_soft; ++ if ( !alloc_cpumask_var(&affinity->hard) ) ++ return false; ++ if ( !alloc_cpumask_var(&affinity->soft) ) ++ { ++ free_cpumask_var(affinity->hard); ++ return false; ++ } ++ ++ return true; ++} ++ ++void free_affinity_masks(struct affinity_masks *affinity) ++{ ++ free_cpumask_var(affinity->soft); ++ free_cpumask_var(affinity->hard); ++} ++ ++void domain_update_node_aff(struct domain *d, struct affinity_masks *affinity) ++{ ++ struct affinity_masks masks; + cpumask_t *dom_affinity; + const cpumask_t *online; + struct sched_unit *unit; +@@ -1836,14 +1855,16 @@ void domain_update_node_affinity(struct domain *d) + if ( !d->vcpu || !d->vcpu[0] ) + return; + +- if ( !zalloc_cpumask_var(&dom_cpumask) ) +- return; +- if ( !zalloc_cpumask_var(&dom_cpumask_soft) ) ++ if ( !affinity ) + { +- free_cpumask_var(dom_cpumask); +- return; ++ affinity = &masks; ++ if ( !alloc_affinity_masks(affinity) ) ++ return; + } + ++ cpumask_clear(affinity->hard); ++ cpumask_clear(affinity->soft); ++ + online = cpupool_domain_master_cpumask(d); + + spin_lock(&d->node_affinity_lock); +@@ -1864,22 +1885,21 @@ void domain_update_node_affinity(struct domain *d) + */ + for_each_sched_unit ( d, unit ) + { +- cpumask_or(dom_cpumask, dom_cpumask, unit->cpu_hard_affinity); +- cpumask_or(dom_cpumask_soft, dom_cpumask_soft, +- unit->cpu_soft_affinity); ++ cpumask_or(affinity->hard, affinity->hard, unit->cpu_hard_affinity); ++ cpumask_or(affinity->soft, affinity->soft, unit->cpu_soft_affinity); + } + /* Filter out non-online cpus */ +- cpumask_and(dom_cpumask, dom_cpumask, online); +- ASSERT(!cpumask_empty(dom_cpumask)); ++ cpumask_and(affinity->hard, affinity->hard, online); ++ ASSERT(!cpumask_empty(affinity->hard)); + /* And compute the intersection between hard, online and soft */ +- cpumask_and(dom_cpumask_soft, dom_cpumask_soft, dom_cpumask); ++ cpumask_and(affinity->soft, affinity->soft, affinity->hard); + + /* + * If not empty, the intersection of hard, soft and online is the + * narrowest set we want. If empty, we fall back to hard&online. + */ +- dom_affinity = cpumask_empty(dom_cpumask_soft) ? +- dom_cpumask : dom_cpumask_soft; ++ dom_affinity = cpumask_empty(affinity->soft) ? 
affinity->hard ++ : affinity->soft; + + nodes_clear(d->node_affinity); + for_each_cpu ( cpu, dom_affinity ) +@@ -1888,8 +1908,8 @@ void domain_update_node_affinity(struct domain *d) + + spin_unlock(&d->node_affinity_lock); + +- free_cpumask_var(dom_cpumask_soft); +- free_cpumask_var(dom_cpumask); ++ if ( affinity == &masks ) ++ free_affinity_masks(affinity); + } + + typedef long ret_t; +diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c +index 8c6e6eb9ccd5..45b6ff99561a 100644 +--- a/xen/common/sched/cpupool.c ++++ b/xen/common/sched/cpupool.c +@@ -401,6 +401,25 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c) + return ret; + } + ++/* Update affinities of all domains in a cpupool. */ ++static void cpupool_update_node_affinity(const struct cpupool *c) ++{ ++ struct affinity_masks masks; ++ struct domain *d; ++ ++ if ( !alloc_affinity_masks(&masks) ) ++ return; ++ ++ rcu_read_lock(&domlist_read_lock); ++ ++ for_each_domain_in_cpupool(d, c) ++ domain_update_node_aff(d, &masks); ++ ++ rcu_read_unlock(&domlist_read_lock); ++ ++ free_affinity_masks(&masks); ++} ++ + /* + * assign a specific cpu to a cpupool + * cpupool_lock must be held +@@ -408,7 +427,6 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c) + static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) + { + int ret; +- struct domain *d; + const cpumask_t *cpus; + + cpus = sched_get_opt_cpumask(c->gran, cpu); +@@ -433,12 +451,7 @@ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) + + rcu_read_unlock(&sched_res_rculock); + +- rcu_read_lock(&domlist_read_lock); +- for_each_domain_in_cpupool(d, c) +- { +- domain_update_node_affinity(d); +- } +- rcu_read_unlock(&domlist_read_lock); ++ cpupool_update_node_affinity(c); + + return 0; + } +@@ -447,18 +460,14 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) + { + int cpu = cpupool_moving_cpu; + const cpumask_t *cpus; +- struct domain *d; + int ret; + + if ( c != cpupool_cpu_moving ) + return -EADDRNOTAVAIL; + +- /* +- * We need this for scanning the domain list, both in +- * cpu_disable_scheduler(), and at the bottom of this function. 
+- */ + rcu_read_lock(&domlist_read_lock); + ret = cpu_disable_scheduler(cpu); ++ rcu_read_unlock(&domlist_read_lock); + + rcu_read_lock(&sched_res_rculock); + cpus = get_sched_res(cpu)->cpus; +@@ -485,11 +494,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) + } + rcu_read_unlock(&sched_res_rculock); + +- for_each_domain_in_cpupool(d, c) +- { +- domain_update_node_affinity(d); +- } +- rcu_read_unlock(&domlist_read_lock); ++ cpupool_update_node_affinity(c); + + return ret; + } +diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h +index 92d0d4961063..6e036f8c8077 100644 +--- a/xen/common/sched/private.h ++++ b/xen/common/sched/private.h +@@ -593,6 +593,13 @@ affinity_balance_cpumask(const struct sched_unit *unit, int step, + cpumask_copy(mask, unit->cpu_hard_affinity); + } + ++struct affinity_masks { ++ cpumask_var_t hard; ++ cpumask_var_t soft; ++}; ++ ++bool alloc_affinity_masks(struct affinity_masks *affinity); ++void free_affinity_masks(struct affinity_masks *affinity); + void sched_rm_cpu(unsigned int cpu); + const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu); + void schedule_dump(struct cpupool *c); +diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h +index 701963f84cb8..4e25627d9685 100644 +--- a/xen/include/xen/sched.h ++++ b/xen/include/xen/sched.h +@@ -649,8 +649,15 @@ static inline void get_knownalive_domain(struct domain *d) + ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED)); + } + ++struct affinity_masks; ++ + int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity); +-void domain_update_node_affinity(struct domain *d); ++void domain_update_node_aff(struct domain *d, struct affinity_masks *affinity); ++ ++static inline void domain_update_node_affinity(struct domain *d) ++{ ++ domain_update_node_aff(d, NULL); ++} + + /* + * To be implemented by each architecture, sanity checking the configuration +-- +2.37.3 + diff --git a/0059-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch b/0059-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch new file mode 100644 index 0000000..587eef7 --- /dev/null +++ b/0059-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch @@ -0,0 +1,263 @@ +From d638c2085f71f694344b34e70eb1b371c86b00f0 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 11 Oct 2022 15:15:14 +0200 +Subject: [PATCH 59/67] xen/sched: carve out memory allocation and freeing from + schedule_cpu_rm() + +In order to prepare not allocating or freeing memory from +schedule_cpu_rm(), move this functionality to dedicated functions. + +For now call those functions from schedule_cpu_rm(). + +No change of behavior expected. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: d42be6f83480b3ada286dc18444331a816be88a3 +master date: 2022-09-05 11:42:30 +0100 +--- + xen/common/sched/core.c | 143 ++++++++++++++++++++++--------------- + xen/common/sched/private.h | 11 +++ + 2 files changed, 98 insertions(+), 56 deletions(-) + +diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c +index 065a83eca912..2decb1161a63 100644 +--- a/xen/common/sched/core.c ++++ b/xen/common/sched/core.c +@@ -3221,6 +3221,75 @@ out: + return ret; + } + ++/* ++ * Allocate all memory needed for free_cpu_rm_data(), as allocations cannot ++ * be made in stop_machine() context. 
++ * ++ * Between alloc_cpu_rm_data() and the real cpu removal action the relevant ++ * contents of struct sched_resource can't change, as the cpu in question is ++ * locked against any other movement to or from cpupools, and the data copied ++ * by alloc_cpu_rm_data() is modified only in case the cpu in question is ++ * being moved from or to a cpupool. ++ */ ++struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) ++{ ++ struct cpu_rm_data *data; ++ const struct sched_resource *sr; ++ unsigned int idx; ++ ++ rcu_read_lock(&sched_res_rculock); ++ ++ sr = get_sched_res(cpu); ++ data = xmalloc_flex_struct(struct cpu_rm_data, sr, sr->granularity - 1); ++ if ( !data ) ++ goto out; ++ ++ data->old_ops = sr->scheduler; ++ data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv; ++ data->ppriv_old = sr->sched_priv; ++ ++ for ( idx = 0; idx < sr->granularity - 1; idx++ ) ++ { ++ data->sr[idx] = sched_alloc_res(); ++ if ( data->sr[idx] ) ++ { ++ data->sr[idx]->sched_unit_idle = sched_alloc_unit_mem(); ++ if ( !data->sr[idx]->sched_unit_idle ) ++ { ++ sched_res_free(&data->sr[idx]->rcu); ++ data->sr[idx] = NULL; ++ } ++ } ++ if ( !data->sr[idx] ) ++ { ++ while ( idx > 0 ) ++ sched_res_free(&data->sr[--idx]->rcu); ++ XFREE(data); ++ goto out; ++ } ++ ++ data->sr[idx]->curr = data->sr[idx]->sched_unit_idle; ++ data->sr[idx]->scheduler = &sched_idle_ops; ++ data->sr[idx]->granularity = 1; ++ ++ /* We want the lock not to change when replacing the resource. */ ++ data->sr[idx]->schedule_lock = sr->schedule_lock; ++ } ++ ++ out: ++ rcu_read_unlock(&sched_res_rculock); ++ ++ return data; ++} ++ ++void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu) ++{ ++ sched_free_udata(mem->old_ops, mem->vpriv_old); ++ sched_free_pdata(mem->old_ops, mem->ppriv_old, cpu); ++ ++ xfree(mem); ++} ++ + /* + * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops + * (the idle scheduler). +@@ -3229,53 +3298,23 @@ out: + */ + int schedule_cpu_rm(unsigned int cpu) + { +- void *ppriv_old, *vpriv_old; +- struct sched_resource *sr, **sr_new = NULL; ++ struct sched_resource *sr; ++ struct cpu_rm_data *data; + struct sched_unit *unit; +- struct scheduler *old_ops; + spinlock_t *old_lock; + unsigned long flags; +- int idx, ret = -ENOMEM; ++ int idx = 0; + unsigned int cpu_iter; + ++ data = alloc_cpu_rm_data(cpu); ++ if ( !data ) ++ return -ENOMEM; ++ + rcu_read_lock(&sched_res_rculock); + + sr = get_sched_res(cpu); +- old_ops = sr->scheduler; + +- if ( sr->granularity > 1 ) +- { +- sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1); +- if ( !sr_new ) +- goto out; +- for ( idx = 0; idx < sr->granularity - 1; idx++ ) +- { +- sr_new[idx] = sched_alloc_res(); +- if ( sr_new[idx] ) +- { +- sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem(); +- if ( !sr_new[idx]->sched_unit_idle ) +- { +- sched_res_free(&sr_new[idx]->rcu); +- sr_new[idx] = NULL; +- } +- } +- if ( !sr_new[idx] ) +- { +- for ( idx--; idx >= 0; idx-- ) +- sched_res_free(&sr_new[idx]->rcu); +- goto out; +- } +- sr_new[idx]->curr = sr_new[idx]->sched_unit_idle; +- sr_new[idx]->scheduler = &sched_idle_ops; +- sr_new[idx]->granularity = 1; +- +- /* We want the lock not to change when replacing the resource. 
*/ +- sr_new[idx]->schedule_lock = sr->schedule_lock; +- } +- } +- +- ret = 0; ++ ASSERT(sr->granularity); + ASSERT(sr->cpupool != NULL); + ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus)); + ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid)); +@@ -3283,10 +3322,6 @@ int schedule_cpu_rm(unsigned int cpu) + /* See comment in schedule_cpu_add() regarding lock switching. */ + old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); + +- vpriv_old = idle_vcpu[cpu]->sched_unit->priv; +- ppriv_old = sr->sched_priv; +- +- idx = 0; + for_each_cpu ( cpu_iter, sr->cpus ) + { + per_cpu(sched_res_idx, cpu_iter) = 0; +@@ -3300,27 +3335,27 @@ int schedule_cpu_rm(unsigned int cpu) + else + { + /* Initialize unit. */ +- unit = sr_new[idx]->sched_unit_idle; +- unit->res = sr_new[idx]; ++ unit = data->sr[idx]->sched_unit_idle; ++ unit->res = data->sr[idx]; + unit->is_running = true; + sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]); + sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain); + + /* Adjust cpu masks of resources (old and new). */ + cpumask_clear_cpu(cpu_iter, sr->cpus); +- cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus); ++ cpumask_set_cpu(cpu_iter, data->sr[idx]->cpus); + cpumask_set_cpu(cpu_iter, &sched_res_mask); + + /* Init timer. */ +- init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter); ++ init_timer(&data->sr[idx]->s_timer, s_timer_fn, NULL, cpu_iter); + + /* Last resource initializations and insert resource pointer. */ +- sr_new[idx]->master_cpu = cpu_iter; +- set_sched_res(cpu_iter, sr_new[idx]); ++ data->sr[idx]->master_cpu = cpu_iter; ++ set_sched_res(cpu_iter, data->sr[idx]); + + /* Last action: set the new lock pointer. */ + smp_mb(); +- sr_new[idx]->schedule_lock = &sched_free_cpu_lock; ++ data->sr[idx]->schedule_lock = &sched_free_cpu_lock; + + idx++; + } +@@ -3336,16 +3371,12 @@ int schedule_cpu_rm(unsigned int cpu) + /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */ + spin_unlock_irqrestore(old_lock, flags); + +- sched_deinit_pdata(old_ops, ppriv_old, cpu); ++ sched_deinit_pdata(data->old_ops, data->ppriv_old, cpu); + +- sched_free_udata(old_ops, vpriv_old); +- sched_free_pdata(old_ops, ppriv_old, cpu); +- +-out: + rcu_read_unlock(&sched_res_rculock); +- xfree(sr_new); ++ free_cpu_rm_data(data, cpu); + +- return ret; ++ return 0; + } + + struct scheduler *scheduler_get_default(void) +diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h +index 6e036f8c8077..ff3185425219 100644 +--- a/xen/common/sched/private.h ++++ b/xen/common/sched/private.h +@@ -600,6 +600,15 @@ struct affinity_masks { + + bool alloc_affinity_masks(struct affinity_masks *affinity); + void free_affinity_masks(struct affinity_masks *affinity); ++ ++/* Memory allocation related data for schedule_cpu_rm(). 
*/ ++struct cpu_rm_data { ++ const struct scheduler *old_ops; ++ void *ppriv_old; ++ void *vpriv_old; ++ struct sched_resource *sr[]; ++}; ++ + void sched_rm_cpu(unsigned int cpu); + const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu); + void schedule_dump(struct cpupool *c); +@@ -608,6 +617,8 @@ struct scheduler *scheduler_alloc(unsigned int sched_id); + void scheduler_free(struct scheduler *sched); + int cpu_disable_scheduler(unsigned int cpu); + int schedule_cpu_add(unsigned int cpu, struct cpupool *c); ++struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu); ++void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu); + int schedule_cpu_rm(unsigned int cpu); + int sched_move_domain(struct domain *d, struct cpupool *c); + struct cpupool *cpupool_get_by_id(unsigned int poolid); +-- +2.37.3 + diff --git a/0060-xen-sched-fix-cpu-hotplug.patch b/0060-xen-sched-fix-cpu-hotplug.patch new file mode 100644 index 0000000..3e158f4 --- /dev/null +++ b/0060-xen-sched-fix-cpu-hotplug.patch @@ -0,0 +1,307 @@ +From d17680808b4c8015e31070c971e1ee548170ae34 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 11 Oct 2022 15:15:41 +0200 +Subject: [PATCH 60/67] xen/sched: fix cpu hotplug + +Cpu unplugging is calling schedule_cpu_rm() via stop_machine_run() with +interrupts disabled, thus any memory allocation or freeing must be +avoided. + +Since commit 5047cd1d5dea ("xen/common: Use enhanced +ASSERT_ALLOC_CONTEXT in xmalloc()") this restriction is being enforced +via an assertion, which will now fail. + +Fix this by allocating needed memory before entering stop_machine_run() +and freeing any memory only after having finished stop_machine_run(). + +Fixes: 1ec410112cdd ("xen/sched: support differing granularity in schedule_cpu_[add/rm]()") +Reported-by: Gao Ruifeng <ruifeng.gao@intel.com> +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +Tested-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: d84473689611eed32fd90b27e614f28af767fa3f +master date: 2022-09-05 11:42:30 +0100 +--- + xen/common/sched/core.c | 25 +++++++++++--- + xen/common/sched/cpupool.c | 69 +++++++++++++++++++++++++++++--------- + xen/common/sched/private.h | 5 +-- + 3 files changed, 77 insertions(+), 22 deletions(-) + +diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c +index 2decb1161a63..900aab8f66a7 100644 +--- a/xen/common/sched/core.c ++++ b/xen/common/sched/core.c +@@ -3231,7 +3231,7 @@ out: + * by alloc_cpu_rm_data() is modified only in case the cpu in question is + * being moved from or to a cpupool. 
+ */ +-struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) ++struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu, bool aff_alloc) + { + struct cpu_rm_data *data; + const struct sched_resource *sr; +@@ -3244,6 +3244,17 @@ struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) + if ( !data ) + goto out; + ++ if ( aff_alloc ) ++ { ++ if ( !alloc_affinity_masks(&data->affinity) ) ++ { ++ XFREE(data); ++ goto out; ++ } ++ } ++ else ++ memset(&data->affinity, 0, sizeof(data->affinity)); ++ + data->old_ops = sr->scheduler; + data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv; + data->ppriv_old = sr->sched_priv; +@@ -3264,6 +3275,7 @@ struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) + { + while ( idx > 0 ) + sched_res_free(&data->sr[--idx]->rcu); ++ free_affinity_masks(&data->affinity); + XFREE(data); + goto out; + } +@@ -3286,6 +3298,7 @@ void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu) + { + sched_free_udata(mem->old_ops, mem->vpriv_old); + sched_free_pdata(mem->old_ops, mem->ppriv_old, cpu); ++ free_affinity_masks(&mem->affinity); + + xfree(mem); + } +@@ -3296,17 +3309,18 @@ void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu) + * The cpu is already marked as "free" and not valid any longer for its + * cpupool. + */ +-int schedule_cpu_rm(unsigned int cpu) ++int schedule_cpu_rm(unsigned int cpu, struct cpu_rm_data *data) + { + struct sched_resource *sr; +- struct cpu_rm_data *data; + struct sched_unit *unit; + spinlock_t *old_lock; + unsigned long flags; + int idx = 0; + unsigned int cpu_iter; ++ bool free_data = !data; + +- data = alloc_cpu_rm_data(cpu); ++ if ( !data ) ++ data = alloc_cpu_rm_data(cpu, false); + if ( !data ) + return -ENOMEM; + +@@ -3374,7 +3388,8 @@ int schedule_cpu_rm(unsigned int cpu) + sched_deinit_pdata(data->old_ops, data->ppriv_old, cpu); + + rcu_read_unlock(&sched_res_rculock); +- free_cpu_rm_data(data, cpu); ++ if ( free_data ) ++ free_cpu_rm_data(data, cpu); + + return 0; + } +diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c +index 45b6ff99561a..b5a948639aad 100644 +--- a/xen/common/sched/cpupool.c ++++ b/xen/common/sched/cpupool.c +@@ -402,22 +402,28 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c) + } + + /* Update affinities of all domains in a cpupool. */ +-static void cpupool_update_node_affinity(const struct cpupool *c) ++static void cpupool_update_node_affinity(const struct cpupool *c, ++ struct affinity_masks *masks) + { +- struct affinity_masks masks; ++ struct affinity_masks local_masks; + struct domain *d; + +- if ( !alloc_affinity_masks(&masks) ) +- return; ++ if ( !masks ) ++ { ++ if ( !alloc_affinity_masks(&local_masks) ) ++ return; ++ masks = &local_masks; ++ } + + rcu_read_lock(&domlist_read_lock); + + for_each_domain_in_cpupool(d, c) +- domain_update_node_aff(d, &masks); ++ domain_update_node_aff(d, masks); + + rcu_read_unlock(&domlist_read_lock); + +- free_affinity_masks(&masks); ++ if ( masks == &local_masks ) ++ free_affinity_masks(masks); + } + + /* +@@ -451,15 +457,17 @@ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) + + rcu_read_unlock(&sched_res_rculock); + +- cpupool_update_node_affinity(c); ++ cpupool_update_node_affinity(c, NULL); + + return 0; + } + +-static int cpupool_unassign_cpu_finish(struct cpupool *c) ++static int cpupool_unassign_cpu_finish(struct cpupool *c, ++ struct cpu_rm_data *mem) + { + int cpu = cpupool_moving_cpu; + const cpumask_t *cpus; ++ struct affinity_masks *masks = mem ? 
&mem->affinity : NULL; + int ret; + + if ( c != cpupool_cpu_moving ) +@@ -482,7 +490,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) + */ + if ( !ret ) + { +- ret = schedule_cpu_rm(cpu); ++ ret = schedule_cpu_rm(cpu, mem); + if ( ret ) + cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); + else +@@ -494,7 +502,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) + } + rcu_read_unlock(&sched_res_rculock); + +- cpupool_update_node_affinity(c); ++ cpupool_update_node_affinity(c, masks); + + return ret; + } +@@ -558,7 +566,7 @@ static long cpupool_unassign_cpu_helper(void *info) + cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu); + spin_lock(&cpupool_lock); + +- ret = cpupool_unassign_cpu_finish(c); ++ ret = cpupool_unassign_cpu_finish(c, NULL); + + spin_unlock(&cpupool_lock); + debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret); +@@ -701,7 +709,7 @@ static int cpupool_cpu_add(unsigned int cpu) + * This function is called in stop_machine context, so we can be sure no + * non-idle vcpu is active on the system. + */ +-static void cpupool_cpu_remove(unsigned int cpu) ++static void cpupool_cpu_remove(unsigned int cpu, struct cpu_rm_data *mem) + { + int ret; + +@@ -709,7 +717,7 @@ static void cpupool_cpu_remove(unsigned int cpu) + + if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) ) + { +- ret = cpupool_unassign_cpu_finish(cpupool0); ++ ret = cpupool_unassign_cpu_finish(cpupool0, mem); + BUG_ON(ret); + } + cpumask_clear_cpu(cpu, &cpupool_free_cpus); +@@ -775,7 +783,7 @@ static void cpupool_cpu_remove_forced(unsigned int cpu) + { + ret = cpupool_unassign_cpu_start(c, master_cpu); + BUG_ON(ret); +- ret = cpupool_unassign_cpu_finish(c); ++ ret = cpupool_unassign_cpu_finish(c, NULL); + BUG_ON(ret); + } + } +@@ -993,12 +1001,24 @@ void dump_runq(unsigned char key) + static int cpu_callback( + struct notifier_block *nfb, unsigned long action, void *hcpu) + { ++ static struct cpu_rm_data *mem; ++ + unsigned int cpu = (unsigned long)hcpu; + int rc = 0; + + switch ( action ) + { + case CPU_DOWN_FAILED: ++ if ( system_state <= SYS_STATE_active ) ++ { ++ if ( mem ) ++ { ++ free_cpu_rm_data(mem, cpu); ++ mem = NULL; ++ } ++ rc = cpupool_cpu_add(cpu); ++ } ++ break; + case CPU_ONLINE: + if ( system_state <= SYS_STATE_active ) + rc = cpupool_cpu_add(cpu); +@@ -1006,12 +1026,31 @@ static int cpu_callback( + case CPU_DOWN_PREPARE: + /* Suspend/Resume don't change assignments of cpus to cpupools. */ + if ( system_state <= SYS_STATE_active ) ++ { + rc = cpupool_cpu_remove_prologue(cpu); ++ if ( !rc ) ++ { ++ ASSERT(!mem); ++ mem = alloc_cpu_rm_data(cpu, true); ++ rc = mem ? 0 : -ENOMEM; ++ } ++ } + break; + case CPU_DYING: + /* Suspend/Resume don't change assignments of cpus to cpupools. */ + if ( system_state <= SYS_STATE_active ) +- cpupool_cpu_remove(cpu); ++ { ++ ASSERT(mem); ++ cpupool_cpu_remove(cpu, mem); ++ } ++ break; ++ case CPU_DEAD: ++ if ( system_state <= SYS_STATE_active ) ++ { ++ ASSERT(mem); ++ free_cpu_rm_data(mem, cpu); ++ mem = NULL; ++ } + break; + case CPU_RESUME_FAILED: + cpupool_cpu_remove_forced(cpu); +diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h +index ff3185425219..3bab78ccb240 100644 +--- a/xen/common/sched/private.h ++++ b/xen/common/sched/private.h +@@ -603,6 +603,7 @@ void free_affinity_masks(struct affinity_masks *affinity); + + /* Memory allocation related data for schedule_cpu_rm(). 
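(Editor's note, not part of the upstream comment: the affinity member added below carries pre-allocated masks, so the CPU_DYING path can update domain node affinities without allocating memory.)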
*/
+ struct cpu_rm_data {
++    struct affinity_masks affinity;
+     const struct scheduler *old_ops;
+     void *ppriv_old;
+     void *vpriv_old;
+@@ -617,9 +618,9 @@ struct scheduler *scheduler_alloc(unsigned int sched_id);
+ void scheduler_free(struct scheduler *sched);
+ int cpu_disable_scheduler(unsigned int cpu);
+ int schedule_cpu_add(unsigned int cpu, struct cpupool *c);
+-struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu);
++struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu, bool aff_alloc);
+ void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu);
+-int schedule_cpu_rm(unsigned int cpu);
++int schedule_cpu_rm(unsigned int cpu, struct cpu_rm_data *mem);
+ int sched_move_domain(struct domain *d, struct cpupool *c);
+ struct cpupool *cpupool_get_by_id(unsigned int poolid);
+ void cpupool_put(struct cpupool *pool);
+--
+2.37.3
+
diff --git a/0061-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch b/0061-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch
new file mode 100644
index 0000000..0f044b2
--- /dev/null
+++ b/0061-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch
@@ -0,0 +1,58 @@
+From 19cf28b515f21da02df80e68f901ad7650daaa37 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 11 Oct 2022 15:15:55 +0200
+Subject: [PATCH 61/67] Config.mk: correct PIE-related option(s) in
+ EMBEDDED_EXTRA_CFLAGS
+
+I haven't been able to find evidence of "-nopie" ever having been a
+supported compiler option. The correct spelling is "-no-pie".
+Furthermore, like "-pie", this is an option which is solely passed to
+the linker. The compiler only recognizes "-fpie" / "-fPIE" /
+"-fno-pie", and it doesn't infer these options from "-pie" / "-no-pie".
+
+Add the compiler-recognized form, but for the possible case of the
+variable also being used somewhere for linking keep the linker option as
+well (with corrected spelling).
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Julien Grall <jgrall@amazon.com>
+
+Build: Drop -no-pie from EMBEDDED_EXTRA_CFLAGS
+
+This breaks all Clang builds, as demonstrated by Gitlab CI.
+
+Contrary to the description in ecd6b9759919, -no-pie is not even an option
+passed to the linker. GCC's actual behaviour is to inhibit the passing of
+-pie to the linker, as well as to select different cr0 artefacts to be
+linked.
+
+EMBEDDED_EXTRA_CFLAGS is not used for $(CC)-doing-linking, and not liable
+to gain such a use case.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+Tested-by: Stefano Stabellini <sstabellini@kernel.org>
+Fixes: ecd6b9759919 ("Config.mk: correct PIE-related option(s) in EMBEDDED_EXTRA_CFLAGS")
+master commit: ecd6b9759919fa6335b0be1b5fc5cce29a30c4f1
+master date: 2022-09-08 09:25:26 +0200
+master commit: 13a7c0074ac8fb31f6c0485429b7a20a1946cb22
+master date: 2022-09-27 15:40:42 -0700
+---
+ Config.mk | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/Config.mk b/Config.mk
+index 96d89b2f7dfc..9f87608f6602 100644
+--- a/Config.mk
++++ b/Config.mk
+@@ -203,7 +203,7 @@ endif
+ APPEND_LDFLAGS += $(foreach i, $(APPEND_LIB), -L$(i))
+ APPEND_CFLAGS += $(foreach i, $(APPEND_INCLUDES), -I$(i))
+
+-EMBEDDED_EXTRA_CFLAGS := -nopie -fno-stack-protector -fno-stack-protector-all
++EMBEDDED_EXTRA_CFLAGS := -fno-pie -fno-stack-protector -fno-stack-protector-all
+ EMBEDDED_EXTRA_CFLAGS += -fno-exceptions -fno-asynchronous-unwind-tables
+
+ XEN_EXTFILES_URL ?= http://xenbits.xen.org/xen-extfiles
+--
+2.37.3
+
diff --git a/0062-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch b/0062-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch
new file mode 100644
index 0000000..65882a9
--- /dev/null
+++ b/0062-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch
@@ -0,0 +1,41 @@
+From 182f8bb503b9dd3db5dd9118dc763d241787c6fc Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 11 Oct 2022 15:16:09 +0200
+Subject: [PATCH 62/67] tools/xenstore: minor fix of the migration stream doc
+
+Drop mentioning the non-existent read-only socket in the migration
+stream description document.
+
+The related record field was removed in commit 8868a0e3f674 ("docs:
+update the xenstore migration stream documentation").
+
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Acked-by: Julien Grall <jgrall@amazon.com>
+master commit: ace1d2eff80d3d66c37ae765dae3e3cb5697e5a4
+master date: 2022-09-08 09:25:58 +0200
+---
+ docs/designs/xenstore-migration.md | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+diff --git a/docs/designs/xenstore-migration.md b/docs/designs/xenstore-migration.md
+index 5f1155273ec3..78530bbb0ef4 100644
+--- a/docs/designs/xenstore-migration.md
++++ b/docs/designs/xenstore-migration.md
+@@ -129,11 +129,9 @@ xenstored state that needs to be restored.
+ | `evtchn-fd` | The file descriptor used to communicate with |
+ | | the event channel driver |
+
+-xenstored will resume in the original process context. Hence `rw-socket-fd` and
+-`ro-socket-fd` simply specify the file descriptors of the sockets. Sockets
+-are not always used, however, and so -1 will be used to denote an unused
+-socket.
+-
++xenstored will resume in the original process context. Hence `rw-socket-fd`
++simply specifies the file descriptor of the socket. Sockets are not always
++used, however, and so -1 will be used to denote an unused socket.
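++
++(Editor's illustration, not part of the document: a consumer of this record
++might guard its use of the field as
++
++    if ( rw_socket_fd != -1 )
++        resume_socket(rw_socket_fd);   /* "resume_socket" is hypothetical */
++
++treating -1 simply as "no socket to re-attach".)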
+
+ \pagebreak
+
+--
+2.37.3
+
diff --git a/0063-xen-gnttab-fix-gnttab_acquire_resource.patch b/0063-xen-gnttab-fix-gnttab_acquire_resource.patch
new file mode 100644
index 0000000..0d58157
--- /dev/null
+++ b/0063-xen-gnttab-fix-gnttab_acquire_resource.patch
@@ -0,0 +1,69 @@
+From 3ac64b3751837a117ee3dfb3e2cc27057a83d0f7 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 11 Oct 2022 15:16:53 +0200
+Subject: [PATCH 63/67] xen/gnttab: fix gnttab_acquire_resource()
+
+Commit 9dc46386d89d ("gnttab: work around "may be used uninitialized"
+warning") was wrong, as vaddrs can legitimately be NULL when
+XENMEM_resource_grant_table_id_status is specified for a v1 grant
+table. This would result in crashes in debug builds due to
+ASSERT_UNREACHABLE() triggering.
+
+Check vaddrs for being NULL only in the rc == 0 case.
+
+Expand the tests in tools/tests/resource to tickle this path, and verify that
+using XENMEM_resource_grant_table_id_status on a v1 grant table fails.
+
+Fixes: 9dc46386d89d ("gnttab: work around "may be used uninitialized" warning")
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com> # xen
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 52daa6a8483e4fbd6757c9d1b791e23931791608
+master date: 2022-09-09 16:28:38 +0100
+---
+ tools/tests/resource/test-resource.c | 15 +++++++++++++++
+ xen/common/grant_table.c | 2 +-
+ 2 files changed, 16 insertions(+), 1 deletion(-)
+
+diff --git a/tools/tests/resource/test-resource.c b/tools/tests/resource/test-resource.c
+index 1caaa60e62d9..bf485baff2b4 100644
+--- a/tools/tests/resource/test-resource.c
++++ b/tools/tests/resource/test-resource.c
+@@ -63,6 +63,21 @@ static void test_gnttab(uint32_t domid, unsigned int nr_frames)
+     rc = xenforeignmemory_unmap_resource(fh, res);
+     if ( rc )
+         return fail("  Fail: Unmap %d - %s\n", errno, strerror(errno));
++
++    /*
++     * Verify that an attempt to map the status frames fails, as the domain is
++     * in gnttab v1 mode.
++     */
++    res = xenforeignmemory_map_resource(
++        fh, domid, XENMEM_resource_grant_table,
++        XENMEM_resource_grant_table_id_status, 0, 1,
++        (void **)&gnttab, PROT_READ | PROT_WRITE, 0);
++
++    if ( res )
++    {
++        fail("  Fail: Managed to map gnttab v2 status frames in v1 mode\n");
++        xenforeignmemory_unmap_resource(fh, res);
++    }
+ }
+
+ static void test_domain_configurations(void)
+diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
+index 0523beb9b734..01e426c67fb6 100644
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -4138,7 +4138,7 @@ int gnttab_acquire_resource(
+      * on non-error paths, and hence it needs setting to NULL at the top of the
+      * function. Leave some runtime safety.
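(Editor's note, not part of the upstream comment: with this fix a NULL vaddrs is flagged as a bug only when rc == 0; for rc != 0, e.g. a status-frame request on a v1 grant table, NULL is legitimate.)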
+ */ +- if ( !vaddrs ) ++ if ( !rc && !vaddrs ) + { + ASSERT_UNREACHABLE(); + rc = -ENODATA; +-- +2.37.3 + diff --git a/0064-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch b/0064-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch new file mode 100644 index 0000000..4246b01 --- /dev/null +++ b/0064-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch @@ -0,0 +1,59 @@ +From 62e534d17cdd838828bfd75d3d845e31524dd336 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 11 Oct 2022 15:17:12 +0200 +Subject: [PATCH 64/67] x86: wire up VCPUOP_register_vcpu_time_memory_area for + 32-bit guests +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Forever sinced its introduction VCPUOP_register_vcpu_time_memory_area +was available only to native domains. Linux, for example, would attempt +to use it irrespective of guest bitness (including in its so called +PVHVM mode) as long as it finds XEN_PVCLOCK_TSC_STABLE_BIT set (which we +set only for clocksource=tsc, which in turn needs engaging via command +line option). + +Fixes: a5d39947cb89 ("Allow guests to register secondary vcpu_time_info") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: b726541d94bd0a80b5864d17a2cd2e6d73a3fe0a +master date: 2022-09-29 14:47:45 +0200 +--- + xen/arch/x86/x86_64/domain.c | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +diff --git a/xen/arch/x86/x86_64/domain.c b/xen/arch/x86/x86_64/domain.c +index c46dccc25a54..d51d99344796 100644 +--- a/xen/arch/x86/x86_64/domain.c ++++ b/xen/arch/x86/x86_64/domain.c +@@ -54,6 +54,26 @@ arch_compat_vcpu_op( + break; + } + ++ case VCPUOP_register_vcpu_time_memory_area: ++ { ++ struct compat_vcpu_register_time_memory_area area = { .addr.p = 0 }; ++ ++ rc = -EFAULT; ++ if ( copy_from_guest(&area.addr.h, arg, 1) ) ++ break; ++ ++ if ( area.addr.h.c != area.addr.p || ++ !compat_handle_okay(area.addr.h, 1) ) ++ break; ++ ++ rc = 0; ++ guest_from_compat_handle(v->arch.time_info_guest, area.addr.h); ++ ++ force_update_vcpu_system_time(v); ++ ++ break; ++ } ++ + case VCPUOP_get_physid: + rc = arch_do_vcpu_op(cmd, v, arg); + break; +-- +2.37.3 + diff --git a/0065-x86-vpmu-Fix-race-condition-in-vpmu_load.patch b/0065-x86-vpmu-Fix-race-condition-in-vpmu_load.patch new file mode 100644 index 0000000..df4fb38 --- /dev/null +++ b/0065-x86-vpmu-Fix-race-condition-in-vpmu_load.patch @@ -0,0 +1,97 @@ +From 9690bb261d5fa09cb281e1fa124d93db7b84fda5 Mon Sep 17 00:00:00 2001 +From: Tamas K Lengyel <tamas.lengyel@intel.com> +Date: Tue, 11 Oct 2022 15:17:42 +0200 +Subject: [PATCH 65/67] x86/vpmu: Fix race-condition in vpmu_load + +The vPMU code-bases attempts to perform an optimization on saving/reloading the +PMU context by keeping track of what vCPU ran on each pCPU. When a pCPU is +getting scheduled, checks if the previous vCPU isn't the current one. If so, +attempts a call to vpmu_save_force. Unfortunately if the previous vCPU is +already getting scheduled to run on another pCPU its state will be already +runnable, which results in an ASSERT failure. + +Fix this by always performing a pmu context save in vpmu_save when called from +vpmu_switch_from, and do a vpmu_load when called from vpmu_switch_to. + +While this presents a minimal overhead in case the same vCPU is getting +rescheduled on the same pCPU, the ASSERT failure is avoided and the code is a +lot easier to reason about. 
+ +Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +master commit: defa4e51d20a143bdd4395a075bf0933bb38a9a4 +master date: 2022-09-30 09:53:49 +0200 +--- + xen/arch/x86/cpu/vpmu.c | 42 ++++------------------------------------- + 1 file changed, 4 insertions(+), 38 deletions(-) + +diff --git a/xen/arch/x86/cpu/vpmu.c b/xen/arch/x86/cpu/vpmu.c +index fb1b296a6cc1..800eff87dc03 100644 +--- a/xen/arch/x86/cpu/vpmu.c ++++ b/xen/arch/x86/cpu/vpmu.c +@@ -364,58 +364,24 @@ void vpmu_save(struct vcpu *v) + vpmu->last_pcpu = pcpu; + per_cpu(last_vcpu, pcpu) = v; + ++ vpmu_set(vpmu, VPMU_CONTEXT_SAVE); ++ + if ( vpmu->arch_vpmu_ops ) + if ( vpmu->arch_vpmu_ops->arch_vpmu_save(v, 0) ) + vpmu_reset(vpmu, VPMU_CONTEXT_LOADED); + ++ vpmu_reset(vpmu, VPMU_CONTEXT_SAVE); ++ + apic_write(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED); + } + + int vpmu_load(struct vcpu *v, bool_t from_guest) + { + struct vpmu_struct *vpmu = vcpu_vpmu(v); +- int pcpu = smp_processor_id(); +- struct vcpu *prev = NULL; + + if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) ) + return 0; + +- /* First time this VCPU is running here */ +- if ( vpmu->last_pcpu != pcpu ) +- { +- /* +- * Get the context from last pcpu that we ran on. Note that if another +- * VCPU is running there it must have saved this VPCU's context before +- * startig to run (see below). +- * There should be no race since remote pcpu will disable interrupts +- * before saving the context. +- */ +- if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) ) +- { +- on_selected_cpus(cpumask_of(vpmu->last_pcpu), +- vpmu_save_force, (void *)v, 1); +- vpmu_reset(vpmu, VPMU_CONTEXT_LOADED); +- } +- } +- +- /* Prevent forced context save from remote CPU */ +- local_irq_disable(); +- +- prev = per_cpu(last_vcpu, pcpu); +- +- if ( prev != v && prev ) +- { +- vpmu = vcpu_vpmu(prev); +- +- /* Someone ran here before us */ +- vpmu_save_force(prev); +- vpmu_reset(vpmu, VPMU_CONTEXT_LOADED); +- +- vpmu = vcpu_vpmu(v); +- } +- +- local_irq_enable(); +- + /* Only when PMU is counting, we load PMU context immediately. */ + if ( !vpmu_is_set(vpmu, VPMU_RUNNING) || + (!has_vlapic(vpmu_vcpu(vpmu)->domain) && +-- +2.37.3 + diff --git a/0066-tools-tests-fix-wrong-backport-of-upstream-commit-52.patch b/0066-tools-tests-fix-wrong-backport-of-upstream-commit-52.patch new file mode 100644 index 0000000..24b9576 --- /dev/null +++ b/0066-tools-tests-fix-wrong-backport-of-upstream-commit-52.patch @@ -0,0 +1,31 @@ +From 0d233924d4b0f676056856096e8761205add3ee8 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Wed, 12 Oct 2022 17:31:44 +0200 +Subject: [PATCH 66/67] tools/tests: fix wrong backport of upstream commit + 52daa6a8483e4 + +The backport of upstream commit 52daa6a8483e4 had a bug, correct it. 
+ +Fixes: 3ac64b375183 ("xen/gnttab: fix gnttab_acquire_resource()") +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +--- + tools/tests/resource/test-resource.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/tests/resource/test-resource.c b/tools/tests/resource/test-resource.c +index bf485baff2b4..51a8f4a000f6 100644 +--- a/tools/tests/resource/test-resource.c ++++ b/tools/tests/resource/test-resource.c +@@ -71,7 +71,7 @@ static void test_gnttab(uint32_t domid, unsigned int nr_frames) + res = xenforeignmemory_map_resource( + fh, domid, XENMEM_resource_grant_table, + XENMEM_resource_grant_table_id_status, 0, 1, +- (void **)&gnttab, PROT_READ | PROT_WRITE, 0); ++ &addr, PROT_READ | PROT_WRITE, 0); + + if ( res ) + { +-- +2.37.3 + diff --git a/0067-libxl-Arm-correct-xc_shadow_control-invocation-to-fi.patch b/0067-libxl-Arm-correct-xc_shadow_control-invocation-to-fi.patch new file mode 100644 index 0000000..309d486 --- /dev/null +++ b/0067-libxl-Arm-correct-xc_shadow_control-invocation-to-fi.patch @@ -0,0 +1,42 @@ +From 816580afdd1730d4f85f64477a242a439af1cdf8 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Wed, 12 Oct 2022 17:33:40 +0200 +Subject: [PATCH 67/67] libxl/Arm: correct xc_shadow_control() invocation to + fix build + +The backport didn't adapt to the earlier function prototype taking more +(unused here) arguments. + +Fixes: c5215044578e ("xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for Arm") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Henry Wang <Henry.Wang@arm.com> +Acked-by: Anthony PERARD <anthony.perard@citrix.com> +--- + tools/libs/light/libxl_arm.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c +index d21f614ed788..ba548befdd25 100644 +--- a/tools/libs/light/libxl_arm.c ++++ b/tools/libs/light/libxl_arm.c +@@ -132,14 +132,14 @@ int libxl__arch_domain_create(libxl__gc *gc, + uint32_t domid) + { + libxl_ctx *ctx = libxl__gc_owner(gc); +- unsigned int shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024); ++ unsigned long shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024); + + int r = xc_shadow_control(ctx->xch, domid, + XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION, +- &shadow_mb, 0); ++ NULL, 0, &shadow_mb, 0, NULL); + if (r) { + LOGED(ERROR, domid, +- "Failed to set %u MiB shadow allocation", shadow_mb); ++ "Failed to set %lu MiB shadow allocation", shadow_mb); + return ERROR_FAIL; + } + +-- +2.37.3 + @@ -1,6 +1,6 @@ -Xen upstream patchset #0.1 for 4.15.4-pre +Xen upstream patchset #1 for 4.15.4-pre Containing patches from RELEASE-4.15.3 (feecaf4abf733e83b7a297190819eca7a7f65168) to -staging-4.15 (35bf91d30f1a480dcf5bfd99b79384b2b283da7f) +staging-4.15 (816580afdd1730d4f85f64477a242a439af1cdf8) |