From 03c9445642841756b9c5fb6d5ff7873d0adcad7d Mon Sep 17 00:00:00 2001 From: Michael Marineau Date: Fri, 22 Feb 2008 23:48:42 +0000 Subject: Releasing 2.6.20-5 svn path=/patches/; revision=73 --- tags/2.6.20-5/00000_README | 63 + .../2.6.20-5/20950_linux-2.6.20.14-xen-3.1.0.patch | 93230 +++++++++++++++++++ ...nux-2.6-xen-x86_64-silence-up-apic-errors.patch | 13 + .../20957-linux-2.6-fix-x86_64-vgetcpu.patch | 65 + ...-linux-2.6-xen-iscsi-x86_64-no_iommu_init.patch | 45 + .../20960-linux-2.6-xen-blkfront-wait-add.patch | 82 + .../20961_linux-2.6-xen-backwards-time.patch | 63 + ...2_linux-2.6-xen-add-packet_auxdata-cmsg-1.patch | 233 + ...3_linux-2.6-xen-add-packet_auxdata-cmsg-2.patch | 144 + .../20965_linux-2.6-xen-sleazy-fpu-i386.patch | 96 + .../20966_linux-2.6-xen-sleazy-fpu-x86_64.patch | 93 + .../21665-linux-2.6-disable-netback-checksum.patch | 37 + .../23000-linux-2.6-acpi-config_pm-poweroff.patch | 38 + tags/2.6.20-5/26000_linux-2.6-cve-2008-0600.patch | 37 + .../30037_amd64-zero-extend-32bit-ptrace-xen.patch | 50 + ...i386-fix-xen_l1_entry_update-for-highptes.patch | 26 + tags/2.6.20-5/50001_make-install.patch | 52 + .../2.6.20-5/50002_always-enable-xen-genapic.patch | 12 + tags/2.6.20-5/50003_console-tty-fix.patch | 75 + tags/2.6.20-5/50004_quirks-no-smp-fix.patch | 12 + tags/2.6.20-5/50006_pgetable-build-fix.patch | 14 + tags/2.6.20-5/50008_reenable-tls-warning.patch | 23 + tags/2.6.20-5/50009_gentooify-tls-warning.patch | 16 + 23 files changed, 94519 insertions(+) create mode 100644 tags/2.6.20-5/00000_README create mode 100644 tags/2.6.20-5/20950_linux-2.6.20.14-xen-3.1.0.patch create mode 100644 tags/2.6.20-5/20952-linux-2.6-xen-x86_64-silence-up-apic-errors.patch create mode 100644 tags/2.6.20-5/20957-linux-2.6-fix-x86_64-vgetcpu.patch create mode 100644 tags/2.6.20-5/20958-linux-2.6-xen-iscsi-x86_64-no_iommu_init.patch create mode 100644 tags/2.6.20-5/20960-linux-2.6-xen-blkfront-wait-add.patch create mode 100644 tags/2.6.20-5/20961_linux-2.6-xen-backwards-time.patch create mode 100644 tags/2.6.20-5/20962_linux-2.6-xen-add-packet_auxdata-cmsg-1.patch create mode 100644 tags/2.6.20-5/20963_linux-2.6-xen-add-packet_auxdata-cmsg-2.patch create mode 100644 tags/2.6.20-5/20965_linux-2.6-xen-sleazy-fpu-i386.patch create mode 100644 tags/2.6.20-5/20966_linux-2.6-xen-sleazy-fpu-x86_64.patch create mode 100644 tags/2.6.20-5/21665-linux-2.6-disable-netback-checksum.patch create mode 100644 tags/2.6.20-5/23000-linux-2.6-acpi-config_pm-poweroff.patch create mode 100644 tags/2.6.20-5/26000_linux-2.6-cve-2008-0600.patch create mode 100644 tags/2.6.20-5/30037_amd64-zero-extend-32bit-ptrace-xen.patch create mode 100644 tags/2.6.20-5/40001_i386-fix-xen_l1_entry_update-for-highptes.patch create mode 100644 tags/2.6.20-5/50001_make-install.patch create mode 100644 tags/2.6.20-5/50002_always-enable-xen-genapic.patch create mode 100644 tags/2.6.20-5/50003_console-tty-fix.patch create mode 100644 tags/2.6.20-5/50004_quirks-no-smp-fix.patch create mode 100644 tags/2.6.20-5/50006_pgetable-build-fix.patch create mode 100644 tags/2.6.20-5/50008_reenable-tls-warning.patch create mode 100644 tags/2.6.20-5/50009_gentooify-tls-warning.patch diff --git a/tags/2.6.20-5/00000_README b/tags/2.6.20-5/00000_README new file mode 100644 index 0000000..7673e98 --- /dev/null +++ b/tags/2.6.20-5/00000_README @@ -0,0 +1,63 @@ +Xen Patches README +------------------ + +These patches are intended to be stacked on top of genpatches-base. + +Many of the patches included here are swiped from various sources which +use their own four digit patch numbering scheme, so we are stuck with five +digits to indiciate the source for easier tracking and re-syncing. + +Numbering +--------- + +0xxxx Gentoo, not related to Xen. (in case we pull something from extras) +1xxxx XenSource, upstream Xen patch for 2.6.18 +2xxxx Redhat, we use their Xen patch for >=2.6.20 +3xxxx Debian, we use their security fixes for 2.6.18 +4xxxx Misc +5xxxx Gentoo, Xen and other fixes for Redhat and/or Debian patches. + +Patches +------- + +20950_linux-2.6.20.14-xen-3.1.0.patch + Main Xen patch + +20xxx-? + Various bug-fix patches from Redhat. + +26000_linux-2.6-cve-2008-0600.patch + Fix the vmsplice issue CVE 2008-0600 from the 2.6.21 patchset. + +30037_amd64-zero-extend-32bit-ptrace-xen.patch + [SECURITY] Zero extend all registers after ptrace in 32-bit entry path + (Xen). + See CVE-2007-4573 + +40001_i386-fix-xen_l1_entry_update-for-highptes.patch + Fix for kernels compiled with CONFIG_HIGHPTE. + Pulled from linux-2.6.18-xen.hg, changeset e79729740288. + +50001_make-install.patch + Handle make install in a semi-sane way that plays nice with + split domU/dom0 kernels. + +50002_always-enable-xen-genapic.patch + Compile fix for non-SMP (UP) kernels. Since UP support is broken in + upstream Xen I'm not sure if I trust it or not. :-P + +50003_console-tty-fix.patch + Steal tty1-63 as the upstream Xen release does so people don't get + any supprises. Redhat switched to using the special Xen tty device. + +50004_quirks-no-smp-fix.patch + Another compile fix for non-SMP (UP) kernels. + +50006_pgetable-build-fix.patch + Fix a function re-definition error when PAE is not enabled. + +50008_reenable-tls-warning.patch + Issue only one big fat tls warning as upstream xen does. + +50009_gentooify-tls-warning.patch + Change tls warning instructions to apply directly to Gentoo. diff --git a/tags/2.6.20-5/20950_linux-2.6.20.14-xen-3.1.0.patch b/tags/2.6.20-5/20950_linux-2.6.20.14-xen-3.1.0.patch new file mode 100644 index 0000000..b46f4a4 --- /dev/null +++ b/tags/2.6.20-5/20950_linux-2.6.20.14-xen-3.1.0.patch @@ -0,0 +1,93230 @@ +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/Kconfig +--- a/arch/i386/Kconfig Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/Kconfig Wed Aug 08 16:25:28 2007 -0300 +@@ -16,6 +16,7 @@ config X86_32 + + config GENERIC_TIME + bool ++ depends on !X86_XEN + default y + + config LOCKDEP_SUPPORT +@@ -107,6 +108,15 @@ config X86_PC + bool "PC-compatible" + help + Choose this option if your computer is a standard PC or compatible. ++ ++config X86_XEN ++ bool "Xen-compatible" ++ select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST ++ select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST ++ select SWIOTLB ++ help ++ Choose this option if you plan to run this kernel on top of the ++ Xen Hypervisor. + + config X86_ELAN + bool "AMD Elan" +@@ -229,6 +239,7 @@ source "arch/i386/Kconfig.cpu" + + config HPET_TIMER + bool "HPET Timer Support" ++ depends on !X86_XEN + help + This enables the use of the HPET for the kernel's internal timer. + HPET is the next generation timer replacing legacy 8254s. +@@ -279,7 +290,7 @@ source "kernel/Kconfig.preempt" + + config X86_UP_APIC + bool "Local APIC support on uniprocessors" +- depends on !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH) ++ depends on !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH || XEN_UNPRIVILEGED_GUEST) + help + A local APIC (Advanced Programmable Interrupt Controller) is an + integrated interrupt controller in the CPU. If you have a single-CPU +@@ -304,12 +315,12 @@ config X86_UP_IOAPIC + + config X86_LOCAL_APIC + bool +- depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH ++ depends on X86_UP_APIC || ((X86_VISWS || SMP) && !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)) || X86_GENERICARCH + default y + + config X86_IO_APIC + bool +- depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH ++ depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)) || X86_GENERICARCH + default y + + config X86_VISWS_APIC +@@ -319,7 +330,7 @@ config X86_VISWS_APIC + + config X86_MCE + bool "Machine Check Exception" +- depends on !X86_VOYAGER ++ depends on !(X86_VOYAGER || X86_XEN) + ---help--- + Machine Check Exception support allows the processor to notify the + kernel if it detects a problem (e.g. overheating, component failure). +@@ -418,6 +429,7 @@ config X86_REBOOTFIXUPS + + config MICROCODE + tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" ++ depends on !XEN_UNPRIVILEGED_GUEST + select FW_LOADER + ---help--- + If you say Y here and also to "/dev file system support" in the +@@ -441,6 +453,7 @@ config MICROCODE_OLD_INTERFACE + + config X86_MSR + tristate "/dev/cpu/*/msr - Model-specific register support" ++ depends on !X86_XEN + help + This device gives privileged processes access to the x86 + Model-Specific Registers (MSRs). It is a character device with +@@ -455,6 +468,10 @@ config X86_CPUID + be executed on a specific processor. It is a character device + with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to + /dev/cpu/31/cpuid. ++ ++config SWIOTLB ++ bool ++ default n + + source "drivers/firmware/Kconfig" + +@@ -638,6 +655,7 @@ config HIGHPTE + + config MATH_EMULATION + bool "Math emulation" ++ depends on !X86_XEN + ---help--- + Linux can emulate a math coprocessor (used for floating point + operations) if you don't have one. 486DX and Pentium processors have +@@ -663,6 +681,8 @@ config MATH_EMULATION + + config MTRR + bool "MTRR (Memory Type Range Register) support" ++ depends on !XEN_UNPRIVILEGED_GUEST ++ default y if X86_XEN + ---help--- + On Intel P6 family processors (Pentium Pro, Pentium II and later) + the Memory Type Range Registers (MTRRs) may be used to control +@@ -697,7 +717,7 @@ config MTRR + + config EFI + bool "Boot from EFI support" +- depends on ACPI ++ depends on ACPI && !X86_XEN + default n + ---help--- + This enables the kernel to boot on EFI platforms using +@@ -715,7 +735,7 @@ config EFI + + config IRQBALANCE + bool "Enable kernel irq balancing" +- depends on SMP && X86_IO_APIC ++ depends on SMP && X86_IO_APIC && !X86_XEN + default y + help + The default yes will allow the kernel to do irq load balancing. +@@ -749,6 +769,7 @@ source kernel/Kconfig.hz + + config KEXEC + bool "kexec system call" ++ depends on !XEN_UNPRIVILEGED_GUEST + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot +@@ -866,6 +887,7 @@ config COMPAT_VDSO + bool "Compat VDSO support" + default y + depends on !PARAVIRT ++ depends on !X86_XEN + help + Map the VDSO to the predictable old-style address too. + ---help--- +@@ -882,18 +904,20 @@ config ARCH_ENABLE_MEMORY_HOTPLUG + depends on HIGHMEM + + menu "Power management options (ACPI, APM)" +- depends on !X86_VOYAGER +- ++ depends on !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST) ++ ++if !X86_XEN + source kernel/power/Kconfig ++endif + + source "drivers/acpi/Kconfig" + + menu "APM (Advanced Power Management) BIOS Support" +-depends on PM && !X86_VISWS ++depends on PM && !(X86_VISWS || X86_XEN) + + config APM + tristate "APM (Advanced Power Management) BIOS support" +- depends on PM ++ depends on PM && PM_LEGACY + ---help--- + APM is a BIOS specification for saving power using several different + techniques. This is mostly useful for battery powered laptops with +@@ -1078,6 +1102,7 @@ choice + + config PCI_GOBIOS + bool "BIOS" ++ depends on !X86_XEN + + config PCI_GOMMCONFIG + bool "MMConfig" +@@ -1085,6 +1110,13 @@ config PCI_GODIRECT + config PCI_GODIRECT + bool "Direct" + ++config PCI_GOXEN_FE ++ bool "Xen PCI Frontend" ++ depends on X86_XEN ++ help ++ The PCI device frontend driver allows the kernel to import arbitrary ++ PCI devices from a PCI backend to support PCI driver domains. ++ + config PCI_GOANY + bool "Any" + +@@ -1092,7 +1124,7 @@ endchoice + + config PCI_BIOS + bool +- depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) ++ depends on !(X86_VISWS || X86_XEN) && PCI && (PCI_GOBIOS || PCI_GOANY) + default y + + config PCI_DIRECT +@@ -1104,6 +1136,18 @@ config PCI_MMCONFIG + bool + depends on PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) + default y ++ ++config XEN_PCIDEV_FRONTEND ++ bool ++ depends on PCI && X86_XEN && (PCI_GOXEN_FE || PCI_GOANY) ++ default y ++ ++config XEN_PCIDEV_FE_DEBUG ++ bool "Xen PCI Frontend Debugging" ++ depends on XEN_PCIDEV_FRONTEND ++ default n ++ help ++ Enables some debug statements within the PCI Frontend. + + source "drivers/pci/pcie/Kconfig" + +@@ -1115,7 +1159,7 @@ config ISA_DMA_API + + config ISA + bool "ISA support" +- depends on !(X86_VOYAGER || X86_VISWS) ++ depends on !(X86_VOYAGER || X86_VISWS || X86_XEN) + help + Find out whether you have ISA slots on your motherboard. ISA is the + name of a bus system, i.e. the way the CPU talks to the other stuff +@@ -1142,7 +1186,7 @@ source "drivers/eisa/Kconfig" + source "drivers/eisa/Kconfig" + + config MCA +- bool "MCA support" if !(X86_VISWS || X86_VOYAGER) ++ bool "MCA support" if !(X86_VISWS || X86_VOYAGER || X86_XEN) + default y if X86_VOYAGER + help + MicroChannel Architecture is found in some IBM PS/2 machines and +@@ -1218,6 +1262,8 @@ source "security/Kconfig" + + source "crypto/Kconfig" + ++source "drivers/xen/Kconfig" ++ + source "lib/Kconfig" + + # +@@ -1243,7 +1289,7 @@ config X86_SMP + + config X86_HT + bool +- depends on SMP && !(X86_VISWS || X86_VOYAGER) ++ depends on SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN) + default y + + config X86_BIOS_REBOOT +@@ -1256,6 +1302,16 @@ config X86_TRAMPOLINE + depends on X86_SMP || (X86_VOYAGER && SMP) + default y + ++config X86_NO_TSS ++ bool ++ depends on X86_XEN ++ default y ++ ++config X86_NO_IDT ++ bool ++ depends on X86_XEN ++ default y ++ + config KTIME_SCALAR + bool + default y +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/Kconfig.cpu +--- a/arch/i386/Kconfig.cpu Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/Kconfig.cpu Wed Aug 08 16:25:28 2007 -0300 +@@ -267,7 +267,7 @@ config X86_PPRO_FENCE + + config X86_F00F_BUG + bool +- depends on M586MMX || M586TSC || M586 || M486 || M386 ++ depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT + default y + + config X86_WP_WORKS_OK +@@ -327,5 +327,5 @@ config X86_OOSTORE + + config X86_TSC + bool +- depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ +- default y ++ depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ && !X86_XEN ++ default y +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/Kconfig.debug +--- a/arch/i386/Kconfig.debug Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/Kconfig.debug Wed Aug 08 16:25:28 2007 -0300 +@@ -79,6 +79,7 @@ config DOUBLEFAULT + config DOUBLEFAULT + default y + bool "Enable doublefault exception handler" if EMBEDDED ++ depends on !X86_NO_TSS + help + This option allows trapping of rare doublefault exceptions that + would otherwise cause a system to silently reboot. Disabling this +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/Makefile +--- a/arch/i386/Makefile Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/Makefile Wed Aug 08 16:25:28 2007 -0300 +@@ -60,6 +60,11 @@ AFLAGS += $(call as-instr,.cfi_startproc + + CFLAGS += $(cflags-y) + ++cppflags-$(CONFIG_XEN) += \ ++ -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) ++ ++CPPFLAGS += $(cppflags-y) ++ + # Default subarch .c files + mcore-y := mach-default + +@@ -82,6 +87,10 @@ mcore-$(CONFIG_X86_BIGSMP) := mach-defau + #Summit subarch support + mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit + mcore-$(CONFIG_X86_SUMMIT) := mach-default ++ ++# Xen subarch support ++mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-i386/mach-xen ++mcore-$(CONFIG_X86_XEN) := mach-xen + + # generic subarchitecture + mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic +@@ -117,6 +126,19 @@ PHONY += zImage bzImage compressed zlilo + PHONY += zImage bzImage compressed zlilo bzlilo \ + zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install + ++ifdef CONFIG_XEN ++CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS) ++head-y := arch/i386/kernel/head-xen.o arch/i386/kernel/init_task.o ++boot := arch/i386/boot-xen ++.PHONY: vmlinuz ++all: vmlinuz ++ ++vmlinuz: vmlinux ++ $(Q)$(MAKE) $(build)=$(boot) $@ ++ ++install: ++ $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@ ++else + all: bzImage + + # KBUILD_IMAGE specify target image being built +@@ -139,6 +161,7 @@ fdimage fdimage144 fdimage288 isoimage: + + install: + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install ++endif + + archclean: + $(Q)$(MAKE) $(clean)=arch/i386/boot +@@ -157,3 +180,4 @@ CLEAN_FILES += arch/$(ARCH)/boot/fdimage + CLEAN_FILES += arch/$(ARCH)/boot/fdimage \ + arch/$(ARCH)/boot/image.iso \ + arch/$(ARCH)/boot/mtools.conf ++CLEAN_FILES += vmlinuz vmlinux-stripped +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/boot-xen/Makefile +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/arch/i386/boot-xen/Makefile Wed Aug 08 16:25:28 2007 -0300 +@@ -0,0 +1,21 @@ ++ ++OBJCOPYFLAGS := -g --strip-unneeded ++ ++vmlinuz: vmlinux-stripped FORCE ++ $(call if_changed,gzip) ++ ++vmlinux-stripped: vmlinux FORCE ++ $(call if_changed,objcopy) ++ ++INSTALL_ROOT := $(patsubst %/boot,%,$(INSTALL_PATH)) ++ ++XINSTALL_NAME ?= $(KERNELRELEASE) ++install: ++ mkdir -p $(INSTALL_ROOT)/boot ++ ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(XENGUEST)$(INSTALL_SUFFIX) ++ rm -f $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) ++ install -m0644 vmlinuz $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) ++ install -m0644 vmlinux $(INSTALL_ROOT)/boot/vmlinux-syms-$(XINSTALL_NAME)$(INSTALL_SUFFIX) ++ install -m0664 .config $(INSTALL_ROOT)/boot/config-$(XINSTALL_NAME)$(INSTALL_SUFFIX) ++ install -m0664 System.map $(INSTALL_ROOT)/boot/System.map-$(XINSTALL_NAME)$(INSTALL_SUFFIX) ++ ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL)$(XENGUEST)$(INSTALL_SUFFIX) +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/Makefile +--- a/arch/i386/kernel/Makefile Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/Makefile Wed Aug 08 16:25:28 2007 -0300 +@@ -47,6 +47,12 @@ EXTRA_AFLAGS := -traditional + + obj-$(CONFIG_SCx200) += scx200.o + ++ifdef CONFIG_XEN ++vsyscall_note := vsyscall-note-xen.o ++else ++vsyscall_note := vsyscall-note.o ++endif ++ + # vsyscall.o contains the vsyscall DSO images as __initdata. + # We must build both images before we can assemble it. + # Note: kbuild does not track this dependency due to usage of .incbin +@@ -68,7 +74,7 @@ SYSCFLAGS_vsyscall-int80.so = $(vsyscall + + $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \ + $(obj)/vsyscall-%.so: $(src)/vsyscall.lds \ +- $(obj)/vsyscall-%.o $(obj)/vsyscall-note.o FORCE ++ $(obj)/vsyscall-%.o $(obj)/$(vsyscall_note) FORCE + $(call if_changed,syscall) + + # We also create a special relocatable object that should mirror the symbol +@@ -80,9 +86,21 @@ extra-y += vsyscall-syms.o + + SYSCFLAGS_vsyscall-syms.o = -r + $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \ +- $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE ++ $(obj)/vsyscall-sysenter.o $(obj)/$(vsyscall_note) FORCE + $(call if_changed,syscall) + + k8-y += ../../x86_64/kernel/k8.o + stacktrace-y += ../../x86_64/kernel/stacktrace.o + ++ifdef CONFIG_XEN ++include $(srctree)/scripts/Makefile.xen ++ ++obj-y += fixup.o ++microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o ++n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o ++ ++obj-y := $(call filterxen, $(obj-y), $(n-obj-xen)) ++obj-y := $(call cherrypickxen, $(obj-y)) ++extra-y := $(call cherrypickxen, $(extra-y)) ++%/head-xen.o %/head-xen.s: EXTRA_AFLAGS := ++endif +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/acpi/Makefile +--- a/arch/i386/kernel/acpi/Makefile Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/acpi/Makefile Wed Aug 08 16:25:28 2007 -0300 +@@ -7,4 +7,3 @@ ifneq ($(CONFIG_ACPI_PROCESSOR),) + ifneq ($(CONFIG_ACPI_PROCESSOR),) + obj-y += cstate.o processor.o + endif +- +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/acpi/boot.c +--- a/arch/i386/kernel/acpi/boot.c Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/acpi/boot.c Wed Aug 08 16:25:28 2007 -0300 +@@ -107,7 +107,7 @@ EXPORT_SYMBOL(x86_acpiid_to_apicid); + */ + enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; + +-#ifdef CONFIG_X86_64 ++#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) + + /* rely on all ACPI tables being in the direct mapping */ + char *__acpi_map_table(unsigned long phys_addr, unsigned long size) +@@ -140,8 +140,10 @@ char *__acpi_map_table(unsigned long phy + unsigned long base, offset, mapped_size; + int idx; + ++#ifndef CONFIG_XEN + if (phys + size < 8 * 1024 * 1024) + return __va(phys); ++#endif + + offset = phys & (PAGE_SIZE - 1); + mapped_size = PAGE_SIZE - offset; +@@ -611,7 +613,13 @@ acpi_scan_rsdp(unsigned long start, unsi + * RSDP signature. + */ + for (offset = 0; offset < length; offset += 16) { ++#ifdef CONFIG_XEN ++ unsigned long vstart = (unsigned long)isa_bus_to_virt(start); ++ ++ if (strncmp((char *)(vstart + offset), "RSD PTR ", sig_len)) ++#else + if (strncmp((char *)(phys_to_virt(start) + offset), "RSD PTR ", sig_len)) ++#endif + continue; + return (start + offset); + } +@@ -724,7 +732,7 @@ static int __init acpi_parse_fadt(unsign + acpi_fadt.force_apic_physical_destination_mode = + fadt->force_apic_physical_destination_mode; + +-#ifdef CONFIG_X86_PM_TIMER ++#if defined(CONFIG_X86_PM_TIMER) && !defined(CONFIG_XEN) + /* detect the location of the ACPI PM Timer */ + if (fadt->revision >= FADT2_REVISION_ID) { + /* FADT rev. 2 */ +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/apic-xen.c +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/arch/i386/kernel/apic-xen.c Wed Aug 08 16:25:28 2007 -0300 +@@ -0,0 +1,223 @@ ++/* ++ * Local APIC handling, local APIC timers ++ * ++ * (c) 1999, 2000 Ingo Molnar ++ * ++ * Fixes ++ * Maciej W. Rozycki : Bits for genuine 82489DX APICs; ++ * thanks to Eric Gilmore ++ * and Rolf G. Tews ++ * for testing these extensively. ++ * Maciej W. Rozycki : Various updates and fixes. ++ * Mikael Pettersson : Power Management for UP-APIC. ++ * Pavel Machek and ++ * Mikael Pettersson : PM converted to driver model. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include "io_ports.h" ++ ++#ifndef CONFIG_XEN ++/* ++ * cpu_mask that denotes the CPUs that needs timer interrupt coming in as ++ * IPIs in place of local APIC timers ++ */ ++static cpumask_t timer_bcast_ipi; ++#endif ++ ++/* ++ * Knob to control our willingness to enable the local APIC. ++ */ ++static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ ++ ++static inline void lapic_disable(void) ++{ ++ enable_local_apic = -1; ++ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); ++} ++ ++static inline void lapic_enable(void) ++{ ++ enable_local_apic = 1; ++} ++ ++/* ++ * Debug level ++ */ ++int apic_verbosity; ++ ++static int modern_apic(void) ++{ ++#ifndef CONFIG_XEN ++ unsigned int lvr, version; ++ /* AMD systems use old APIC versions, so check the CPU */ ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && ++ boot_cpu_data.x86 >= 0xf) ++ return 1; ++ lvr = apic_read(APIC_LVR); ++ version = GET_APIC_VERSION(lvr); ++ return version >= 0x14; ++#else ++ return 1; ++#endif ++} ++ ++/* ++ * 'what should we do if we get a hw irq event on an illegal vector'. ++ * each architecture has to answer this themselves. ++ */ ++void ack_bad_irq(unsigned int irq) ++{ ++ printk("unexpected IRQ trap at vector %02x\n", irq); ++ /* ++ * Currently unexpected vectors happen only on SMP and APIC. ++ * We _must_ ack these because every local APIC has only N ++ * irq slots per priority level, and a 'hanging, unacked' IRQ ++ * holds up an irq slot - in excessive cases (when multiple ++ * unexpected vectors occur) that might lock up the APIC ++ * completely. ++ * But only ack when the APIC is enabled -AK ++ */ ++ if (cpu_has_apic) ++ ack_APIC_irq(); ++} ++ ++#ifndef CONFIG_XEN ++void __init apic_intr_init(void) ++{ ++#ifdef CONFIG_SMP ++ smp_intr_init(); ++#endif ++ /* self generated IPI for local APIC timer */ ++ set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); ++ ++ /* IPI vectors for APIC spurious and error interrupts */ ++ set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); ++ set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); ++ ++ /* thermal monitor LVT interrupt */ ++#ifdef CONFIG_X86_MCE_P4THERMAL ++ set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); ++#endif ++} ++ ++/* Using APIC to generate smp_local_timer_interrupt? */ ++int using_apic_timer __read_mostly = 0; ++ ++static int enabled_via_apicbase; ++ ++void enable_NMI_through_LVT0 (void * dummy) ++{ ++ unsigned int v, ver; ++ ++ ver = apic_read(APIC_LVR); ++ ver = GET_APIC_VERSION(ver); ++ v = APIC_DM_NMI; /* unmask and set to NMI */ ++ if (!APIC_INTEGRATED(ver)) /* 82489DX */ ++ v |= APIC_LVT_LEVEL_TRIGGER; ++ apic_write_around(APIC_LVT0, v); ++} ++#endif /* !CONFIG_XEN */ ++ ++int get_physical_broadcast(void) ++{ ++ if (modern_apic()) ++ return 0xff; ++ else ++ return 0xf; ++} ++ ++#ifndef CONFIG_XEN ++#ifndef CONFIG_SMP ++static void up_apic_timer_interrupt_call(void) ++{ ++ int cpu = smp_processor_id(); ++ ++ /* ++ * the NMI deadlock-detector uses this. ++ */ ++ per_cpu(irq_stat, cpu).apic_timer_irqs++; ++ ++ smp_local_timer_interrupt(); ++} ++#endif ++ ++void smp_send_timer_broadcast_ipi(void) ++{ ++ cpumask_t mask; ++ ++ cpus_and(mask, cpu_online_map, timer_bcast_ipi); ++ if (!cpus_empty(mask)) { ++#ifdef CONFIG_SMP ++ send_IPI_mask(mask, LOCAL_TIMER_VECTOR); ++#else ++ /* ++ * We can directly call the apic timer interrupt handler ++ * in UP case. Minus all irq related functions ++ */ ++ up_apic_timer_interrupt_call(); ++#endif ++ } ++} ++#endif ++ ++int setup_profiling_timer(unsigned int multiplier) ++{ ++ return -EINVAL; ++} ++ ++/* ++ * This initializes the IO-APIC and APIC hardware if this is ++ * a UP kernel. ++ */ ++int __init APIC_init_uniprocessor (void) ++{ ++#ifdef CONFIG_X86_IO_APIC ++ if (smp_found_config) ++ if (!skip_ioapic_setup && nr_ioapics) ++ setup_IO_APIC(); ++#endif ++ ++ return 0; ++} ++ ++static int __init parse_lapic(char *arg) ++{ ++ lapic_enable(); ++ return 0; ++} ++early_param("lapic", parse_lapic); ++ ++static int __init parse_nolapic(char *arg) ++{ ++ lapic_disable(); ++ return 0; ++} ++early_param("nolapic", parse_nolapic); ++ +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/asm-offsets.c +--- a/arch/i386/kernel/asm-offsets.c Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/asm-offsets.c Wed Aug 08 16:25:28 2007 -0300 +@@ -89,9 +89,14 @@ void foo(void) + OFFSET(pbe_orig_address, pbe, orig_address); + OFFSET(pbe_next, pbe, next); + ++#ifndef CONFIG_X86_NO_TSS + /* Offset from the sysenter stack to tss.esp0 */ +- DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) - ++ DEFINE(SYSENTER_stack_esp0, offsetof(struct tss_struct, esp0) - + sizeof(struct tss_struct)); ++#else ++ /* sysenter stack points directly to esp0 */ ++ DEFINE(SYSENTER_stack_esp0, 0); ++#endif + + DEFINE(PAGE_SIZE_asm, PAGE_SIZE); + DEFINE(VDSO_PRELINK, VDSO_PRELINK); +@@ -111,4 +116,10 @@ void foo(void) + OFFSET(PARAVIRT_iret, paravirt_ops, iret); + OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); + #endif ++ ++ ++#ifdef CONFIG_XEN ++ BLANK(); ++ OFFSET(XEN_START_mfn_list, start_info, mfn_list); ++#endif + } +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/cpu/common.c +--- a/arch/i386/kernel/cpu/common.c Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/cpu/common.c Wed Aug 08 16:25:28 2007 -0300 +@@ -19,6 +19,10 @@ + #include + #endif + #include ++ ++#ifdef CONFIG_XEN ++#include ++#endif + + #include "cpu.h" + +@@ -601,6 +605,31 @@ void __init early_cpu_init(void) + #endif + } + ++/* We can't move load_gdt to asm/desc.h because it lacks make_lowmen_page_readonly() ++ definition, and as this is still the only user of load_gdt in xen. ++ ToDo: JQ ++ */ ++ ++#ifdef CONFIG_XEN ++#undef load_gdt ++static void __cpuinit load_gdt(struct Xgt_desc_struct *gdt_descr) ++{ ++ unsigned long frames[16]; ++ unsigned long va; ++ int f; ++ ++ for (va = gdt_descr->address, f = 0; ++ va < gdt_descr->address + gdt_descr->size; ++ va += PAGE_SIZE, f++) { ++ frames[f] = virt_to_mfn(va); ++ make_lowmem_page_readonly( ++ (void *)va, XENFEAT_writable_descriptor_tables); ++ } ++ if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8)) ++ BUG(); ++} ++#endif /* CONFIG_XEN */ ++ + /* Make sure %gs is initialized properly in idle threads */ + struct pt_regs * __devinit idle_regs(struct pt_regs *regs) + { +@@ -633,6 +662,10 @@ static __cpuinit int alloc_gdt(int cpu) + + memset(gdt, 0, PAGE_SIZE); + memset(pda, 0, sizeof(*pda)); ++#ifdef CONFIG_XEN ++ memcpy(gdt, cpu_gdt_table, GDT_SIZE); ++ cpu_gdt_descr->size = GDT_SIZE; ++#endif + } else { + /* GDT and PDA might already have been allocated if + this is a CPU hotplug re-insertion. */ +@@ -690,14 +723,18 @@ __cpuinit int init_gdt(int cpu, struct t + + BUG_ON(gdt == NULL || pda == NULL); + ++#ifndef CONFIG_XEN + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + memcpy(gdt, cpu_gdt_table, GDT_SIZE); + cpu_gdt_descr->size = GDT_SIZE - 1; +- +- pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, ++#endif ++ ++ ++ if (cpu == 0) ++ pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, + (u32 *)&gdt[GDT_ENTRY_PDA].b, + (unsigned long)pda, sizeof(*pda) - 1, + 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ +@@ -724,7 +761,9 @@ void __cpuinit cpu_set_gdt(int cpu) + /* Common CPU init for both boot and secondary CPUs */ + static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) + { ++#ifndef CONFIG_X86_NO_TSS + struct tss_struct * t = &per_cpu(init_tss, cpu); ++#endif + struct thread_struct *thread = &curr->thread; + + if (cpu_test_and_set(cpu, cpu_initialized)) { +@@ -743,7 +782,9 @@ static void __cpuinit _cpu_init(int cpu, + set_in_cr4(X86_CR4_TSD); + } + ++#ifndef CONFIG_X86_NO_IDT + load_idt(&idt_descr); ++#endif + + /* + * Set up and load the per-CPU TSS and LDT +@@ -755,8 +796,10 @@ static void __cpuinit _cpu_init(int cpu, + enter_lazy_tlb(&init_mm, curr); + + load_esp0(t, thread); ++#ifndef CONFIG_X86_NO_TSS + set_tss_desc(cpu,t); + load_TR_desc(); ++#endif + load_LDT(&init_mm.context); + + #ifdef CONFIG_DOUBLEFAULT +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/cpu/mtrr/Makefile +--- a/arch/i386/kernel/cpu/mtrr/Makefile Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/cpu/mtrr/Makefile Wed Aug 08 16:25:28 2007 -0300 +@@ -1,3 +1,10 @@ obj-y := main.o if.o generic.o state.o + obj-y := main.o if.o generic.o state.o + obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o + ++ifdef CONFIG_XEN ++include $(srctree)/scripts/Makefile.xen ++n-obj-xen := generic.o state.o amd.o cyrix.o centaur.o ++ ++obj-y := $(call filterxen, $(obj-y), $(n-obj-xen)) ++obj-y := $(call cherrypickxen, $(obj-y)) ++endif +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/cpu/mtrr/main-xen.c +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/arch/i386/kernel/cpu/mtrr/main-xen.c Wed Aug 08 16:25:28 2007 -0300 +@@ -0,0 +1,198 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "mtrr.h" ++ ++static DEFINE_MUTEX(mtrr_mutex); ++ ++static void generic_get_mtrr(unsigned int reg, unsigned long *base, ++ unsigned long *size, mtrr_type * type) ++{ ++ struct xen_platform_op op; ++ ++ op.cmd = XENPF_read_memtype; ++ op.u.read_memtype.reg = reg; ++ (void)HYPERVISOR_platform_op(&op); ++ ++ *size = op.u.read_memtype.nr_mfns; ++ *base = op.u.read_memtype.mfn; ++ *type = op.u.read_memtype.type; ++} ++ ++struct mtrr_ops generic_mtrr_ops = { ++ .use_intel_if = 1, ++ .get = generic_get_mtrr, ++}; ++ ++struct mtrr_ops *mtrr_if = &generic_mtrr_ops; ++unsigned int num_var_ranges; ++unsigned int *usage_table; ++ ++/* This function returns the number of variable MTRRs */ ++static void __init set_num_var_ranges(void) ++{ ++ struct xen_platform_op op; ++ ++ for (num_var_ranges = 0; ; num_var_ranges++) { ++ op.cmd = XENPF_read_memtype; ++ op.u.read_memtype.reg = num_var_ranges; ++ if (HYPERVISOR_platform_op(&op) != 0) ++ break; ++ } ++} ++ ++static void __init init_table(void) ++{ ++ int i, max; ++ ++ max = num_var_ranges; ++ if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) ++ == NULL) { ++ printk(KERN_ERR "mtrr: could not allocate\n"); ++ return; ++ } ++ for (i = 0; i < max; i++) ++ usage_table[i] = 0; ++} ++ ++int mtrr_add_page(unsigned long base, unsigned long size, ++ unsigned int type, char increment) ++{ ++ int error; ++ struct xen_platform_op op; ++ ++ mutex_lock(&mtrr_mutex); ++ ++ op.cmd = XENPF_add_memtype; ++ op.u.add_memtype.mfn = base; ++ op.u.add_memtype.nr_mfns = size; ++ op.u.add_memtype.type = type; ++ error = HYPERVISOR_platform_op(&op); ++ if (error) { ++ mutex_unlock(&mtrr_mutex); ++ BUG_ON(error > 0); ++ return error; ++ } ++ ++ if (increment) ++ ++usage_table[op.u.add_memtype.reg]; ++ ++ mutex_unlock(&mtrr_mutex); ++ ++ return op.u.add_memtype.reg; ++} ++ ++static int mtrr_check(unsigned long base, unsigned long size) ++{ ++ if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { ++ printk(KERN_WARNING ++ "mtrr: size and base must be multiples of 4 kiB\n"); ++ printk(KERN_DEBUG ++ "mtrr: size: 0x%lx base: 0x%lx\n", size, base); ++ dump_stack(); ++ return -1; ++ } ++ return 0; ++} ++ ++int ++mtrr_add(unsigned long base, unsigned long size, unsigned int type, ++ char increment) ++{ ++ if (mtrr_check(base, size)) ++ return -EINVAL; ++ return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, ++ increment); ++} ++ ++int mtrr_del_page(int reg, unsigned long base, unsigned long size) ++{ ++ unsigned i; ++ mtrr_type ltype; ++ unsigned long lbase, lsize; ++ int error = -EINVAL; ++ struct xen_platform_op op; ++ ++ mutex_lock(&mtrr_mutex); ++ ++ if (reg < 0) { ++ /* Search for existing MTRR */ ++ for (i = 0; i < num_var_ranges; ++i) { ++ mtrr_if->get(i, &lbase, &lsize, <ype); ++ if (lbase == base && lsize == size) { ++ reg = i; ++ break; ++ } ++ } ++ if (reg < 0) { ++ printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, ++ size); ++ goto out; ++ } ++ } ++ if (usage_table[reg] < 1) { ++ printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); ++ goto out; ++ } ++ if (--usage_table[reg] < 1) { ++ op.cmd = XENPF_del_memtype; ++ op.u.del_memtype.handle = 0; ++ op.u.del_memtype.reg = reg; ++ error = HYPERVISOR_platform_op(&op); ++ if (error) { ++ BUG_ON(error > 0); ++ goto out; ++ } ++ } ++ error = reg; ++ out: ++ mutex_unlock(&mtrr_mutex); ++ return error; ++} ++ ++int ++mtrr_del(int reg, unsigned long base, unsigned long size) ++{ ++ if (mtrr_check(base, size)) ++ return -EINVAL; ++ return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); ++} ++ ++EXPORT_SYMBOL(mtrr_add); ++EXPORT_SYMBOL(mtrr_del); ++ ++void __init mtrr_bp_init(void) ++{ ++} ++ ++void mtrr_ap_init(void) ++{ ++} ++ ++static int __init mtrr_init(void) ++{ ++ struct cpuinfo_x86 *c = &boot_cpu_data; ++ ++ if (!is_initial_xendomain()) ++ return -ENODEV; ++ ++ if ((!cpu_has(c, X86_FEATURE_MTRR)) && ++ (!cpu_has(c, X86_FEATURE_K6_MTRR)) && ++ (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && ++ (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) ++ return -ENODEV; ++ ++ set_num_var_ranges(); ++ init_table(); ++ ++ return 0; ++} ++ ++subsys_initcall(mtrr_init); +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/crash.c +--- a/arch/i386/kernel/crash.c Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/crash.c Wed Aug 08 16:25:28 2007 -0300 +@@ -31,6 +31,7 @@ + /* This keeps a track of which one is crashing cpu. */ + static int crashing_cpu; + ++#ifndef CONFIG_XEN + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) + static atomic_t waiting_for_crash_ipi; + +@@ -112,6 +113,7 @@ static void nmi_shootdown_cpus(void) + /* There are no cpus to shootdown */ + } + #endif ++#endif /* CONFIG_XEN */ + + void machine_crash_shutdown(struct pt_regs *regs) + { +@@ -128,10 +130,12 @@ void machine_crash_shutdown(struct pt_re + + /* Make a note of crashing cpu. Will be used in NMI callback.*/ + crashing_cpu = safe_smp_processor_id(); ++#ifndef CONFIG_XEN + nmi_shootdown_cpus(); + lapic_shutdown(); + #if defined(CONFIG_X86_IO_APIC) + disable_IO_APIC(); + #endif ++#endif /* CONFIG_XEN */ + crash_save_cpu(regs, safe_smp_processor_id()); + } +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/e820.c +--- a/arch/i386/kernel/e820.c Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/e820.c Wed Aug 08 16:25:28 2007 -0300 +@@ -15,10 +15,24 @@ + #include + #include + ++#ifdef CONFIG_XEN ++#include ++#include ++#include ++#include ++#include ++#endif ++ + #ifdef CONFIG_EFI + int efi_enabled = 0; + EXPORT_SYMBOL(efi_enabled); + #endif ++ ++#ifdef CONFIG_XEN ++struct e820map machine_e820; ++#endif ++static void __init ++e820_setup_gap(struct e820entry *e820, int nr_map); + + struct e820map e820; + struct change_member { +@@ -180,6 +194,12 @@ static void __init probe_roms(void) + unsigned char *rom; + int i; + ++#ifdef CONFIG_XEN ++ /* Nothing to do if not running in dom0. */ ++ if (!is_initial_xendomain()) ++ return; ++#endif ++ + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { +@@ -247,36 +267,54 @@ legacy_init_iomem_resources(struct resou + legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource) + { + int i; ++ struct e820entry *map = e820.map; ++ int nr_map = e820.nr_map; ++#ifdef CONFIG_XEN_PRIVILEGED_GUEST ++ struct xen_memory_map memmap; ++ ++ map = machine_e820.map; ++ memmap.nr_entries = E820MAX; ++ ++ set_xen_guest_handle(memmap.buffer, map); ++ ++ if(HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) ++ BUG(); ++ machine_e820.nr_map = memmap.nr_entries; ++ nr_map = memmap.nr_entries; ++ e820_setup_gap(map, memmap.nr_entries); ++#endif + + probe_roms(); +- for (i = 0; i < e820.nr_map; i++) { ++ for (i = 0; i < nr_map; i++) { + struct resource *res; + #ifndef CONFIG_RESOURCES_64BIT +- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) ++ if (map[i].addr + map[i].size > 0x100000000ULL) + continue; + #endif + res = kzalloc(sizeof(struct resource), GFP_ATOMIC); +- switch (e820.map[i].type) { ++ switch (map[i].type) { + case E820_RAM: res->name = "System RAM"; break; + case E820_ACPI: res->name = "ACPI Tables"; break; + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; + default: res->name = "reserved"; + } +- res->start = e820.map[i].addr; +- res->end = res->start + e820.map[i].size - 1; ++ res->start = map[i].addr; ++ res->end = res->start + map[i].size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + if (request_resource(&iomem_resource, res)) { + kfree(res); + continue; + } +- if (e820.map[i].type == E820_RAM) { ++ if (map[i].type == E820_RAM) { + /* + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. + */ ++#ifndef CONFIG_XEN + request_resource(res, code_resource); + request_resource(res, data_resource); ++#endif + #ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); + #endif +@@ -295,6 +333,11 @@ static int __init request_standard_resou + int i; + + printk("Setting up standard PCI resources\n"); ++#ifdef CONFIG_XEN ++ /* Nothing to do if not running in dom0. */ ++ if (!is_initial_xendomain()) ++ return 0; ++#endif + if (efi_enabled) + efi_initialize_iomem_resources(&code_resource, &data_resource); + else +@@ -514,10 +557,13 @@ int __init sanitize_e820_map(struct e820 + */ + int __init copy_e820_map(struct e820entry * biosmap, int nr_map) + { ++#ifndef CONFIG_XEN + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; +- ++#else ++ BUG_ON(nr_map < 1); ++#endif + do { + unsigned long long start = biosmap->addr; + unsigned long long size = biosmap->size; +@@ -529,6 +575,7 @@ int __init copy_e820_map(struct e820entr + if (start > end) + return -1; + ++#ifndef CONFIG_XEN + /* + * Some BIOSes claim RAM in the 640k - 1M region. + * Not right. Fix it up. +@@ -549,6 +596,7 @@ int __init copy_e820_map(struct e820entr + size = end - start; + } + } ++#endif + add_memory_region(start, size, type); + } while (biosmap++,--nr_map); + return 0; +@@ -653,6 +701,15 @@ void __init register_bootmem_low_pages(u + */ + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + ++#ifdef CONFIG_XEN ++ /* ++ * Truncate to the number of actual pages currently ++ * present. ++ */ ++ if (last_pfn > xen_start_info->nr_pages) ++ last_pfn = xen_start_info->nr_pages; ++#endif ++ + if (last_pfn > max_low_pfn) + last_pfn = max_low_pfn; + +@@ -668,7 +725,12 @@ void __init register_bootmem_low_pages(u + } + } + +-void __init e820_register_memory(void) ++/* ++ * Locate a unused range of the physical address space below 4G which ++ * can be used for PCI mappings. ++ */ ++static void __init ++e820_setup_gap(struct e820entry *e820, int nr_map) + { + unsigned long gapstart, gapsize, round; + unsigned long long last; +@@ -681,10 +743,10 @@ void __init e820_register_memory(void) + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; +- i = e820.nr_map; ++ i = nr_map; + while (--i >= 0) { +- unsigned long long start = e820.map[i].addr; +- unsigned long long end = start + e820.map[i].size; ++ unsigned long long start = e820[i].addr; ++ unsigned long long end = start + e820[i].size; + + /* + * Since "last" is at most 4GB, we know we'll +@@ -714,6 +776,13 @@ void __init e820_register_memory(void) + + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", + pci_mem_start, gapstart, gapsize); ++} ++ ++void __init e820_register_memory(void) ++{ ++#ifndef CONFIG_XEN ++ e820_setup_gap(e820.map, e820.nr_map); ++#endif + } + + void __init print_memory_map(char *who) +@@ -784,7 +853,7 @@ static __init __always_inline void efi_l + + void __init limit_regions(unsigned long long size) + { +- unsigned long long current_addr; ++ unsigned long long current_addr = 0; + int i; + + print_memory_map("limit_regions start"); +@@ -813,6 +882,19 @@ void __init limit_regions(unsigned long + print_memory_map("limit_regions endfor"); + return; + } ++#ifdef CONFIG_XEN ++ if (i==e820.nr_map && current_addr < size) { ++ /* ++ * The e820 map finished before our requested size so ++ * extend the final entry to the requested address. ++ */ ++ --i; ++ if (e820.map[i].type == E820_RAM) ++ e820.map[i].size -= current_addr - size; ++ else ++ add_memory_region(current_addr, size - current_addr, E820_RAM); ++ } ++#endif + print_memory_map("limit_regions endfunc"); + } + +@@ -828,8 +910,15 @@ e820_all_mapped(unsigned long s, unsigne + u64 start = s; + u64 end = e; + int i; ++#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; ++#else ++ if (!is_initial_xendomain()) ++ return 0; ++ for (i = 0; i < machine_e820.nr_map; ++i) { ++ const struct e820entry *ei = &machine_e820.map[i]; ++#endif + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/entry-xen.S +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/arch/i386/kernel/entry-xen.S Wed Aug 08 16:25:28 2007 -0300 +@@ -0,0 +1,1264 @@ ++/* ++ * linux/arch/i386/entry.S ++ * ++ * Copyright (C) 1991, 1992 Linus Torvalds ++ */ ++ ++/* ++ * entry.S contains the system-call and fault low-level handling routines. ++ * This also contains the timer-interrupt handler, as well as all interrupts ++ * and faults that can result in a task-switch. ++ * ++ * NOTE: This code handles signal-recognition, which happens every time ++ * after a timer-interrupt and after each system call. ++ * ++ * I changed all the .align's to 4 (16 byte alignment), as that's faster ++ * on a 486. ++ * ++ * Stack layout in 'ret_from_system_call': ++ * ptrace needs to have all regs on the stack. ++ * if the order here is changed, it needs to be ++ * updated in fork.c:copy_process, signal.c:do_signal, ++ * ptrace.c and ptrace.h ++ * ++ * 0(%esp) - %ebx ++ * 4(%esp) - %ecx ++ * 8(%esp) - %edx ++ * C(%esp) - %esi ++ * 10(%esp) - %edi ++ * 14(%esp) - %ebp ++ * 18(%esp) - %eax ++ * 1C(%esp) - %ds ++ * 20(%esp) - %es ++ * 24(%esp) - %gs ++ * 28(%esp) - orig_eax ++ * 2C(%esp) - %eip ++ * 30(%esp) - %cs ++ * 34(%esp) - %eflags ++ * 38(%esp) - %oldesp ++ * 3C(%esp) - %oldss ++ * ++ * "current" is in register %ebx during any slow entries. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "irq_vectors.h" ++#include ++ ++/* ++ * We use macros for low-level operations which need to be overridden ++ * for paravirtualization. The following will never clobber any registers: ++ * INTERRUPT_RETURN (aka. "iret") ++ * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") ++ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). ++ * ++ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must ++ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). ++ * Allowing a register to be clobbered can shrink the paravirt replacement ++ * enough to patch inline, increasing performance. ++ */ ++ ++#define nr_syscalls ((syscall_table_size)/4) ++ ++CF_MASK = 0x00000001 ++TF_MASK = 0x00000100 ++IF_MASK = 0x00000200 ++DF_MASK = 0x00000400 ++NT_MASK = 0x00004000 ++VM_MASK = 0x00020000 ++/* Pseudo-eflags. */ ++NMI_MASK = 0x80000000 ++ ++#ifdef CONFIG_XEN ++/* Offsets into shared_info_t. */ ++#define evtchn_upcall_pending /* 0 */ ++#define evtchn_upcall_mask 1 ++ ++#define sizeof_vcpu_shift 6 ++ ++#ifdef CONFIG_SMP ++#define GET_VCPU_INFO movl %gs:PDA_cpu,%esi ; \ ++ shl $sizeof_vcpu_shift,%esi ; \ ++ addl HYPERVISOR_shared_info,%esi ++#else ++#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi ++#endif ++ ++#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi) ++#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi) ++#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi) ++#endif ++ ++#ifdef CONFIG_PREEMPT ++#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF ++#else ++#define preempt_stop(clobbers) ++#define resume_kernel restore_nocheck ++#endif ++ ++.macro TRACE_IRQS_IRET ++#ifdef CONFIG_TRACE_IRQFLAGS ++ testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off? ++ jz 1f ++ TRACE_IRQS_ON ++1: ++#endif ++.endm ++ ++#ifdef CONFIG_VM86 ++#define resume_userspace_sig check_userspace ++#else ++#define resume_userspace_sig resume_userspace ++#endif ++ ++#define SAVE_ALL \ ++ cld; \ ++ pushl %gs; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ /*CFI_REL_OFFSET gs, 0;*/\ ++ pushl %es; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ /*CFI_REL_OFFSET es, 0;*/\ ++ pushl %ds; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ /*CFI_REL_OFFSET ds, 0;*/\ ++ pushl %eax; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET eax, 0;\ ++ pushl %ebp; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET ebp, 0;\ ++ pushl %edi; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET edi, 0;\ ++ pushl %esi; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET esi, 0;\ ++ pushl %edx; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET edx, 0;\ ++ pushl %ecx; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET ecx, 0;\ ++ pushl %ebx; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET ebx, 0;\ ++ movl $(__USER_DS), %edx; \ ++ movl %edx, %ds; \ ++ movl %edx, %es; \ ++ movl $(__KERNEL_PDA), %edx; \ ++ movl %edx, %gs ++ ++#define RESTORE_INT_REGS \ ++ popl %ebx; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE ebx;\ ++ popl %ecx; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE ecx;\ ++ popl %edx; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE edx;\ ++ popl %esi; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE esi;\ ++ popl %edi; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE edi;\ ++ popl %ebp; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE ebp;\ ++ popl %eax; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE eax ++ ++#define RESTORE_REGS \ ++ RESTORE_INT_REGS; \ ++1: popl %ds; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ /*CFI_RESTORE ds;*/\ ++2: popl %es; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ /*CFI_RESTORE es;*/\ ++3: popl %gs; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ /*CFI_RESTORE gs;*/\ ++.pushsection .fixup,"ax"; \ ++4: movl $0,(%esp); \ ++ jmp 1b; \ ++5: movl $0,(%esp); \ ++ jmp 2b; \ ++6: movl $0,(%esp); \ ++ jmp 3b; \ ++.section __ex_table,"a";\ ++ .align 4; \ ++ .long 1b,4b; \ ++ .long 2b,5b; \ ++ .long 3b,6b; \ ++.popsection ++ ++#define RING0_INT_FRAME \ ++ CFI_STARTPROC simple;\ ++ CFI_SIGNAL_FRAME;\ ++ CFI_DEF_CFA esp, 3*4;\ ++ /*CFI_OFFSET cs, -2*4;*/\ ++ CFI_OFFSET eip, -3*4 ++ ++#define RING0_EC_FRAME \ ++ CFI_STARTPROC simple;\ ++ CFI_SIGNAL_FRAME;\ ++ CFI_DEF_CFA esp, 4*4;\ ++ /*CFI_OFFSET cs, -2*4;*/\ ++ CFI_OFFSET eip, -3*4 ++ ++#define RING0_PTREGS_FRAME \ ++ CFI_STARTPROC simple;\ ++ CFI_SIGNAL_FRAME;\ ++ CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ ++ /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ ++ CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ ++ /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ ++ /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ ++ CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ ++ CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ ++ CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ ++ CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ ++ CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ ++ CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ ++ CFI_OFFSET ebx, PT_EBX-PT_OLDESP ++ ++ENTRY(ret_from_fork) ++ CFI_STARTPROC ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ call schedule_tail ++ GET_THREAD_INFO(%ebp) ++ popl %eax ++ CFI_ADJUST_CFA_OFFSET -4 ++ pushl $0x0202 # Reset kernel eflags ++ CFI_ADJUST_CFA_OFFSET 4 ++ popfl ++ CFI_ADJUST_CFA_OFFSET -4 ++ jmp syscall_exit ++ CFI_ENDPROC ++ ++/* ++ * Return to user mode is not as complex as all this looks, ++ * but we want the default path for a system call return to ++ * go as quickly as possible which is why some of this is ++ * less clear than it otherwise should be. ++ */ ++ ++ # userspace resumption stub bypassing syscall exit tracing ++ ALIGN ++ RING0_PTREGS_FRAME ++ret_from_exception: ++ preempt_stop(CLBR_ANY) ++ret_from_intr: ++ GET_THREAD_INFO(%ebp) ++check_userspace: ++ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS ++ movb PT_CS(%esp), %al ++ andl $(VM_MASK | SEGMENT_RPL_MASK), %eax ++ cmpl $USER_RPL, %eax ++ jb resume_kernel # not returning to v8086 or userspace ++ ++ENTRY(resume_userspace) ++ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt ++ # setting need_resched or sigpending ++ # between sampling and the iret ++ movl TI_flags(%ebp), %ecx ++ andl $_TIF_WORK_MASK, %ecx # is there any work to be done on ++ # int/exception return? ++ jne work_pending ++ jmp restore_all ++ ++#ifdef CONFIG_PREEMPT ++ENTRY(resume_kernel) ++ DISABLE_INTERRUPTS(CLBR_ANY) ++ cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? ++ jnz restore_nocheck ++need_resched: ++ movl TI_flags(%ebp), %ecx # need_resched set ? ++ testb $_TIF_NEED_RESCHED, %cl ++ jz restore_all ++ testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ? ++ jz restore_all ++ call preempt_schedule_irq ++ jmp need_resched ++#endif ++ CFI_ENDPROC ++ ++/* SYSENTER_RETURN points to after the "sysenter" instruction in ++ the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ ++ ++ # sysenter call handler stub ++ENTRY(sysenter_entry) ++ CFI_STARTPROC simple ++ CFI_SIGNAL_FRAME ++ CFI_DEF_CFA esp, 0 ++ CFI_REGISTER esp, ebp ++ movl SYSENTER_stack_esp0(%esp),%esp ++sysenter_past_esp: ++ /* ++ * No need to follow this irqs on/off section: the syscall ++ * disabled irqs and here we enable it straight after entry: ++ */ ++ ENABLE_INTERRUPTS(CLBR_NONE) ++ pushl $(__USER_DS) ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET ss, 0*/ ++ pushl %ebp ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET esp, 0 ++ pushfl ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $(__USER_CS) ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET cs, 0*/ ++#ifndef CONFIG_COMPAT_VDSO ++ /* ++ * Push current_thread_info()->sysenter_return to the stack. ++ * A tiny bit of offset fixup is necessary - 4*4 means the 4 words ++ * pushed above; +8 corresponds to copy_thread's esp0 setting. ++ */ ++ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) ++#else ++ pushl $SYSENTER_RETURN ++#endif ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET eip, 0 ++ ++/* ++ * Load the potential sixth argument from user stack. ++ * Careful about security. ++ */ ++ cmpl $__PAGE_OFFSET-3,%ebp ++ jae syscall_fault ++1: movl (%ebp),%ebp ++.section __ex_table,"a" ++ .align 4 ++ .long 1b,syscall_fault ++.previous ++ ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ GET_THREAD_INFO(%ebp) ++ ++ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ ++ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) ++ jnz syscall_trace_entry ++ cmpl $(nr_syscalls), %eax ++ jae syscall_badsys ++ call *sys_call_table(,%eax,4) ++ movl %eax,PT_EAX(%esp) ++ DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) ++ TRACE_IRQS_OFF ++ movl TI_flags(%ebp), %ecx ++ testw $_TIF_ALLWORK_MASK, %cx ++ jne syscall_exit_work ++/* if something modifies registers it must also disable sysexit */ ++ movl PT_EIP(%esp), %edx ++ movl PT_OLDESP(%esp), %ecx ++ xorl %ebp,%ebp ++ TRACE_IRQS_ON ++1: mov PT_GS(%esp), %gs ++#ifdef CONFIG_XEN ++ __ENABLE_INTERRUPTS ++sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ++ __TEST_PENDING ++ jnz 14f # process more events if necessary... ++ movl PT_ESI(%esp), %esi ++ sysexit ++14: __DISABLE_INTERRUPTS ++ TRACE_IRQS_OFF ++sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ++ push %esp ++ CFI_ADJUST_CFA_OFFSET 4 ++ call evtchn_do_upcall ++ add $4,%esp ++ CFI_ADJUST_CFA_OFFSET -4 ++ jmp ret_from_intr ++#else ++ ENABLE_INTERRUPTS_SYSEXIT ++#endif /* !CONFIG_XEN */ ++ CFI_ENDPROC ++.pushsection .fixup,"ax" ++2: movl $0,PT_GS(%esp) ++ jmp 1b ++.section __ex_table,"a" ++ .align 4 ++ .long 1b,2b ++.popsection ++ ++ # system call handler stub ++ENTRY(system_call) ++ RING0_INT_FRAME # can't unwind into user space anyway ++ pushl %eax # save orig_eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ GET_THREAD_INFO(%ebp) ++ testl $TF_MASK,PT_EFLAGS(%esp) ++ jz no_singlestep ++ orl $_TIF_SINGLESTEP,TI_flags(%ebp) ++no_singlestep: ++ # system call tracing in operation / emulation ++ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ ++ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) ++ jnz syscall_trace_entry ++ cmpl $(nr_syscalls), %eax ++ jae syscall_badsys ++syscall_call: ++ call *sys_call_table(,%eax,4) ++ movl %eax,PT_EAX(%esp) # store the return value ++syscall_exit: ++ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt ++ # setting need_resched or sigpending ++ # between sampling and the iret ++ TRACE_IRQS_OFF ++ movl TI_flags(%ebp), %ecx ++ testw $_TIF_ALLWORK_MASK, %cx # current->work ++ jne syscall_exit_work ++ ++restore_all: ++#ifndef CONFIG_XEN ++ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS ++ # Warning: PT_OLDSS(%esp) contains the wrong/random values if we ++ # are returning to the kernel. ++ # See comments in process.c:copy_thread() for details. ++ movb PT_OLDSS(%esp), %ah ++ movb PT_CS(%esp), %al ++ andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax ++ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax ++ CFI_REMEMBER_STATE ++ je ldt_ss # returning to user-space with LDT SS ++restore_nocheck: ++#else ++restore_nocheck: ++ movl PT_EFLAGS(%esp), %eax ++ testl $(VM_MASK|NMI_MASK), %eax ++ CFI_REMEMBER_STATE ++ jnz hypervisor_iret ++ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF ++ GET_VCPU_INFO ++ andb evtchn_upcall_mask(%esi),%al ++ andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask ++ CFI_REMEMBER_STATE ++ jnz restore_all_enable_events # != 0 => enable event delivery ++ CFI_REMEMBER_STATE ++#endif ++ TRACE_IRQS_IRET ++restore_nocheck_notrace: ++ RESTORE_REGS ++ addl $4, %esp # skip orig_eax/error_code ++ CFI_ADJUST_CFA_OFFSET -4 ++1: INTERRUPT_RETURN ++.section .fixup,"ax" ++iret_exc: ++#ifndef CONFIG_XEN ++ TRACE_IRQS_ON ++ ENABLE_INTERRUPTS(CLBR_NONE) ++#endif ++ pushl $0 # no error code ++ pushl $do_iret_error ++ jmp error_code ++.previous ++.section __ex_table,"a" ++ .align 4 ++ .long 1b,iret_exc ++.previous ++ ++ CFI_RESTORE_STATE ++#ifndef CONFIG_XEN ++ldt_ss: ++ larl PT_OLDSS(%esp), %eax ++ jnz restore_nocheck ++ testl $0x00400000, %eax # returning to 32bit stack? ++ jnz restore_nocheck # allright, normal return ++ ++#ifdef CONFIG_PARAVIRT ++ /* ++ * The kernel can't run on a non-flat stack if paravirt mode ++ * is active. Rather than try to fixup the high bits of ++ * ESP, bypass this code entirely. This may break DOSemu ++ * and/or Wine support in a paravirt VM, although the option ++ * is still available to implement the setting of the high ++ * 16-bits in the INTERRUPT_RETURN paravirt-op. ++ */ ++ cmpl $0, paravirt_ops+PARAVIRT_enabled ++ jne restore_nocheck ++#endif ++ ++ /* If returning to userspace with 16bit stack, ++ * try to fix the higher word of ESP, as the CPU ++ * won't restore it. ++ * This is an "official" bug of all the x86-compatible ++ * CPUs, which we can try to work around to make ++ * dosemu and wine happy. */ ++ movl PT_OLDESP(%esp), %eax ++ movl %esp, %edx ++ call patch_espfix_desc ++ pushl $__ESPFIX_SS ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ DISABLE_INTERRUPTS(CLBR_EAX) ++ TRACE_IRQS_OFF ++ lss (%esp), %esp ++ CFI_ADJUST_CFA_OFFSET -8 ++ jmp restore_nocheck ++#else ++ ALIGN ++restore_all_enable_events: ++ TRACE_IRQS_ON ++ __ENABLE_INTERRUPTS ++scrit: /**** START OF CRITICAL REGION ****/ ++ __TEST_PENDING ++ jnz 14f # process more events if necessary... ++ RESTORE_REGS ++ addl $4, %esp ++ CFI_ADJUST_CFA_OFFSET -4 ++1: iret ++.section __ex_table,"a" ++ .align 4 ++ .long 1b,iret_exc ++.previous ++14: __DISABLE_INTERRUPTS ++ TRACE_IRQS_OFF ++ jmp 11f ++ecrit: /**** END OF CRITICAL REGION ****/ ++ ++ CFI_RESTORE_STATE ++hypervisor_iret: ++ andl $~NMI_MASK, PT_EFLAGS(%esp) ++ RESTORE_REGS ++ addl $4, %esp ++ CFI_ADJUST_CFA_OFFSET -4 ++ jmp hypercall_page + (__HYPERVISOR_iret * 32) ++#endif ++ CFI_ENDPROC ++ ++ # perform work that needs to be done immediately before resumption ++ ALIGN ++ RING0_PTREGS_FRAME # can't unwind into user space anyway ++work_pending: ++ testb $_TIF_NEED_RESCHED, %cl ++ jz work_notifysig ++work_resched: ++ call schedule ++ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt ++ # setting need_resched or sigpending ++ # between sampling and the iret ++ TRACE_IRQS_OFF ++ movl TI_flags(%ebp), %ecx ++ andl $_TIF_WORK_MASK, %ecx # is there any work to be done other ++ # than syscall tracing? ++ jz restore_all ++ testb $_TIF_NEED_RESCHED, %cl ++ jnz work_resched ++ ++work_notifysig: # deal with pending signals and ++ # notify-resume requests ++#ifdef CONFIG_VM86 ++ testl $VM_MASK, PT_EFLAGS(%esp) ++ movl %esp, %eax ++ jne work_notifysig_v86 # returning to kernel-space or ++ # vm86-space ++ xorl %edx, %edx ++ call do_notify_resume ++ jmp resume_userspace_sig ++ ++ ALIGN ++work_notifysig_v86: ++ pushl %ecx # save ti_flags for do_notify_resume ++ CFI_ADJUST_CFA_OFFSET 4 ++ call save_v86_state # %eax contains pt_regs pointer ++ popl %ecx ++ CFI_ADJUST_CFA_OFFSET -4 ++ movl %eax, %esp ++#else ++ movl %esp, %eax ++#endif ++ xorl %edx, %edx ++ call do_notify_resume ++ jmp resume_userspace_sig ++ ++ # perform syscall exit tracing ++ ALIGN ++syscall_trace_entry: ++ movl $-ENOSYS,PT_EAX(%esp) ++ movl %esp, %eax ++ xorl %edx,%edx ++ call do_syscall_trace ++ cmpl $0, %eax ++ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, ++ # so must skip actual syscall ++ movl PT_ORIG_EAX(%esp), %eax ++ cmpl $(nr_syscalls), %eax ++ jnae syscall_call ++ jmp syscall_exit ++ ++ # perform syscall exit tracing ++ ALIGN ++syscall_exit_work: ++ testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl ++ jz work_pending ++ TRACE_IRQS_ON ++ ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call ++ # schedule() instead ++ movl %esp, %eax ++ movl $1, %edx ++ call do_syscall_trace ++ jmp resume_userspace ++ CFI_ENDPROC ++ ++ RING0_INT_FRAME # can't unwind into user space anyway ++syscall_fault: ++ pushl %eax # save orig_eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ GET_THREAD_INFO(%ebp) ++ movl $-EFAULT,PT_EAX(%esp) ++ jmp resume_userspace ++ ++syscall_badsys: ++ movl $-ENOSYS,PT_EAX(%esp) ++ jmp resume_userspace ++ CFI_ENDPROC ++ ++#ifndef CONFIG_XEN ++#define FIXUP_ESPFIX_STACK \ ++ /* since we are on a wrong stack, we cant make it a C code :( */ \ ++ movl %gs:PDA_cpu, %ebx; \ ++ PER_CPU(cpu_gdt_descr, %ebx); \ ++ movl GDS_address(%ebx), %ebx; \ ++ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ ++ addl %esp, %eax; \ ++ pushl $__KERNEL_DS; \ ++ CFI_ADJUST_CFA_OFFSET 4; \ ++ pushl %eax; \ ++ CFI_ADJUST_CFA_OFFSET 4; \ ++ lss (%esp), %esp; \ ++ CFI_ADJUST_CFA_OFFSET -8; ++#define UNWIND_ESPFIX_STACK \ ++ movl %ss, %eax; \ ++ /* see if on espfix stack */ \ ++ cmpw $__ESPFIX_SS, %ax; \ ++ jne 27f; \ ++ movl $__KERNEL_DS, %eax; \ ++ movl %eax, %ds; \ ++ movl %eax, %es; \ ++ /* switch to normal stack */ \ ++ FIXUP_ESPFIX_STACK; \ ++27:; ++ ++/* ++ * Build the entry stubs and pointer table with ++ * some assembler magic. ++ */ ++.data ++ENTRY(interrupt) ++.text ++ ++vector=0 ++ENTRY(irq_entries_start) ++ RING0_INT_FRAME ++.rept NR_IRQS ++ ALIGN ++ .if vector ++ CFI_ADJUST_CFA_OFFSET -4 ++ .endif ++1: pushl $~(vector) ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp common_interrupt ++.data ++ .long 1b ++.text ++vector=vector+1 ++.endr ++ ++/* ++ * the CPU automatically disables interrupts when executing an IRQ vector, ++ * so IRQ-flags tracing has to follow that: ++ */ ++ ALIGN ++common_interrupt: ++ SAVE_ALL ++ TRACE_IRQS_OFF ++ movl %esp,%eax ++ call do_IRQ ++ jmp ret_from_intr ++ CFI_ENDPROC ++ ++#define BUILD_INTERRUPT(name, nr) \ ++ENTRY(name) \ ++ RING0_INT_FRAME; \ ++ pushl $~(nr); \ ++ CFI_ADJUST_CFA_OFFSET 4; \ ++ SAVE_ALL; \ ++ TRACE_IRQS_OFF \ ++ movl %esp,%eax; \ ++ call smp_/**/name; \ ++ jmp ret_from_intr; \ ++ CFI_ENDPROC ++ ++/* The include is where all of the SMP etc. interrupts come from */ ++#include "entry_arch.h" ++#else ++#define UNWIND_ESPFIX_STACK ++#endif ++ ++KPROBE_ENTRY(page_fault) ++ RING0_EC_FRAME ++ pushl $do_page_fault ++ CFI_ADJUST_CFA_OFFSET 4 ++ ALIGN ++error_code: ++ /* the function address is in %gs's slot on the stack */ ++ pushl %es ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET es, 0*/ ++ pushl %ds ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET ds, 0*/ ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET eax, 0 ++ pushl %ebp ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET ebp, 0 ++ pushl %edi ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET edi, 0 ++ pushl %esi ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET esi, 0 ++ pushl %edx ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET edx, 0 ++ pushl %ecx ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET ecx, 0 ++ pushl %ebx ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET ebx, 0 ++ cld ++ pushl %gs ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET gs, 0*/ ++ movl $(__KERNEL_PDA), %ecx ++ movl %ecx, %gs ++ UNWIND_ESPFIX_STACK ++ popl %ecx ++ CFI_ADJUST_CFA_OFFSET -4 ++ /*CFI_REGISTER es, ecx*/ ++ movl PT_GS(%esp), %edi # get the function address ++ movl PT_ORIG_EAX(%esp), %edx # get the error code ++ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart ++ mov %ecx, PT_GS(%esp) ++ /*CFI_REL_OFFSET gs, ES*/ ++ movl $(__USER_DS), %ecx ++ movl %ecx, %ds ++ movl %ecx, %es ++ movl %esp,%eax # pt_regs pointer ++ call *%edi ++ jmp ret_from_exception ++ CFI_ENDPROC ++KPROBE_END(page_fault) ++ ++#ifdef CONFIG_XEN ++# A note on the "critical region" in our callback handler. ++# We want to avoid stacking callback handlers due to events occurring ++# during handling of the last event. To do this, we keep events disabled ++# until we've done all processing. HOWEVER, we must enable events before ++# popping the stack frame (can't be done atomically) and so it would still ++# be possible to get enough handler activations to overflow the stack. ++# Although unlikely, bugs of that kind are hard to track down, so we'd ++# like to avoid the possibility. ++# So, on entry to the handler we detect whether we interrupted an ++# existing activation in its critical region -- if so, we pop the current ++# activation and restart the handler using the previous one. ++# ++# The sysexit critical region is slightly different. sysexit ++# atomically removes the entire stack frame. If we interrupt in the ++# critical region we know that the entire frame is present and correct ++# so we can simply throw away the new one. ++ENTRY(hypervisor_callback) ++ RING0_INT_FRAME ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ movl PT_EIP(%esp),%eax ++ cmpl $scrit,%eax ++ jb 11f ++ cmpl $ecrit,%eax ++ jb critical_region_fixup ++ cmpl $sysexit_scrit,%eax ++ jb 11f ++ cmpl $sysexit_ecrit,%eax ++ ja 11f ++ # interrupted in sysexit critical ++ addl $PT_OLDESP,%esp # Remove cs...ebx from stack frame. ++11: push %esp ++ CFI_ADJUST_CFA_OFFSET 4 ++ call evtchn_do_upcall ++ add $4,%esp ++ CFI_ADJUST_CFA_OFFSET -4 ++ jmp ret_from_intr ++ CFI_ENDPROC ++ ++# [How we do the fixup]. We want to merge the current stack frame with the ++# just-interrupted frame. How we do this depends on where in the critical ++# region the interrupted handler was executing, and so how many saved ++# registers are in each frame. We do this quickly using the lookup table ++# 'critical_fixup_table'. For each byte offset in the critical region, it ++# provides the number of bytes which have already been popped from the ++# interrupted stack frame. ++critical_region_fixup: ++ movzbl critical_fixup_table-scrit(%eax),%ecx # %eax contains num bytes popped ++ cmpb $0xff,%cl # 0xff => vcpu_info critical region ++ jne 15f ++ xorl %ecx,%ecx ++15: leal (%esp,%ecx),%esi # %esi points at end of src region ++ leal PT_OLDESP(%esp),%edi # %edi points at end of dst region ++ shrl $2,%ecx # convert words to bytes ++ je 17f # skip loop if nothing to copy ++16: subl $4,%esi # pre-decrementing copy loop ++ subl $4,%edi ++ movl (%esi),%eax ++ movl %eax,(%edi) ++ loop 16b ++17: movl %edi,%esp # final %edi is top of merged stack ++ jmp 11b ++ ++.section .rodata,"a" ++critical_fixup_table: ++ .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING ++ .byte 0xff,0xff # jnz 14f ++ .byte 0x00 # pop %ebx ++ .byte 0x04 # pop %ecx ++ .byte 0x08 # pop %edx ++ .byte 0x0c # pop %esi ++ .byte 0x10 # pop %edi ++ .byte 0x14 # pop %ebp ++ .byte 0x18 # pop %eax ++ .byte 0x1c # pop %ds ++ .byte 0x20 # pop %es ++ .byte 0x24,0x24 # pop %gs ++ .byte 0x28,0x28,0x28 # add $4,%esp ++ .byte 0x2c # iret ++ .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi) ++ .byte 0x00,0x00 # jmp 11b ++.previous ++ ++# Hypervisor uses this for application faults while it executes. ++# We get here for two reasons: ++# 1. Fault while reloading DS, ES, FS or GS ++# 2. Fault while executing IRET ++# Category 1 we fix up by reattempting the load, and zeroing the segment ++# register if the load fails. ++# Category 2 we fix up by jumping to do_iret_error. We cannot use the ++# normal Linux return path in this case because if we use the IRET hypercall ++# to pop the stack frame we end up in an infinite loop of failsafe callbacks. ++# We distinguish between categories by maintaining a status value in EAX. ++ENTRY(failsafe_callback) ++ RING0_INT_FRAME ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ movl $1,%eax ++1: mov 4(%esp),%ds ++2: mov 8(%esp),%es ++3: mov 12(%esp),%fs ++4: mov 16(%esp),%gs ++ testl %eax,%eax ++ popl %eax ++ CFI_ADJUST_CFA_OFFSET -4 ++ jz 5f ++ addl $16,%esp # EAX != 0 => Category 2 (Bad IRET) ++ jmp iret_exc ++5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment) ++ pushl $0 ++ SAVE_ALL ++ jmp ret_from_exception ++.section .fixup,"ax"; \ ++6: xorl %eax,%eax; \ ++ movl %eax,4(%esp); \ ++ jmp 1b; \ ++7: xorl %eax,%eax; \ ++ movl %eax,8(%esp); \ ++ jmp 2b; \ ++8: xorl %eax,%eax; \ ++ movl %eax,12(%esp); \ ++ jmp 3b; \ ++9: xorl %eax,%eax; \ ++ movl %eax,16(%esp); \ ++ jmp 4b; \ ++.previous; \ ++.section __ex_table,"a"; \ ++ .align 4; \ ++ .long 1b,6b; \ ++ .long 2b,7b; \ ++ .long 3b,8b; \ ++ .long 4b,9b; \ ++.previous ++#endif ++ CFI_ENDPROC ++ ++ENTRY(coprocessor_error) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_coprocessor_error ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++ ++ENTRY(simd_coprocessor_error) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_simd_coprocessor_error ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++ ++ENTRY(device_not_available) ++ RING0_INT_FRAME ++ pushl $-1 # mark this as an int ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++#ifndef CONFIG_XEN ++ GET_CR0_INTO_EAX ++ testl $0x4, %eax # EM (math emulation bit) ++ je device_available_emulate ++ pushl $0 # temporary storage for ORIG_EIP ++ CFI_ADJUST_CFA_OFFSET 4 ++ call math_emulate ++ addl $4, %esp ++ CFI_ADJUST_CFA_OFFSET -4 ++ jmp ret_from_exception ++device_available_emulate: ++#endif ++ preempt_stop(CLBR_ANY) ++ call math_state_restore ++ jmp ret_from_exception ++ CFI_ENDPROC ++ ++#ifndef CONFIG_XEN ++/* ++ * Debug traps and NMI can happen at the one SYSENTER instruction ++ * that sets up the real kernel stack. Check here, since we can't ++ * allow the wrong stack to be used. ++ * ++ * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have ++ * already pushed 3 words if it hits on the sysenter instruction: ++ * eflags, cs and eip. ++ * ++ * We just load the right stack, and push the three (known) values ++ * by hand onto the new stack - while updating the return eip past ++ * the instruction that would have done it for sysenter. ++ */ ++#define FIX_STACK(offset, ok, label) \ ++ cmpw $__KERNEL_CS,4(%esp); \ ++ jne ok; \ ++label: \ ++ movl SYSENTER_stack_esp0+offset(%esp),%esp; \ ++ CFI_DEF_CFA esp, 0; \ ++ CFI_UNDEFINED eip; \ ++ pushfl; \ ++ CFI_ADJUST_CFA_OFFSET 4; \ ++ pushl $__KERNEL_CS; \ ++ CFI_ADJUST_CFA_OFFSET 4; \ ++ pushl $sysenter_past_esp; \ ++ CFI_ADJUST_CFA_OFFSET 4; \ ++ CFI_REL_OFFSET eip, 0 ++#endif /* CONFIG_XEN */ ++ ++KPROBE_ENTRY(debug) ++ RING0_INT_FRAME ++#ifndef CONFIG_XEN ++ cmpl $sysenter_entry,(%esp) ++ jne debug_stack_correct ++ FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) ++debug_stack_correct: ++#endif /* !CONFIG_XEN */ ++ pushl $-1 # mark this as an int ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ xorl %edx,%edx # error code 0 ++ movl %esp,%eax # pt_regs pointer ++ call do_debug ++ jmp ret_from_exception ++ CFI_ENDPROC ++KPROBE_END(debug) ++ .previous .text ++ ++#ifndef CONFIG_XEN ++/* ++ * NMI is doubly nasty. It can happen _while_ we're handling ++ * a debug fault, and the debug fault hasn't yet been able to ++ * clear up the stack. So we first check whether we got an ++ * NMI on the sysenter entry path, but after that we need to ++ * check whether we got an NMI on the debug path where the debug ++ * fault happened on the sysenter path. ++ */ ++KPROBE_ENTRY(nmi) ++ RING0_INT_FRAME ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ movl %ss, %eax ++ cmpw $__ESPFIX_SS, %ax ++ popl %eax ++ CFI_ADJUST_CFA_OFFSET -4 ++ je nmi_espfix_stack ++ cmpl $sysenter_entry,(%esp) ++ je nmi_stack_fixup ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ movl %esp,%eax ++ /* Do not access memory above the end of our stack page, ++ * it might not exist. ++ */ ++ andl $(THREAD_SIZE-1),%eax ++ cmpl $(THREAD_SIZE-20),%eax ++ popl %eax ++ CFI_ADJUST_CFA_OFFSET -4 ++ jae nmi_stack_correct ++ cmpl $sysenter_entry,12(%esp) ++ je nmi_debug_stack_check ++nmi_stack_correct: ++ /* We have a RING0_INT_FRAME here */ ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ xorl %edx,%edx # zero error code ++ movl %esp,%eax # pt_regs pointer ++ call do_nmi ++ jmp restore_nocheck_notrace ++ CFI_ENDPROC ++ ++nmi_stack_fixup: ++ RING0_INT_FRAME ++ FIX_STACK(12,nmi_stack_correct, 1) ++ jmp nmi_stack_correct ++ ++nmi_debug_stack_check: ++ /* We have a RING0_INT_FRAME here */ ++ cmpw $__KERNEL_CS,16(%esp) ++ jne nmi_stack_correct ++ cmpl $debug,(%esp) ++ jb nmi_stack_correct ++ cmpl $debug_esp_fix_insn,(%esp) ++ ja nmi_stack_correct ++ FIX_STACK(24,nmi_stack_correct, 1) ++ jmp nmi_stack_correct ++ ++nmi_espfix_stack: ++ /* We have a RING0_INT_FRAME here. ++ * ++ * create the pointer to lss back ++ */ ++ pushl %ss ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl %esp ++ CFI_ADJUST_CFA_OFFSET 4 ++ addw $4, (%esp) ++ /* copy the iret frame of 12 bytes */ ++ .rept 3 ++ pushl 16(%esp) ++ CFI_ADJUST_CFA_OFFSET 4 ++ .endr ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ FIXUP_ESPFIX_STACK # %eax == %esp ++ xorl %edx,%edx # zero error code ++ call do_nmi ++ RESTORE_REGS ++ lss 12+4(%esp), %esp # back to espfix stack ++ CFI_ADJUST_CFA_OFFSET -24 ++1: INTERRUPT_RETURN ++ CFI_ENDPROC ++.section __ex_table,"a" ++ .align 4 ++ .long 1b,iret_exc ++.previous ++KPROBE_END(nmi) ++#else ++KPROBE_ENTRY(nmi) ++ RING0_INT_FRAME ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ xorl %edx,%edx # zero error code ++ movl %esp,%eax # pt_regs pointer ++ call do_nmi ++ orl $NMI_MASK, PT_EFLAGS(%esp) ++ jmp restore_all ++ CFI_ENDPROC ++KPROBE_END(nmi) ++#endif ++ ++#ifdef CONFIG_PARAVIRT ++ENTRY(native_iret) ++1: iret ++.section __ex_table,"a" ++ .align 4 ++ .long 1b,iret_exc ++.previous ++ ++ENTRY(native_irq_enable_sysexit) ++ sti ++ sysexit ++#endif ++ ++KPROBE_ENTRY(int3) ++ RING0_INT_FRAME ++ pushl $-1 # mark this as an int ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ xorl %edx,%edx # zero error code ++ movl %esp,%eax # pt_regs pointer ++ call do_int3 ++ jmp ret_from_exception ++ CFI_ENDPROC ++KPROBE_END(int3) ++ ++ENTRY(overflow) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_overflow ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++ ++ENTRY(bounds) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_bounds ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++ ++ENTRY(invalid_op) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_invalid_op ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++ ++ENTRY(coprocessor_segment_overrun) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_coprocessor_segment_overrun ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++ ++ENTRY(invalid_TSS) ++ RING0_EC_FRAME ++ pushl $do_invalid_TSS ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++ ++ENTRY(segment_not_present) ++ RING0_EC_FRAME ++ pushl $do_segment_not_present ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++ ++ENTRY(stack_segment) ++ RING0_EC_FRAME ++ pushl $do_stack_segment ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++ ++KPROBE_ENTRY(general_protection) ++ RING0_EC_FRAME ++ pushl $do_general_protection ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++KPROBE_END(general_protection) ++ ++ENTRY(alignment_check) ++ RING0_EC_FRAME ++ pushl $do_alignment_check ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++ ++ENTRY(divide_error) ++ RING0_INT_FRAME ++ pushl $0 # no error code ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_divide_error ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++ ++#ifdef CONFIG_X86_MCE ++ENTRY(machine_check) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl machine_check_vector ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++#endif ++ ++#ifndef CONFIG_XEN ++ENTRY(spurious_interrupt_bug) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_spurious_interrupt_bug ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++#endif /* !CONFIG_XEN */ ++ ++ENTRY(kernel_thread_helper) ++ pushl $0 # fake return address for unwinder ++ CFI_STARTPROC ++ movl %edx,%eax ++ push %edx ++ CFI_ADJUST_CFA_OFFSET 4 ++ call *%ebx ++ push %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ call do_exit ++ CFI_ENDPROC ++ENDPROC(kernel_thread_helper) ++ ++ENTRY(fixup_4gb_segment) ++ RING0_EC_FRAME ++ pushl $do_fixup_4gb_segment ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++ ++.section .rodata,"a" ++.align 4 ++#include "syscall_table.S" ++ ++syscall_table_size=(.-sys_call_table) +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/entry.S +--- a/arch/i386/kernel/entry.S Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/entry.S Wed Aug 08 16:25:28 2007 -0300 +@@ -284,7 +284,7 @@ ENTRY(sysenter_entry) + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 0 + CFI_REGISTER esp, ebp +- movl TSS_sysenter_esp0(%esp),%esp ++ movl SYSENTER_stack_esp0(%esp),%esp + sysenter_past_esp: + /* + * No need to follow this irqs on/off section: the syscall +@@ -727,7 +727,7 @@ device_not_available_emulate: + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * +- * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have ++ * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * +@@ -739,7 +739,7 @@ device_not_available_emulate: + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ + label: \ +- movl TSS_sysenter_esp0+offset(%esp),%esp; \ ++ movl SYSENTER_stack_esp0+offset(%esp),%esp; \ + CFI_DEF_CFA esp, 0; \ + CFI_UNDEFINED eip; \ + pushfl; \ +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/fixup.c +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/arch/i386/kernel/fixup.c Wed Aug 08 16:25:28 2007 -0300 +@@ -0,0 +1,95 @@ ++/****************************************************************************** ++ * fixup.c ++ * ++ * Binary-rewriting of certain IA32 instructions, on notification by Xen. ++ * Used to avoid repeated slow emulation of common instructions used by the ++ * user-space TLS (Thread-Local Storage) libraries. ++ * ++ * **** NOTE **** ++ * Issues with the binary rewriting have caused it to be removed. Instead ++ * we rely on Xen's emulator to boot the kernel, and then print a banner ++ * message recommending that the user disables /lib/tls. ++ * ++ * Copyright (c) 2004, K A Fraser ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args ) ++ ++fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) ++{ ++#if 0 ++ static unsigned long printed = 0; ++ char info[100]; ++ int i; ++ ++ /* Ignore statically-linked init. */ ++ if (current->tgid == 1) ++ return; ++ ++ HYPERVISOR_vm_assist( ++ VMASST_CMD_disable, VMASST_TYPE_4gb_segments_notify); ++ ++ if (test_and_set_bit(0, &printed)) ++ return; ++ ++ sprintf(info, "%s (pid=%d)", current->comm, current->tgid); ++ ++ DP(""); ++ DP("***************************************************************"); ++ DP("***************************************************************"); ++ DP("** WARNING: Currently emulating unsupported memory accesses **"); ++ DP("** in /lib/tls glibc libraries. The emulation is **"); ++ DP("** slow. To ensure full performance you should **"); ++ DP("** install a 'xen-friendly' (nosegneg) version of **"); ++ DP("** the library, or disable tls support by executing **"); ++ DP("** the following as root: **"); ++ DP("** mv /lib/tls /lib/tls.disabled **"); ++ DP("** Offending process: %-38.38s **", info); ++ DP("***************************************************************"); ++ DP("***************************************************************"); ++ DP(""); ++ ++ for (i = 5; i > 0; i--) { ++ touch_softlockup_watchdog(); ++ printk("Pausing... %d", i); ++ mdelay(1000); ++ printk("\b\b\b\b\b\b\b\b\b\b\b\b"); ++ } ++ ++ printk("Continuing...\n\n"); ++#else ++ if (printk_ratelimit()) ++ printk(KERN_WARNING ++ "4gb seg fixup, process %s (pid %d), cs:ip %02x:%08lx\n", ++ current->comm, current->tgid, regs->xcs, regs->eip); ++#endif ++} ++ ++static int __init fixup_init(void) ++{ ++ HYPERVISOR_vm_assist( ++ VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify); ++ return 0; ++} ++__initcall(fixup_init); +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/head-xen.S +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/arch/i386/kernel/head-xen.S Wed Aug 08 16:25:28 2007 -0300 +@@ -0,0 +1,292 @@ ++ ++ ++.text ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * References to members of the new_cpu_data structure. ++ */ ++ ++#define X86 new_cpu_data+CPUINFO_x86 ++#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor ++#define X86_MODEL new_cpu_data+CPUINFO_x86_model ++#define X86_MASK new_cpu_data+CPUINFO_x86_mask ++#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math ++#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level ++#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability ++#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id ++ ++#define VIRT_ENTRY_OFFSET 0x0 ++.org VIRT_ENTRY_OFFSET ++ENTRY(startup_32) ++ ++#ifdef CONFIG_PARAVIRT ++ movl %cs, %eax ++ testl $0x3, %eax ++ jnz startup_paravirt ++#endif ++ ++ movl %esi,xen_start_info ++ cld ++ ++ call setup_pda ++ ++ /* Set up the stack pointer */ ++ movl $(init_thread_union+THREAD_SIZE),%esp ++ ++ /* get vendor info */ ++ xorl %eax,%eax # call CPUID with 0 -> return vendor ID ++ XEN_CPUID ++ movl %eax,X86_CPUID # save CPUID level ++ movl %ebx,X86_VENDOR_ID # lo 4 chars ++ movl %edx,X86_VENDOR_ID+4 # next 4 chars ++ movl %ecx,X86_VENDOR_ID+8 # last 4 chars ++ ++ movl $1,%eax # Use the CPUID instruction to get CPU type ++ XEN_CPUID ++ movb %al,%cl # save reg for future use ++ andb $0x0f,%ah # mask processor family ++ movb %ah,X86 ++ andb $0xf0,%al # mask model ++ shrb $4,%al ++ movb %al,X86_MODEL ++ andb $0x0f,%cl # mask mask revision ++ movb %cl,X86_MASK ++ movl %edx,X86_CAPABILITY ++ ++ movb $1,X86_HARD_MATH ++ ++ xorl %eax,%eax # Clear FS and LDT ++ movl %eax,%fs ++ ++ movl $(__KERNEL_PDA),%eax ++ mov %eax,%gs ++ ++ cld # gcc2 wants the direction flag cleared at all times ++ ++ pushl %eax # fake return address ++ jmp start_kernel ++ ++/* ++ * Point the GDT at this CPU's PDA. This will be ++ * cpu_gdt_table and boot_pda. ++ */ ++setup_pda: ++ /* get the PDA pointer */ ++ movl $boot_pda, %eax ++ ++ /* slot the PDA address into the GDT */ ++ mov $cpu_gdt_table, %ecx ++ mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ ++ shr $16, %eax ++ mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ ++ mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */ ++ ++ # %esi still points to start_info, and no registers ++ # need to be preserved. ++ ++ movl XEN_START_mfn_list(%esi), %ebx ++ movl $(cpu_gdt_table - __PAGE_OFFSET), %eax ++ shrl $PAGE_SHIFT, %eax ++ movl (%ebx,%eax,4), %ecx ++ pushl %ecx # frame number for set_gdt below ++ ++ xorl %esi, %esi ++ xorl %edx, %edx ++ shldl $PAGE_SHIFT, %ecx, %edx ++ shll $PAGE_SHIFT, %ecx ++ orl $0x61, %ecx ++ movl $cpu_gdt_table, %ebx ++ movl $__HYPERVISOR_update_va_mapping, %eax ++ int $0x82 ++ ++ movl $(PAGE_SIZE_asm / 8), %ecx ++ movl %esp, %ebx ++ movl $__HYPERVISOR_set_gdt, %eax ++ int $0x82 ++ ++ popl %ecx ++ ret ++ ++#define HYPERCALL_PAGE_OFFSET 0x1000 ++.org HYPERCALL_PAGE_OFFSET ++ENTRY(hypercall_page) ++ CFI_STARTPROC ++.skip 0x1000 ++ CFI_ENDPROC ++ ++/* ++ * Real beginning of normal "text" segment ++ */ ++ENTRY(stext) ++ENTRY(_stext) ++ ++/* ++ * BSS section ++ */ ++.section ".bss.page_aligned","w" ++ENTRY(empty_zero_page) ++ .fill 4096,1,0 ++ ++/* ++ * This starts the data section. ++ */ ++.data ++ENTRY(start_pda) ++ .long boot_pda ++ ++#ifdef CONFIG_PARAVIRT ++startup_paravirt: ++ cld ++ movl $(init_thread_union+THREAD_SIZE),%esp ++ ++ /* We take pains to preserve all the regs. */ ++ pushl %edx ++ pushl %ecx ++ pushl %eax ++ ++ /* paravirt.o is last in link, and that probe fn never returns */ ++ pushl $__start_paravirtprobe ++1: ++ movl 0(%esp), %eax ++ pushl (%eax) ++ movl 8(%esp), %eax ++ call *(%esp) ++ popl %eax ++ ++ movl 4(%esp), %eax ++ movl 8(%esp), %ecx ++ movl 12(%esp), %edx ++ ++ addl $4, (%esp) ++ jmp 1b ++#endif ++ ++/* ++ * The Global Descriptor Table contains 28 quadwords, per-CPU. ++ */ ++ .section .data.page_aligned, "aw" ++ .align PAGE_SIZE_asm ++ENTRY(cpu_gdt_table) ++ .quad 0x0000000000000000 /* NULL descriptor */ ++ .quad 0x0000000000000000 /* 0x0b reserved */ ++ .quad 0x0000000000000000 /* 0x13 reserved */ ++ .quad 0x0000000000000000 /* 0x1b reserved */ ++ .quad 0x0000000000000000 /* 0x20 unused */ ++ .quad 0x0000000000000000 /* 0x28 unused */ ++ .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ ++ .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ ++ .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ ++ .quad 0x0000000000000000 /* 0x4b reserved */ ++ .quad 0x0000000000000000 /* 0x53 reserved */ ++ .quad 0x0000000000000000 /* 0x5b reserved */ ++ ++ .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ ++ .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ ++ .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ ++ .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ ++ ++ .quad 0x0000000000000000 /* 0x80 TSS descriptor */ ++ .quad 0x0000000000000000 /* 0x88 LDT descriptor */ ++ ++ /* ++ * Segments used for calling PnP BIOS have byte granularity. ++ * They code segments and data segments have fixed 64k limits, ++ * the transfer segment sizes are set at run time. ++ */ ++ .quad 0x0000000000000000 /* 0x90 32-bit code */ ++ .quad 0x0000000000000000 /* 0x98 16-bit code */ ++ .quad 0x0000000000000000 /* 0xa0 16-bit data */ ++ .quad 0x0000000000000000 /* 0xa8 16-bit data */ ++ .quad 0x0000000000000000 /* 0xb0 16-bit data */ ++ ++ /* ++ * The APM segments have byte granularity and their bases ++ * are set at run time. All have 64k limits. ++ */ ++ .quad 0x0000000000000000 /* 0xb8 APM CS code */ ++ .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */ ++ .quad 0x0000000000000000 /* 0xc8 APM DS data */ ++ ++ .quad 0x0000000000000000 /* 0xd0 - ESPFIX SS */ ++ .quad 0x00cf92000000ffff /* 0xd8 - PDA */ ++ .quad 0x0000000000000000 /* 0xe0 - unused */ ++ .quad 0x0000000000000000 /* 0xe8 - unused */ ++ .quad 0x0000000000000000 /* 0xf0 - unused */ ++ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ ++ .align PAGE_SIZE_asm ++ ++#if CONFIG_XEN_COMPAT <= 0x030002 ++/* ++ * __xen_guest information ++ */ ++.macro utoa value ++ .if (\value) < 0 || (\value) >= 0x10 ++ utoa (((\value)>>4)&0x0fffffff) ++ .endif ++ .if ((\value) & 0xf) < 10 ++ .byte '0' + ((\value) & 0xf) ++ .else ++ .byte 'A' + ((\value) & 0xf) - 10 ++ .endif ++.endm ++ ++.section __xen_guest ++ .ascii "GUEST_OS=linux,GUEST_VER=2.6" ++ .ascii ",XEN_VER=xen-3.0" ++ .ascii ",VIRT_BASE=0x" ++ utoa __PAGE_OFFSET ++ .ascii ",ELF_PADDR_OFFSET=0x" ++ utoa __PAGE_OFFSET ++ .ascii ",VIRT_ENTRY=0x" ++ utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET) ++ .ascii ",HYPERCALL_PAGE=0x" ++ utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT) ++ .ascii ",FEATURES=writable_page_tables" ++ .ascii "|writable_descriptor_tables" ++ .ascii "|auto_translated_physmap" ++ .ascii "|pae_pgdir_above_4gb" ++ .ascii "|supervisor_mode_kernel" ++#ifdef CONFIG_X86_PAE ++ .ascii ",PAE=yes[extended-cr3]" ++#else ++ .ascii ",PAE=no" ++#endif ++ .ascii ",LOADER=generic" ++ .byte 0 ++#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ ++ ++ ++ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux") ++ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6") ++ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") ++ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET) ++#if CONFIG_XEN_COMPAT <= 0x030002 ++ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, __PAGE_OFFSET) ++#else ++ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, 0) ++#endif ++ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32) ++ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) ++ ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START) ++ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") ++#ifdef CONFIG_X86_PAE ++ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") ++ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT) ++#else ++ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") ++ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, _PAGE_PRESENT,_PAGE_PRESENT) ++#endif ++ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") ++ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/init_task.c +--- a/arch/i386/kernel/init_task.c Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/init_task.c Wed Aug 08 16:25:28 2007 -0300 +@@ -14,7 +14,14 @@ static struct files_struct init_files = + static struct files_struct init_files = INIT_FILES; + static struct signal_struct init_signals = INIT_SIGNALS(init_signals); + static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); ++ ++#ifdef CONFIG_XEN ++#define swapper_pg_dir ((pgd_t *)NULL) + struct mm_struct init_mm = INIT_MM(init_mm); ++#undef swapper_pg_dir ++#else ++struct mm_struct init_mm = INIT_MM(init_mm); ++#endif + + EXPORT_SYMBOL(init_mm); + +@@ -38,9 +45,11 @@ struct task_struct init_task = INIT_TASK + + EXPORT_SYMBOL(init_task); + ++#ifndef CONFIG_X86_NO_TSS + /* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. + */ + DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; ++#endif + +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/io_apic-xen.c +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/arch/i386/kernel/io_apic-xen.c Wed Aug 08 16:25:28 2007 -0300 +@@ -0,0 +1,2973 @@ ++/* ++ * Intel IO-APIC support for multi-Pentium hosts. ++ * ++ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo ++ * ++ * Many thanks to Stig Venaas for trying out countless experimental ++ * patches and reporting/debugging problems patiently! ++ * ++ * (c) 1999, Multiple IO-APIC support, developed by ++ * Ken-ichi Yaku and ++ * Hidemi Kishimoto , ++ * further tested and cleaned up by Zach Brown ++ * and Ingo Molnar ++ * ++ * Fixes ++ * Maciej W. Rozycki : Bits for genuine 82489DX APICs; ++ * thanks to Eric Gilmore ++ * and Rolf G. Tews ++ * for testing these extensively ++ * Paul Diefenbaugh : Added full ACPI support ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "io_ports.h" ++ ++#ifdef CONFIG_XEN ++ ++#include ++#include ++ ++/* Fake i8259 */ ++#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) ++#define disable_8259A_irq(_irq) ((void)0) ++#define i8259A_irq_pending(_irq) (0) ++ ++unsigned long io_apic_irqs; ++ ++static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) ++{ ++ struct physdev_apic apic_op; ++ int ret; ++ ++ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; ++ apic_op.reg = reg; ++ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); ++ if (ret) ++ return ret; ++ return apic_op.value; ++} ++ ++static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) ++{ ++ struct physdev_apic apic_op; ++ ++ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; ++ apic_op.reg = reg; ++ apic_op.value = value; ++ HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op); ++} ++ ++#define io_apic_read(a,r) xen_io_apic_read(a,r) ++#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) ++ ++#endif /* CONFIG_XEN */ ++ ++int (*ioapic_renumber_irq)(int ioapic, int irq); ++atomic_t irq_mis_count; ++ ++/* Where if anywhere is the i8259 connect in external int mode */ ++static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; ++ ++static DEFINE_SPINLOCK(ioapic_lock); ++static DEFINE_SPINLOCK(vector_lock); ++ ++int timer_over_8254 __initdata = 1; ++ ++/* ++ * Is the SiS APIC rmw bug present ? ++ * -1 = don't know, 0 = no, 1 = yes ++ */ ++int sis_apic_bug = -1; ++ ++/* ++ * # of IRQ routing registers ++ */ ++int nr_ioapic_registers[MAX_IO_APICS]; ++ ++static int disable_timer_pin_1 __initdata; ++ ++/* ++ * Rough estimation of how many shared IRQs there are, can ++ * be changed anytime. ++ */ ++#define MAX_PLUS_SHARED_IRQS NR_IRQS ++#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) ++ ++/* ++ * This is performance-critical, we want to do it O(1) ++ * ++ * the indexing order of this array favors 1:1 mappings ++ * between pins and IRQs. ++ */ ++ ++static struct irq_pin_list { ++ int apic, pin, next; ++} irq_2_pin[PIN_MAP_SIZE]; ++ ++#ifndef CONFIG_XEN ++struct io_apic { ++ unsigned int index; ++ unsigned int unused[3]; ++ unsigned int data; ++}; ++ ++static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) ++{ ++ return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) ++ + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); ++} ++ ++static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) ++{ ++ struct io_apic __iomem *io_apic = io_apic_base(apic); ++ writel(reg, &io_apic->index); ++ return readl(&io_apic->data); ++} ++ ++static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) ++{ ++ struct io_apic __iomem *io_apic = io_apic_base(apic); ++ writel(reg, &io_apic->index); ++ writel(value, &io_apic->data); ++} ++ ++/* ++ * Re-write a value: to be used for read-modify-write ++ * cycles where the read already set up the index register. ++ * ++ * Older SiS APIC requires we rewrite the index register ++ */ ++static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) ++{ ++ volatile struct io_apic *io_apic = io_apic_base(apic); ++ if (sis_apic_bug) ++ writel(reg, &io_apic->index); ++ writel(value, &io_apic->data); ++} ++#endif /* !CONFIG_XEN */ ++ ++union entry_union { ++ struct { u32 w1, w2; }; ++ struct IO_APIC_route_entry entry; ++}; ++ ++static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) ++{ ++ union entry_union eu; ++ unsigned long flags; ++ spin_lock_irqsave(&ioapic_lock, flags); ++ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); ++ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ return eu.entry; ++} ++ ++/* ++ * When we write a new IO APIC routing entry, we need to write the high ++ * word first! If the mask bit in the low word is clear, we will enable ++ * the interrupt, and we need to make sure the entry is fully populated ++ * before that happens. ++ */ ++static void ++__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) ++{ ++ union entry_union eu; ++ eu.entry = e; ++ io_apic_write(apic, 0x11 + 2*pin, eu.w2); ++ io_apic_write(apic, 0x10 + 2*pin, eu.w1); ++} ++ ++static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) ++{ ++ unsigned long flags; ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __ioapic_write_entry(apic, pin, e); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++/* ++ * When we mask an IO APIC routing entry, we need to write the low ++ * word first, in order to set the mask bit before we change the ++ * high bits! ++ */ ++ ++#ifndef CONFIG_XEN ++static void ioapic_mask_entry(int apic, int pin) ++{ ++ unsigned long flags; ++ union entry_union eu = { .entry.mask = 1 }; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ io_apic_write(apic, 0x10 + 2*pin, eu.w1); ++ io_apic_write(apic, 0x11 + 2*pin, eu.w2); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++#endif ++ ++/* ++ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are ++ * shared ISA-space IRQs, so we have to support them. We are super ++ * fast in the common case, and fast for shared ISA-space IRQs. ++ */ ++static void add_pin_to_irq(unsigned int irq, int apic, int pin) ++{ ++ static int first_free_entry = NR_IRQS; ++ struct irq_pin_list *entry = irq_2_pin + irq; ++ ++ while (entry->next) ++ entry = irq_2_pin + entry->next; ++ ++ if (entry->pin != -1) { ++ entry->next = first_free_entry; ++ entry = irq_2_pin + entry->next; ++ if (++first_free_entry >= PIN_MAP_SIZE) ++ panic("io_apic.c: whoops"); ++ } ++ entry->apic = apic; ++ entry->pin = pin; ++} ++ ++#ifdef CONFIG_XEN ++#define clear_IO_APIC() ((void)0) ++#else ++/* ++ * Reroute an IRQ to a different pin. ++ */ ++static void __init replace_pin_at_irq(unsigned int irq, ++ int oldapic, int oldpin, ++ int newapic, int newpin) ++{ ++ struct irq_pin_list *entry = irq_2_pin + irq; ++ ++ while (1) { ++ if (entry->apic == oldapic && entry->pin == oldpin) { ++ entry->apic = newapic; ++ entry->pin = newpin; ++ } ++ if (!entry->next) ++ break; ++ entry = irq_2_pin + entry->next; ++ } ++} ++ ++static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) ++{ ++ struct irq_pin_list *entry = irq_2_pin + irq; ++ unsigned int pin, reg; ++ ++ for (;;) { ++ pin = entry->pin; ++ if (pin == -1) ++ break; ++ reg = io_apic_read(entry->apic, 0x10 + pin*2); ++ reg &= ~disable; ++ reg |= enable; ++ io_apic_modify(entry->apic, 0x10 + pin*2, reg); ++ if (!entry->next) ++ break; ++ entry = irq_2_pin + entry->next; ++ } ++} ++ ++/* mask = 1 */ ++static void __mask_IO_APIC_irq (unsigned int irq) ++{ ++ __modify_IO_APIC_irq(irq, 0x00010000, 0); ++} ++ ++/* mask = 0 */ ++static void __unmask_IO_APIC_irq (unsigned int irq) ++{ ++ __modify_IO_APIC_irq(irq, 0, 0x00010000); ++} ++ ++/* mask = 1, trigger = 0 */ ++static void __mask_and_edge_IO_APIC_irq (unsigned int irq) ++{ ++ __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); ++} ++ ++/* mask = 0, trigger = 1 */ ++static void __unmask_and_level_IO_APIC_irq (unsigned int irq) ++{ ++ __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); ++} ++ ++static void mask_IO_APIC_irq (unsigned int irq) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __mask_IO_APIC_irq(irq); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++static void unmask_IO_APIC_irq (unsigned int irq) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __unmask_IO_APIC_irq(irq); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) ++{ ++ struct IO_APIC_route_entry entry; ++ ++ /* Check delivery_mode to be sure we're not clearing an SMI pin */ ++ entry = ioapic_read_entry(apic, pin); ++ if (entry.delivery_mode == dest_SMI) ++ return; ++ ++ /* ++ * Disable it in the IO-APIC irq-routing table: ++ */ ++ ioapic_mask_entry(apic, pin); ++} ++ ++static void clear_IO_APIC (void) ++{ ++ int apic, pin; ++ ++ for (apic = 0; apic < nr_ioapics; apic++) ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) ++ clear_IO_APIC_pin(apic, pin); ++} ++ ++#ifdef CONFIG_SMP ++static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) ++{ ++ unsigned long flags; ++ int pin; ++ struct irq_pin_list *entry = irq_2_pin + irq; ++ unsigned int apicid_value; ++ cpumask_t tmp; ++ ++ cpus_and(tmp, cpumask, cpu_online_map); ++ if (cpus_empty(tmp)) ++ tmp = TARGET_CPUS; ++ ++ cpus_and(cpumask, tmp, CPU_MASK_ALL); ++ ++ apicid_value = cpu_mask_to_apicid(cpumask); ++ /* Prepare to do the io_apic_write */ ++ apicid_value = apicid_value << 24; ++ spin_lock_irqsave(&ioapic_lock, flags); ++ for (;;) { ++ pin = entry->pin; ++ if (pin == -1) ++ break; ++ io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); ++ if (!entry->next) ++ break; ++ entry = irq_2_pin + entry->next; ++ } ++ set_native_irq_info(irq, cpumask); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++#if defined(CONFIG_IRQBALANCE) ++# include /* kernel_thread() */ ++# include /* kstat */ ++# include /* kmalloc() */ ++# include /* time_after() */ ++ ++#ifdef CONFIG_BALANCED_IRQ_DEBUG ++# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) ++# define Dprintk(x...) do { TDprintk(x); } while (0) ++# else ++# define TDprintk(x...) ++# define Dprintk(x...) ++# endif ++ ++#define IRQBALANCE_CHECK_ARCH -999 ++#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) ++#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) ++#define BALANCED_IRQ_MORE_DELTA (HZ/10) ++#define BALANCED_IRQ_LESS_DELTA (HZ) ++ ++static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; ++static int physical_balance __read_mostly; ++static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; ++ ++static struct irq_cpu_info { ++ unsigned long * last_irq; ++ unsigned long * irq_delta; ++ unsigned long irq; ++} irq_cpu_data[NR_CPUS]; ++ ++#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) ++#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) ++#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) ++ ++#define IDLE_ENOUGH(cpu,now) \ ++ (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) ++ ++#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) ++ ++#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) ++ ++static cpumask_t balance_irq_affinity[NR_IRQS] = { ++ [0 ... NR_IRQS-1] = CPU_MASK_ALL ++}; ++ ++void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) ++{ ++ balance_irq_affinity[irq] = mask; ++} ++ ++static unsigned long move(int curr_cpu, cpumask_t allowed_mask, ++ unsigned long now, int direction) ++{ ++ int search_idle = 1; ++ int cpu = curr_cpu; ++ ++ goto inside; ++ ++ do { ++ if (unlikely(cpu == curr_cpu)) ++ search_idle = 0; ++inside: ++ if (direction == 1) { ++ cpu++; ++ if (cpu >= NR_CPUS) ++ cpu = 0; ++ } else { ++ cpu--; ++ if (cpu == -1) ++ cpu = NR_CPUS-1; ++ } ++ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || ++ (search_idle && !IDLE_ENOUGH(cpu,now))); ++ ++ return cpu; ++} ++ ++static inline void balance_irq(int cpu, int irq) ++{ ++ unsigned long now = jiffies; ++ cpumask_t allowed_mask; ++ unsigned int new_cpu; ++ ++ if (irqbalance_disabled) ++ return; ++ ++ cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); ++ new_cpu = move(cpu, allowed_mask, now, 1); ++ if (cpu != new_cpu) { ++ set_pending_irq(irq, cpumask_of_cpu(new_cpu)); ++ } ++} ++ ++static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) ++{ ++ int i, j; ++ Dprintk("Rotating IRQs among CPUs.\n"); ++ for_each_online_cpu(i) { ++ for (j = 0; j < NR_IRQS; j++) { ++ if (!irq_desc[j].action) ++ continue; ++ /* Is it a significant load ? */ ++ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < ++ useful_load_threshold) ++ continue; ++ balance_irq(i, j); ++ } ++ } ++ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, ++ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); ++ return; ++} ++ ++static void do_irq_balance(void) ++{ ++ int i, j; ++ unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); ++ unsigned long move_this_load = 0; ++ int max_loaded = 0, min_loaded = 0; ++ int load; ++ unsigned long useful_load_threshold = balanced_irq_interval + 10; ++ int selected_irq; ++ int tmp_loaded, first_attempt = 1; ++ unsigned long tmp_cpu_irq; ++ unsigned long imbalance = 0; ++ cpumask_t allowed_mask, target_cpu_mask, tmp; ++ ++ for_each_possible_cpu(i) { ++ int package_index; ++ CPU_IRQ(i) = 0; ++ if (!cpu_online(i)) ++ continue; ++ package_index = CPU_TO_PACKAGEINDEX(i); ++ for (j = 0; j < NR_IRQS; j++) { ++ unsigned long value_now, delta; ++ /* Is this an active IRQ? */ ++ if (!irq_desc[j].action) ++ continue; ++ if ( package_index == i ) ++ IRQ_DELTA(package_index,j) = 0; ++ /* Determine the total count per processor per IRQ */ ++ value_now = (unsigned long) kstat_cpu(i).irqs[j]; ++ ++ /* Determine the activity per processor per IRQ */ ++ delta = value_now - LAST_CPU_IRQ(i,j); ++ ++ /* Update last_cpu_irq[][] for the next time */ ++ LAST_CPU_IRQ(i,j) = value_now; ++ ++ /* Ignore IRQs whose rate is less than the clock */ ++ if (delta < useful_load_threshold) ++ continue; ++ /* update the load for the processor or package total */ ++ IRQ_DELTA(package_index,j) += delta; ++ ++ /* Keep track of the higher numbered sibling as well */ ++ if (i != package_index) ++ CPU_IRQ(i) += delta; ++ /* ++ * We have sibling A and sibling B in the package ++ * ++ * cpu_irq[A] = load for cpu A + load for cpu B ++ * cpu_irq[B] = load for cpu B ++ */ ++ CPU_IRQ(package_index) += delta; ++ } ++ } ++ /* Find the least loaded processor package */ ++ for_each_online_cpu(i) { ++ if (i != CPU_TO_PACKAGEINDEX(i)) ++ continue; ++ if (min_cpu_irq > CPU_IRQ(i)) { ++ min_cpu_irq = CPU_IRQ(i); ++ min_loaded = i; ++ } ++ } ++ max_cpu_irq = ULONG_MAX; ++ ++tryanothercpu: ++ /* Look for heaviest loaded processor. ++ * We may come back to get the next heaviest loaded processor. ++ * Skip processors with trivial loads. ++ */ ++ tmp_cpu_irq = 0; ++ tmp_loaded = -1; ++ for_each_online_cpu(i) { ++ if (i != CPU_TO_PACKAGEINDEX(i)) ++ continue; ++ if (max_cpu_irq <= CPU_IRQ(i)) ++ continue; ++ if (tmp_cpu_irq < CPU_IRQ(i)) { ++ tmp_cpu_irq = CPU_IRQ(i); ++ tmp_loaded = i; ++ } ++ } ++ ++ if (tmp_loaded == -1) { ++ /* In the case of small number of heavy interrupt sources, ++ * loading some of the cpus too much. We use Ingo's original ++ * approach to rotate them around. ++ */ ++ if (!first_attempt && imbalance >= useful_load_threshold) { ++ rotate_irqs_among_cpus(useful_load_threshold); ++ return; ++ } ++ goto not_worth_the_effort; ++ } ++ ++ first_attempt = 0; /* heaviest search */ ++ max_cpu_irq = tmp_cpu_irq; /* load */ ++ max_loaded = tmp_loaded; /* processor */ ++ imbalance = (max_cpu_irq - min_cpu_irq) / 2; ++ ++ Dprintk("max_loaded cpu = %d\n", max_loaded); ++ Dprintk("min_loaded cpu = %d\n", min_loaded); ++ Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq); ++ Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq); ++ Dprintk("load imbalance = %lu\n", imbalance); ++ ++ /* if imbalance is less than approx 10% of max load, then ++ * observe diminishing returns action. - quit ++ */ ++ if (imbalance < (max_cpu_irq >> 3)) { ++ Dprintk("Imbalance too trivial\n"); ++ goto not_worth_the_effort; ++ } ++ ++tryanotherirq: ++ /* if we select an IRQ to move that can't go where we want, then ++ * see if there is another one to try. ++ */ ++ move_this_load = 0; ++ selected_irq = -1; ++ for (j = 0; j < NR_IRQS; j++) { ++ /* Is this an active IRQ? */ ++ if (!irq_desc[j].action) ++ continue; ++ if (imbalance <= IRQ_DELTA(max_loaded,j)) ++ continue; ++ /* Try to find the IRQ that is closest to the imbalance ++ * without going over. ++ */ ++ if (move_this_load < IRQ_DELTA(max_loaded,j)) { ++ move_this_load = IRQ_DELTA(max_loaded,j); ++ selected_irq = j; ++ } ++ } ++ if (selected_irq == -1) { ++ goto tryanothercpu; ++ } ++ ++ imbalance = move_this_load; ++ ++ /* For physical_balance case, we accumlated both load ++ * values in the one of the siblings cpu_irq[], ++ * to use the same code for physical and logical processors ++ * as much as possible. ++ * ++ * NOTE: the cpu_irq[] array holds the sum of the load for ++ * sibling A and sibling B in the slot for the lowest numbered ++ * sibling (A), _AND_ the load for sibling B in the slot for ++ * the higher numbered sibling. ++ * ++ * We seek the least loaded sibling by making the comparison ++ * (A+B)/2 vs B ++ */ ++ load = CPU_IRQ(min_loaded) >> 1; ++ for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { ++ if (load > CPU_IRQ(j)) { ++ /* This won't change cpu_sibling_map[min_loaded] */ ++ load = CPU_IRQ(j); ++ min_loaded = j; ++ } ++ } ++ ++ cpus_and(allowed_mask, ++ cpu_online_map, ++ balance_irq_affinity[selected_irq]); ++ target_cpu_mask = cpumask_of_cpu(min_loaded); ++ cpus_and(tmp, target_cpu_mask, allowed_mask); ++ ++ if (!cpus_empty(tmp)) { ++ ++ Dprintk("irq = %d moved to cpu = %d\n", ++ selected_irq, min_loaded); ++ /* mark for change destination */ ++ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); ++ ++ /* Since we made a change, come back sooner to ++ * check for more variation. ++ */ ++ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, ++ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); ++ return; ++ } ++ goto tryanotherirq; ++ ++not_worth_the_effort: ++ /* ++ * if we did not find an IRQ to move, then adjust the time interval ++ * upward ++ */ ++ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, ++ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); ++ Dprintk("IRQ worth rotating not found\n"); ++ return; ++} ++ ++static int balanced_irq(void *unused) ++{ ++ int i; ++ unsigned long prev_balance_time = jiffies; ++ long time_remaining = balanced_irq_interval; ++ ++ daemonize("kirqd"); ++ ++ /* push everything to CPU 0 to give us a starting point. */ ++ for (i = 0 ; i < NR_IRQS ; i++) { ++ irq_desc[i].pending_mask = cpumask_of_cpu(0); ++ set_pending_irq(i, cpumask_of_cpu(0)); ++ } ++ ++ for ( ; ; ) { ++ time_remaining = schedule_timeout_interruptible(time_remaining); ++ try_to_freeze(); ++ if (time_after(jiffies, ++ prev_balance_time+balanced_irq_interval)) { ++ preempt_disable(); ++ do_irq_balance(); ++ prev_balance_time = jiffies; ++ time_remaining = balanced_irq_interval; ++ preempt_enable(); ++ } ++ } ++ return 0; ++} ++ ++static int __init balanced_irq_init(void) ++{ ++ int i; ++ struct cpuinfo_x86 *c; ++ cpumask_t tmp; ++ ++ cpus_shift_right(tmp, cpu_online_map, 2); ++ c = &boot_cpu_data; ++ /* When not overwritten by the command line ask subarchitecture. */ ++ if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) ++ irqbalance_disabled = NO_BALANCE_IRQ; ++ if (irqbalance_disabled) ++ return 0; ++ ++ /* disable irqbalance completely if there is only one processor online */ ++ if (num_online_cpus() < 2) { ++ irqbalance_disabled = 1; ++ return 0; ++ } ++ /* ++ * Enable physical balance only if more than 1 physical processor ++ * is present ++ */ ++ if (smp_num_siblings > 1 && !cpus_empty(tmp)) ++ physical_balance = 1; ++ ++ for_each_online_cpu(i) { ++ irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); ++ irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); ++ if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { ++ printk(KERN_ERR "balanced_irq_init: out of memory"); ++ goto failed; ++ } ++ memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); ++ memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); ++ } ++ ++ printk(KERN_INFO "Starting balanced_irq\n"); ++ if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) ++ return 0; ++ else ++ printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); ++failed: ++ for_each_possible_cpu(i) { ++ kfree(irq_cpu_data[i].irq_delta); ++ irq_cpu_data[i].irq_delta = NULL; ++ kfree(irq_cpu_data[i].last_irq); ++ irq_cpu_data[i].last_irq = NULL; ++ } ++ return 0; ++} ++ ++int __init irqbalance_disable(char *str) ++{ ++ irqbalance_disabled = 1; ++ return 1; ++} ++ ++__setup("noirqbalance", irqbalance_disable); ++ ++late_initcall(balanced_irq_init); ++#endif /* CONFIG_IRQBALANCE */ ++#endif /* CONFIG_SMP */ ++#endif /* !CONFIG_XEN */ ++ ++#ifndef CONFIG_SMP ++void fastcall send_IPI_self(int vector) ++{ ++#ifndef CONFIG_XEN ++ unsigned int cfg; ++ ++ /* ++ * Wait for idle. ++ */ ++ apic_wait_icr_idle(); ++ cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; ++ /* ++ * Send the IPI. The write to APIC_ICR fires this off. ++ */ ++ apic_write_around(APIC_ICR, cfg); ++#endif ++} ++#endif /* !CONFIG_SMP */ ++ ++ ++/* ++ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to ++ * specific CPU-side IRQs. ++ */ ++ ++#define MAX_PIRQS 8 ++static int pirq_entries [MAX_PIRQS]; ++static int pirqs_enabled; ++int skip_ioapic_setup; ++ ++static int __init ioapic_setup(char *str) ++{ ++ skip_ioapic_setup = 1; ++ return 1; ++} ++ ++__setup("noapic", ioapic_setup); ++ ++static int __init ioapic_pirq_setup(char *str) ++{ ++ int i, max; ++ int ints[MAX_PIRQS+1]; ++ ++ get_options(str, ARRAY_SIZE(ints), ints); ++ ++ for (i = 0; i < MAX_PIRQS; i++) ++ pirq_entries[i] = -1; ++ ++ pirqs_enabled = 1; ++ apic_printk(APIC_VERBOSE, KERN_INFO ++ "PIRQ redirection, working around broken MP-BIOS.\n"); ++ max = MAX_PIRQS; ++ if (ints[0] < MAX_PIRQS) ++ max = ints[0]; ++ ++ for (i = 0; i < max; i++) { ++ apic_printk(APIC_VERBOSE, KERN_DEBUG ++ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); ++ /* ++ * PIRQs are mapped upside down, usually. ++ */ ++ pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; ++ } ++ return 1; ++} ++ ++__setup("pirq=", ioapic_pirq_setup); ++ ++/* ++ * Find the IRQ entry number of a certain pin. ++ */ ++static int find_irq_entry(int apic, int pin, int type) ++{ ++ int i; ++ ++ for (i = 0; i < mp_irq_entries; i++) ++ if (mp_irqs[i].mpc_irqtype == type && ++ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || ++ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && ++ mp_irqs[i].mpc_dstirq == pin) ++ return i; ++ ++ return -1; ++} ++ ++/* ++ * Find the pin to which IRQ[irq] (ISA) is connected ++ */ ++static int __init find_isa_irq_pin(int irq, int type) ++{ ++ int i; ++ ++ for (i = 0; i < mp_irq_entries; i++) { ++ int lbus = mp_irqs[i].mpc_srcbus; ++ ++ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || ++ mp_bus_id_to_type[lbus] == MP_BUS_EISA || ++ mp_bus_id_to_type[lbus] == MP_BUS_MCA ++ ) && ++ (mp_irqs[i].mpc_irqtype == type) && ++ (mp_irqs[i].mpc_srcbusirq == irq)) ++ ++ return mp_irqs[i].mpc_dstirq; ++ } ++ return -1; ++} ++ ++static int __init find_isa_irq_apic(int irq, int type) ++{ ++ int i; ++ ++ for (i = 0; i < mp_irq_entries; i++) { ++ int lbus = mp_irqs[i].mpc_srcbus; ++ ++ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || ++ mp_bus_id_to_type[lbus] == MP_BUS_EISA || ++ mp_bus_id_to_type[lbus] == MP_BUS_MCA ++ ) && ++ (mp_irqs[i].mpc_irqtype == type) && ++ (mp_irqs[i].mpc_srcbusirq == irq)) ++ break; ++ } ++ if (i < mp_irq_entries) { ++ int apic; ++ for(apic = 0; apic < nr_ioapics; apic++) { ++ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) ++ return apic; ++ } ++ } ++ ++ return -1; ++} ++ ++/* ++ * Find a specific PCI IRQ entry. ++ * Not an __init, possibly needed by modules ++ */ ++static int pin_2_irq(int idx, int apic, int pin); ++ ++int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) ++{ ++ int apic, i, best_guess = -1; ++ ++ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " ++ "slot:%d, pin:%d.\n", bus, slot, pin); ++ if (mp_bus_id_to_pci_bus[bus] == -1) { ++ printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); ++ return -1; ++ } ++ for (i = 0; i < mp_irq_entries; i++) { ++ int lbus = mp_irqs[i].mpc_srcbus; ++ ++ for (apic = 0; apic < nr_ioapics; apic++) ++ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || ++ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) ++ break; ++ ++ if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && ++ !mp_irqs[i].mpc_irqtype && ++ (bus == lbus) && ++ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { ++ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); ++ ++ if (!(apic || IO_APIC_IRQ(irq))) ++ continue; ++ ++ if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) ++ return irq; ++ /* ++ * Use the first all-but-pin matching entry as a ++ * best-guess fuzzy result for broken mptables. ++ */ ++ if (best_guess < 0) ++ best_guess = irq; ++ } ++ } ++ return best_guess; ++} ++EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); ++ ++/* ++ * This function currently is only a helper for the i386 smp boot process where ++ * we need to reprogram the ioredtbls to cater for the cpus which have come online ++ * so mask in all cases should simply be TARGET_CPUS ++ */ ++#ifdef CONFIG_SMP ++#ifndef CONFIG_XEN ++void __init setup_ioapic_dest(void) ++{ ++ int pin, ioapic, irq, irq_entry; ++ ++ if (skip_ioapic_setup == 1) ++ return; ++ ++ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { ++ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { ++ irq_entry = find_irq_entry(ioapic, pin, mp_INT); ++ if (irq_entry == -1) ++ continue; ++ irq = pin_2_irq(irq_entry, ioapic, pin); ++ set_ioapic_affinity_irq(irq, TARGET_CPUS); ++ } ++ ++ } ++} ++#endif /* !CONFIG_XEN */ ++#endif ++ ++/* ++ * EISA Edge/Level control register, ELCR ++ */ ++static int EISA_ELCR(unsigned int irq) ++{ ++ if (irq < 16) { ++ unsigned int port = 0x4d0 + (irq >> 3); ++ return (inb(port) >> (irq & 7)) & 1; ++ } ++ apic_printk(APIC_VERBOSE, KERN_INFO ++ "Broken MPtable reports ISA irq %d\n", irq); ++ return 0; ++} ++ ++/* EISA interrupts are always polarity zero and can be edge or level ++ * trigger depending on the ELCR value. If an interrupt is listed as ++ * EISA conforming in the MP table, that means its trigger type must ++ * be read in from the ELCR */ ++ ++#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) ++#define default_EISA_polarity(idx) (0) ++ ++/* ISA interrupts are always polarity zero edge triggered, ++ * when listed as conforming in the MP table. */ ++ ++#define default_ISA_trigger(idx) (0) ++#define default_ISA_polarity(idx) (0) ++ ++/* PCI interrupts are always polarity one level triggered, ++ * when listed as conforming in the MP table. */ ++ ++#define default_PCI_trigger(idx) (1) ++#define default_PCI_polarity(idx) (1) ++ ++/* MCA interrupts are always polarity zero level triggered, ++ * when listed as conforming in the MP table. */ ++ ++#define default_MCA_trigger(idx) (1) ++#define default_MCA_polarity(idx) (0) ++ ++static int __init MPBIOS_polarity(int idx) ++{ ++ int bus = mp_irqs[idx].mpc_srcbus; ++ int polarity; ++ ++ /* ++ * Determine IRQ line polarity (high active or low active): ++ */ ++ switch (mp_irqs[idx].mpc_irqflag & 3) ++ { ++ case 0: /* conforms, ie. bus-type dependent polarity */ ++ { ++ switch (mp_bus_id_to_type[bus]) ++ { ++ case MP_BUS_ISA: /* ISA pin */ ++ { ++ polarity = default_ISA_polarity(idx); ++ break; ++ } ++ case MP_BUS_EISA: /* EISA pin */ ++ { ++ polarity = default_EISA_polarity(idx); ++ break; ++ } ++ case MP_BUS_PCI: /* PCI pin */ ++ { ++ polarity = default_PCI_polarity(idx); ++ break; ++ } ++ case MP_BUS_MCA: /* MCA pin */ ++ { ++ polarity = default_MCA_polarity(idx); ++ break; ++ } ++ default: ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ polarity = 1; ++ break; ++ } ++ } ++ break; ++ } ++ case 1: /* high active */ ++ { ++ polarity = 0; ++ break; ++ } ++ case 2: /* reserved */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ polarity = 1; ++ break; ++ } ++ case 3: /* low active */ ++ { ++ polarity = 1; ++ break; ++ } ++ default: /* invalid */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ polarity = 1; ++ break; ++ } ++ } ++ return polarity; ++} ++ ++static int MPBIOS_trigger(int idx) ++{ ++ int bus = mp_irqs[idx].mpc_srcbus; ++ int trigger; ++ ++ /* ++ * Determine IRQ trigger mode (edge or level sensitive): ++ */ ++ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) ++ { ++ case 0: /* conforms, ie. bus-type dependent */ ++ { ++ switch (mp_bus_id_to_type[bus]) ++ { ++ case MP_BUS_ISA: /* ISA pin */ ++ { ++ trigger = default_ISA_trigger(idx); ++ break; ++ } ++ case MP_BUS_EISA: /* EISA pin */ ++ { ++ trigger = default_EISA_trigger(idx); ++ break; ++ } ++ case MP_BUS_PCI: /* PCI pin */ ++ { ++ trigger = default_PCI_trigger(idx); ++ break; ++ } ++ case MP_BUS_MCA: /* MCA pin */ ++ { ++ trigger = default_MCA_trigger(idx); ++ break; ++ } ++ default: ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ trigger = 1; ++ break; ++ } ++ } ++ break; ++ } ++ case 1: /* edge */ ++ { ++ trigger = 0; ++ break; ++ } ++ case 2: /* reserved */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ trigger = 1; ++ break; ++ } ++ case 3: /* level */ ++ { ++ trigger = 1; ++ break; ++ } ++ default: /* invalid */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ trigger = 0; ++ break; ++ } ++ } ++ return trigger; ++} ++ ++static inline int irq_polarity(int idx) ++{ ++ return MPBIOS_polarity(idx); ++} ++ ++static inline int irq_trigger(int idx) ++{ ++ return MPBIOS_trigger(idx); ++} ++ ++static int pin_2_irq(int idx, int apic, int pin) ++{ ++ int irq, i; ++ int bus = mp_irqs[idx].mpc_srcbus; ++ ++ /* ++ * Debugging check, we are in big trouble if this message pops up! ++ */ ++ if (mp_irqs[idx].mpc_dstirq != pin) ++ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); ++ ++ switch (mp_bus_id_to_type[bus]) ++ { ++ case MP_BUS_ISA: /* ISA pin */ ++ case MP_BUS_EISA: ++ case MP_BUS_MCA: ++ { ++ irq = mp_irqs[idx].mpc_srcbusirq; ++ break; ++ } ++ case MP_BUS_PCI: /* PCI pin */ ++ { ++ /* ++ * PCI IRQs are mapped in order ++ */ ++ i = irq = 0; ++ while (i < apic) ++ irq += nr_ioapic_registers[i++]; ++ irq += pin; ++ ++ /* ++ * For MPS mode, so far only needed by ES7000 platform ++ */ ++ if (ioapic_renumber_irq) ++ irq = ioapic_renumber_irq(apic, irq); ++ ++ break; ++ } ++ default: ++ { ++ printk(KERN_ERR "unknown bus type %d.\n",bus); ++ irq = 0; ++ break; ++ } ++ } ++ ++ /* ++ * PCI IRQ command line redirection. Yes, limits are hardcoded. ++ */ ++ if ((pin >= 16) && (pin <= 23)) { ++ if (pirq_entries[pin-16] != -1) { ++ if (!pirq_entries[pin-16]) { ++ apic_printk(APIC_VERBOSE, KERN_DEBUG ++ "disabling PIRQ%d\n", pin-16); ++ } else { ++ irq = pirq_entries[pin-16]; ++ apic_printk(APIC_VERBOSE, KERN_DEBUG ++ "using PIRQ%d -> IRQ %d\n", ++ pin-16, irq); ++ } ++ } ++ } ++ return irq; ++} ++ ++static inline int IO_APIC_irq_trigger(int irq) ++{ ++ int apic, idx, pin; ++ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { ++ idx = find_irq_entry(apic,pin,mp_INT); ++ if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) ++ return irq_trigger(idx); ++ } ++ } ++ /* ++ * nonexistent IRQs are edge default ++ */ ++ return 0; ++} ++ ++/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ ++static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */ ++ ++static int __assign_irq_vector(int irq) ++{ ++ int vector; ++ struct physdev_irq irq_op; ++ ++ BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); ++ ++ if (irq_vector[irq] > 0) ++ return irq_vector[irq]; ++ irq_op.irq = irq; ++ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) ++ return -ENOSPC; ++ ++ vector = irq_op.vector; ++ irq_vector[irq] = vector; ++ ++ return vector; ++} ++ ++static int assign_irq_vector(int irq) ++{ ++ unsigned long flags; ++ int vector; ++ ++ spin_lock_irqsave(&vector_lock, flags); ++ vector = __assign_irq_vector(irq); ++ spin_unlock_irqrestore(&vector_lock, flags); ++ ++ return vector; ++} ++#ifndef CONFIG_XEN ++static struct irq_chip ioapic_chip; ++ ++#define IOAPIC_AUTO -1 ++#define IOAPIC_EDGE 0 ++#define IOAPIC_LEVEL 1 ++ ++static void ioapic_register_intr(int irq, int vector, unsigned long trigger) ++{ ++ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || ++ trigger == IOAPIC_LEVEL) ++ set_irq_chip_and_handler_name(irq, &ioapic_chip, ++ handle_fasteoi_irq, "fasteoi"); ++ else { ++ irq_desc[irq].status |= IRQ_DELAYED_DISABLE; ++ set_irq_chip_and_handler_name(irq, &ioapic_chip, ++ handle_edge_irq, "edge"); ++ } ++ set_intr_gate(vector, interrupt[irq]); ++} ++#else ++#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0) ++#endif ++ ++static void __init setup_IO_APIC_irqs(void) ++{ ++ struct IO_APIC_route_entry entry; ++ int apic, pin, idx, irq, first_notcon = 1, vector; ++ unsigned long flags; ++ ++ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); ++ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { ++ ++ /* ++ * add it to the IO-APIC irq-routing table: ++ */ ++ memset(&entry,0,sizeof(entry)); ++ ++ entry.delivery_mode = INT_DELIVERY_MODE; ++ entry.dest_mode = INT_DEST_MODE; ++ entry.mask = 0; /* enable IRQ */ ++ entry.dest.logical.logical_dest = ++ cpu_mask_to_apicid(TARGET_CPUS); ++ ++ idx = find_irq_entry(apic,pin,mp_INT); ++ if (idx == -1) { ++ if (first_notcon) { ++ apic_printk(APIC_VERBOSE, KERN_DEBUG ++ " IO-APIC (apicid-pin) %d-%d", ++ mp_ioapics[apic].mpc_apicid, ++ pin); ++ first_notcon = 0; ++ } else ++ apic_printk(APIC_VERBOSE, ", %d-%d", ++ mp_ioapics[apic].mpc_apicid, pin); ++ continue; ++ } ++ ++ entry.trigger = irq_trigger(idx); ++ entry.polarity = irq_polarity(idx); ++ ++ if (irq_trigger(idx)) { ++ entry.trigger = 1; ++ entry.mask = 1; ++ } ++ ++ irq = pin_2_irq(idx, apic, pin); ++ /* ++ * skip adding the timer int on secondary nodes, which causes ++ * a small but painful rift in the time-space continuum ++ */ ++ if (multi_timer_check(apic, irq)) ++ continue; ++ else ++ add_pin_to_irq(irq, apic, pin); ++ ++ if (/*!apic &&*/ !IO_APIC_IRQ(irq)) ++ continue; ++ ++ if (IO_APIC_IRQ(irq)) { ++ vector = assign_irq_vector(irq); ++ entry.vector = vector; ++ ioapic_register_intr(irq, vector, IOAPIC_AUTO); ++ ++ if (!apic && (irq < 16)) ++ disable_8259A_irq(irq); ++ } ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __ioapic_write_entry(apic, pin, entry); ++ set_native_irq_info(irq, TARGET_CPUS); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ } ++ } ++ ++ if (!first_notcon) ++ apic_printk(APIC_VERBOSE, " not connected.\n"); ++} ++ ++/* ++ * Set up the 8259A-master output pin: ++ */ ++#ifndef CONFIG_XEN ++static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) ++{ ++ struct IO_APIC_route_entry entry; ++ ++ memset(&entry,0,sizeof(entry)); ++ ++ disable_8259A_irq(0); ++ ++ /* mask LVT0 */ ++ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); ++ ++ /* ++ * We use logical delivery to get the timer IRQ ++ * to the first CPU. ++ */ ++ entry.dest_mode = INT_DEST_MODE; ++ entry.mask = 0; /* unmask IRQ now */ ++ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); ++ entry.delivery_mode = INT_DELIVERY_MODE; ++ entry.polarity = 0; ++ entry.trigger = 0; ++ entry.vector = vector; ++ ++ /* ++ * The timer IRQ doesn't have to know that behind the ++ * scene we have a 8259A-master in AEOI mode ... ++ */ ++ irq_desc[0].chip = &ioapic_chip; ++ set_irq_handler(0, handle_edge_irq); ++ ++ /* ++ * Add it to the IO-APIC irq-routing table: ++ */ ++ ioapic_write_entry(apic, pin, entry); ++ ++ enable_8259A_irq(0); ++} ++ ++static inline void UNEXPECTED_IO_APIC(void) ++{ ++} ++ ++void __init print_IO_APIC(void) ++{ ++ int apic, i; ++ union IO_APIC_reg_00 reg_00; ++ union IO_APIC_reg_01 reg_01; ++ union IO_APIC_reg_02 reg_02; ++ union IO_APIC_reg_03 reg_03; ++ unsigned long flags; ++ ++ if (apic_verbosity == APIC_QUIET) ++ return; ++ ++ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); ++ for (i = 0; i < nr_ioapics; i++) ++ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", ++ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); ++ ++ /* ++ * We are a bit conservative about what we expect. We have to ++ * know about every hardware change ASAP. ++ */ ++ printk(KERN_INFO "testing the IO APIC.......................\n"); ++ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(apic, 0); ++ reg_01.raw = io_apic_read(apic, 1); ++ if (reg_01.bits.version >= 0x10) ++ reg_02.raw = io_apic_read(apic, 2); ++ if (reg_01.bits.version >= 0x20) ++ reg_03.raw = io_apic_read(apic, 3); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); ++ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); ++ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); ++ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); ++ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); ++ if (reg_00.bits.ID >= get_physical_broadcast()) ++ UNEXPECTED_IO_APIC(); ++ if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) ++ UNEXPECTED_IO_APIC(); ++ ++ printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); ++ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); ++ if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ ++ (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ ++ (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ ++ (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ ++ (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ ++ (reg_01.bits.entries != 0x2E) && ++ (reg_01.bits.entries != 0x3F) ++ ) ++ UNEXPECTED_IO_APIC(); ++ ++ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); ++ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); ++ if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ ++ (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ ++ (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ ++ (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ ++ (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ ++ ) ++ UNEXPECTED_IO_APIC(); ++ if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) ++ UNEXPECTED_IO_APIC(); ++ ++ /* ++ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, ++ * but the value of reg_02 is read as the previous read register ++ * value, so ignore it if reg_02 == reg_01. ++ */ ++ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { ++ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); ++ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); ++ if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) ++ UNEXPECTED_IO_APIC(); ++ } ++ ++ /* ++ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 ++ * or reg_03, but the value of reg_0[23] is read as the previous read ++ * register value, so ignore it if reg_03 == reg_0[12]. ++ */ ++ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && ++ reg_03.raw != reg_01.raw) { ++ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); ++ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); ++ if (reg_03.bits.__reserved_1) ++ UNEXPECTED_IO_APIC(); ++ } ++ ++ printk(KERN_DEBUG ".... IRQ redirection table:\n"); ++ ++ printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" ++ " Stat Dest Deli Vect: \n"); ++ ++ for (i = 0; i <= reg_01.bits.entries; i++) { ++ struct IO_APIC_route_entry entry; ++ ++ entry = ioapic_read_entry(apic, i); ++ ++ printk(KERN_DEBUG " %02x %03X %02X ", ++ i, ++ entry.dest.logical.logical_dest, ++ entry.dest.physical.physical_dest ++ ); ++ ++ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", ++ entry.mask, ++ entry.trigger, ++ entry.irr, ++ entry.polarity, ++ entry.delivery_status, ++ entry.dest_mode, ++ entry.delivery_mode, ++ entry.vector ++ ); ++ } ++ } ++ printk(KERN_DEBUG "IRQ to pin mappings:\n"); ++ for (i = 0; i < NR_IRQS; i++) { ++ struct irq_pin_list *entry = irq_2_pin + i; ++ if (entry->pin < 0) ++ continue; ++ printk(KERN_DEBUG "IRQ%d ", i); ++ for (;;) { ++ printk("-> %d:%d", entry->apic, entry->pin); ++ if (!entry->next) ++ break; ++ entry = irq_2_pin + entry->next; ++ } ++ printk("\n"); ++ } ++ ++ printk(KERN_INFO ".................................... done.\n"); ++ ++ return; ++} ++ ++#if 0 ++ ++static void print_APIC_bitfield (int base) ++{ ++ unsigned int v; ++ int i, j; ++ ++ if (apic_verbosity == APIC_QUIET) ++ return; ++ ++ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); ++ for (i = 0; i < 8; i++) { ++ v = apic_read(base + i*0x10); ++ for (j = 0; j < 32; j++) { ++ if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ ++ apic_write(APIC_ESR, 0); ++ v = apic_read(APIC_ESR); ++ printk(KERN_DEBUG "... APIC ESR: %08x\n", v); ++ } ++ ++ v = apic_read(APIC_ICR); ++ printk(KERN_DEBUG "... APIC ICR: %08x\n", v); ++ v = apic_read(APIC_ICR2); ++ printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); ++ ++ v = apic_read(APIC_LVTT); ++ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); ++ ++ if (maxlvt > 3) { /* PC is LVT#4. */ ++ v = apic_read(APIC_LVTPC); ++ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); ++ } ++ v = apic_read(APIC_LVT0); ++ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); ++ v = apic_read(APIC_LVT1); ++ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); ++ ++ if (maxlvt > 2) { /* ERR is LVT#3. */ ++ v = apic_read(APIC_LVTERR); ++ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); ++ } ++ ++ v = apic_read(APIC_TMICT); ++ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); ++ v = apic_read(APIC_TMCCT); ++ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); ++ v = apic_read(APIC_TDCR); ++ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); ++ printk("\n"); ++} ++ ++void print_all_local_APICs (void) ++{ ++ on_each_cpu(print_local_APIC, NULL, 1, 1); ++} ++ ++void /*__init*/ print_PIC(void) ++{ ++ unsigned int v; ++ unsigned long flags; ++ ++ if (apic_verbosity == APIC_QUIET) ++ return; ++ ++ printk(KERN_DEBUG "\nprinting PIC contents\n"); ++ ++ spin_lock_irqsave(&i8259A_lock, flags); ++ ++ v = inb(0xa1) << 8 | inb(0x21); ++ printk(KERN_DEBUG "... PIC IMR: %04x\n", v); ++ ++ v = inb(0xa0) << 8 | inb(0x20); ++ printk(KERN_DEBUG "... PIC IRR: %04x\n", v); ++ ++ outb(0x0b,0xa0); ++ outb(0x0b,0x20); ++ v = inb(0xa0) << 8 | inb(0x20); ++ outb(0x0a,0xa0); ++ outb(0x0a,0x20); ++ ++ spin_unlock_irqrestore(&i8259A_lock, flags); ++ ++ printk(KERN_DEBUG "... PIC ISR: %04x\n", v); ++ ++ v = inb(0x4d1) << 8 | inb(0x4d0); ++ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); ++} ++ ++#endif /* 0 */ ++ ++#else ++void __init print_IO_APIC(void) { } ++#endif /* !CONFIG_XEN */ ++ ++static void __init enable_IO_APIC(void) ++{ ++ union IO_APIC_reg_01 reg_01; ++ int i8259_apic, i8259_pin; ++ int i, apic; ++ unsigned long flags; ++ ++ for (i = 0; i < PIN_MAP_SIZE; i++) { ++ irq_2_pin[i].pin = -1; ++ irq_2_pin[i].next = 0; ++ } ++ if (!pirqs_enabled) ++ for (i = 0; i < MAX_PIRQS; i++) ++ pirq_entries[i] = -1; ++ ++ /* ++ * The number of IO-APIC IRQ registers (== #pins): ++ */ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_01.raw = io_apic_read(apic, 1); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ nr_ioapic_registers[apic] = reg_01.bits.entries+1; ++ } ++ for(apic = 0; apic < nr_ioapics; apic++) { ++ int pin; ++ /* See if any of the pins is in ExtINT mode */ ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { ++ struct IO_APIC_route_entry entry; ++ entry = ioapic_read_entry(apic, pin); ++ ++ ++ /* If the interrupt line is enabled and in ExtInt mode ++ * I have found the pin where the i8259 is connected. ++ */ ++ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { ++ ioapic_i8259.apic = apic; ++ ioapic_i8259.pin = pin; ++ goto found_i8259; ++ } ++ } ++ } ++ found_i8259: ++ /* Look to see what if the MP table has reported the ExtINT */ ++ /* If we could not find the appropriate pin by looking at the ioapic ++ * the i8259 probably is not connected the ioapic but give the ++ * mptable a chance anyway. ++ */ ++ i8259_pin = find_isa_irq_pin(0, mp_ExtINT); ++ i8259_apic = find_isa_irq_apic(0, mp_ExtINT); ++ /* Trust the MP table if nothing is setup in the hardware */ ++ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { ++ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); ++ ioapic_i8259.pin = i8259_pin; ++ ioapic_i8259.apic = i8259_apic; ++ } ++ /* Complain if the MP table and the hardware disagree */ ++ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && ++ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) ++ { ++ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); ++ } ++ ++ /* ++ * Do not trust the IO-APIC being empty at bootup ++ */ ++ clear_IO_APIC(); ++} ++ ++/* ++ * Not an __init, needed by the reboot code ++ */ ++void disable_IO_APIC(void) ++{ ++ /* ++ * Clear the IO-APIC before rebooting: ++ */ ++ clear_IO_APIC(); ++ ++#ifndef CONFIG_XEN ++ /* ++ * If the i8259 is routed through an IOAPIC ++ * Put that IOAPIC in virtual wire mode ++ * so legacy interrupts can be delivered. ++ */ ++ if (ioapic_i8259.pin != -1) { ++ struct IO_APIC_route_entry entry; ++ ++ memset(&entry, 0, sizeof(entry)); ++ entry.mask = 0; /* Enabled */ ++ entry.trigger = 0; /* Edge */ ++ entry.irr = 0; ++ entry.polarity = 0; /* High */ ++ entry.delivery_status = 0; ++ entry.dest_mode = 0; /* Physical */ ++ entry.delivery_mode = dest_ExtINT; /* ExtInt */ ++ entry.vector = 0; ++ entry.dest.physical.physical_dest = ++ GET_APIC_ID(apic_read(APIC_ID)); ++ ++ /* ++ * Add it to the IO-APIC irq-routing table: ++ */ ++ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); ++ } ++ disconnect_bsp_APIC(ioapic_i8259.pin != -1); ++#endif ++} ++ ++/* ++ * function to set the IO-APIC physical IDs based on the ++ * values stored in the MPC table. ++ * ++ * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 ++ */ ++ ++#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ) ++static void __init setup_ioapic_ids_from_mpc(void) ++{ ++ union IO_APIC_reg_00 reg_00; ++ physid_mask_t phys_id_present_map; ++ int apic; ++ int i; ++ unsigned char old_id; ++ unsigned long flags; ++ ++ /* ++ * Don't check I/O APIC IDs for xAPIC systems. They have ++ * no meaning without the serial APIC bus. ++ */ ++ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ++ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) ++ return; ++ /* ++ * This is broken; anything with a real cpu count has to ++ * circumvent this idiocy regardless. ++ */ ++ phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); ++ ++ /* ++ * Set the IOAPIC ID to the value stored in the MPC table. ++ */ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ ++ /* Read the register 0 value */ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(apic, 0); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ old_id = mp_ioapics[apic].mpc_apicid; ++ ++ if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { ++ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", ++ apic, mp_ioapics[apic].mpc_apicid); ++ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", ++ reg_00.bits.ID); ++ mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; ++ } ++ ++ /* ++ * Sanity check, is the ID really free? Every APIC in a ++ * system must have a unique ID or we get lots of nice ++ * 'stuck on smp_invalidate_needed IPI wait' messages. ++ */ ++ if (check_apicid_used(phys_id_present_map, ++ mp_ioapics[apic].mpc_apicid)) { ++ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", ++ apic, mp_ioapics[apic].mpc_apicid); ++ for (i = 0; i < get_physical_broadcast(); i++) ++ if (!physid_isset(i, phys_id_present_map)) ++ break; ++ if (i >= get_physical_broadcast()) ++ panic("Max APIC ID exceeded!\n"); ++ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", ++ i); ++ physid_set(i, phys_id_present_map); ++ mp_ioapics[apic].mpc_apicid = i; ++ } else { ++ physid_mask_t tmp; ++ tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); ++ apic_printk(APIC_VERBOSE, "Setting %d in the " ++ "phys_id_present_map\n", ++ mp_ioapics[apic].mpc_apicid); ++ physids_or(phys_id_present_map, phys_id_present_map, tmp); ++ } ++ ++ ++ /* ++ * We need to adjust the IRQ routing table ++ * if the ID changed. ++ */ ++ if (old_id != mp_ioapics[apic].mpc_apicid) ++ for (i = 0; i < mp_irq_entries; i++) ++ if (mp_irqs[i].mpc_dstapic == old_id) ++ mp_irqs[i].mpc_dstapic ++ = mp_ioapics[apic].mpc_apicid; ++ ++ /* ++ * Read the right value from the MPC table and ++ * write it into the ID register. ++ */ ++ apic_printk(APIC_VERBOSE, KERN_INFO ++ "...changing IO-APIC physical APIC ID to %d ...", ++ mp_ioapics[apic].mpc_apicid); ++ ++ reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; ++ spin_lock_irqsave(&ioapic_lock, flags); ++ io_apic_write(apic, 0, reg_00.raw); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ /* ++ * Sanity check ++ */ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(apic, 0); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) ++ printk("could not set ID!\n"); ++ else ++ apic_printk(APIC_VERBOSE, " ok.\n"); ++ } ++} ++#else ++static void __init setup_ioapic_ids_from_mpc(void) { } ++#endif ++ ++static int no_timer_check __initdata; ++ ++static int __init notimercheck(char *s) ++{ ++ no_timer_check = 1; ++ return 1; ++} ++__setup("no_timer_check", notimercheck); ++ ++#ifndef CONFIG_XEN ++/* ++ * There is a nasty bug in some older SMP boards, their mptable lies ++ * about the timer IRQ. We do the following to work around the situation: ++ * ++ * - timer IRQ defaults to IO-APIC IRQ ++ * - if this function detects that timer IRQs are defunct, then we fall ++ * back to ISA timer IRQs ++ */ ++int __init timer_irq_works(void) ++{ ++ unsigned long t1 = jiffies; ++ ++ if (no_timer_check) ++ return 1; ++ ++ local_irq_enable(); ++ /* Let ten ticks pass... */ ++ mdelay((10 * 1000) / HZ); ++ ++ /* ++ * Expect a few ticks at least, to be sure some possible ++ * glue logic does not lock up after one or two first ++ * ticks in a non-ExtINT mode. Also the local APIC ++ * might have cached one ExtINT interrupt. Finally, at ++ * least one tick may be lost due to delays. ++ */ ++ if (jiffies - t1 > 4) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * In the SMP+IOAPIC case it might happen that there are an unspecified ++ * number of pending IRQ events unhandled. These cases are very rare, ++ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much ++ * better to do it this way as thus we do not have to be aware of ++ * 'pending' interrupts in the IRQ path, except at this point. ++ */ ++/* ++ * Edge triggered needs to resend any interrupt ++ * that was delayed but this is now handled in the device ++ * independent code. ++ */ ++ ++/* ++ * Startup quirk: ++ * ++ * Starting up a edge-triggered IO-APIC interrupt is ++ * nasty - we need to make sure that we get the edge. ++ * If it is already asserted for some reason, we need ++ * return 1 to indicate that is was pending. ++ * ++ * This is not complete - we should be able to fake ++ * an edge even if it isn't on the 8259A... ++ * ++ * (We do this for level-triggered IRQs too - it cannot hurt.) ++ */ ++static unsigned int startup_ioapic_irq(unsigned int irq) ++{ ++ int was_pending = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ if (irq < 16) { ++ disable_8259A_irq(irq); ++ if (i8259A_irq_pending(irq)) ++ was_pending = 1; ++ } ++ __unmask_IO_APIC_irq(irq); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return was_pending; ++} ++ ++static void ack_ioapic_irq(unsigned int irq) ++{ ++ move_native_irq(irq); ++ ack_APIC_irq(); ++} ++ ++static void ack_ioapic_quirk_irq(unsigned int irq) ++{ ++ unsigned long v; ++ int i; ++ ++ move_native_irq(irq); ++/* ++ * It appears there is an erratum which affects at least version 0x11 ++ * of I/O APIC (that's the 82093AA and cores integrated into various ++ * chipsets). Under certain conditions a level-triggered interrupt is ++ * erroneously delivered as edge-triggered one but the respective IRR ++ * bit gets set nevertheless. As a result the I/O unit expects an EOI ++ * message but it will never arrive and further interrupts are blocked ++ * from the source. The exact reason is so far unknown, but the ++ * phenomenon was observed when two consecutive interrupt requests ++ * from a given source get delivered to the same CPU and the source is ++ * temporarily disabled in between. ++ * ++ * A workaround is to simulate an EOI message manually. We achieve it ++ * by setting the trigger mode to edge and then to level when the edge ++ * trigger mode gets detected in the TMR of a local APIC for a ++ * level-triggered interrupt. We mask the source for the time of the ++ * operation to prevent an edge-triggered interrupt escaping meanwhile. ++ * The idea is from Manfred Spraul. --macro ++ */ ++ i = irq_vector[irq]; ++ ++ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); ++ ++ ack_APIC_irq(); ++ ++ if (!(v & (1 << (i & 0x1f)))) { ++ atomic_inc(&irq_mis_count); ++ spin_lock(&ioapic_lock); ++ __mask_and_edge_IO_APIC_irq(irq); ++ __unmask_and_level_IO_APIC_irq(irq); ++ spin_unlock(&ioapic_lock); ++ } ++} ++ ++static int ioapic_retrigger_irq(unsigned int irq) ++{ ++ send_IPI_self(irq_vector[irq]); ++ ++ return 1; ++} ++ ++static struct irq_chip ioapic_chip __read_mostly = { ++ .name = "IO-APIC", ++ .startup = startup_ioapic_irq, ++ .mask = mask_IO_APIC_irq, ++ .unmask = unmask_IO_APIC_irq, ++ .ack = ack_ioapic_irq, ++ .eoi = ack_ioapic_quirk_irq, ++#ifdef CONFIG_SMP ++ .set_affinity = set_ioapic_affinity_irq, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++#endif /* !CONFIG_XEN */ ++ ++static inline void init_IO_APIC_traps(void) ++{ ++ int irq; ++ ++ /* ++ * NOTE! The local APIC isn't very good at handling ++ * multiple interrupts at the same interrupt level. ++ * As the interrupt level is determined by taking the ++ * vector number and shifting that right by 4, we ++ * want to spread these out a bit so that they don't ++ * all fall in the same interrupt level. ++ * ++ * Also, we've got to be careful not to trash gate ++ * 0x80, because int 0x80 is hm, kind of importantish. ;) ++ */ ++ for (irq = 0; irq < NR_IRQS ; irq++) { ++ int tmp = irq; ++ if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) { ++ /* ++ * Hmm.. We don't have an entry for this, ++ * so default to an old-fashioned 8259 ++ * interrupt if we can.. ++ */ ++ if (irq < 16) ++ make_8259A_irq(irq); ++#ifndef CONFIG_XEN ++ else ++ /* Strange. Oh, well.. */ ++ irq_desc[irq].chip = &no_irq_chip; ++#endif ++ } ++ } ++} ++ ++#ifndef CONFIG_XEN ++/* ++ * The local APIC irq-chip implementation: ++ */ ++ ++static void ack_apic(unsigned int irq) ++{ ++ ack_APIC_irq(); ++} ++ ++static void mask_lapic_irq (unsigned int irq) ++{ ++ unsigned long v; ++ ++ v = apic_read(APIC_LVT0); ++ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); ++} ++ ++static void unmask_lapic_irq (unsigned int irq) ++{ ++ unsigned long v; ++ ++ v = apic_read(APIC_LVT0); ++ apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); ++} ++ ++static struct irq_chip lapic_chip __read_mostly = { ++ .name = "local-APIC-edge", ++ .mask = mask_lapic_irq, ++ .unmask = unmask_lapic_irq, ++ .eoi = ack_apic, ++}; ++ ++static void setup_nmi (void) ++{ ++ /* ++ * Dirty trick to enable the NMI watchdog ... ++ * We put the 8259A master into AEOI mode and ++ * unmask on all local APICs LVT0 as NMI. ++ * ++ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') ++ * is from Maciej W. Rozycki - so we do not have to EOI from ++ * the NMI handler or the timer interrupt. ++ */ ++ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); ++ ++ on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); ++ ++ apic_printk(APIC_VERBOSE, " done.\n"); ++} ++ ++/* ++ * This looks a bit hackish but it's about the only one way of sending ++ * a few INTA cycles to 8259As and any associated glue logic. ICR does ++ * not support the ExtINT mode, unfortunately. We need to send these ++ * cycles as some i82489DX-based boards have glue logic that keeps the ++ * 8259A interrupt line asserted until INTA. --macro ++ */ ++static inline void unlock_ExtINT_logic(void) ++{ ++ int apic, pin, i; ++ struct IO_APIC_route_entry entry0, entry1; ++ unsigned char save_control, save_freq_select; ++ ++ pin = find_isa_irq_pin(8, mp_INT); ++ if (pin == -1) { ++ WARN_ON_ONCE(1); ++ return; ++ } ++ apic = find_isa_irq_apic(8, mp_INT); ++ if (apic == -1) { ++ WARN_ON_ONCE(1); ++ return; ++ } ++ ++ entry0 = ioapic_read_entry(apic, pin); ++ clear_IO_APIC_pin(apic, pin); ++ ++ memset(&entry1, 0, sizeof(entry1)); ++ ++ entry1.dest_mode = 0; /* physical delivery */ ++ entry1.mask = 0; /* unmask IRQ now */ ++ entry1.dest.physical.physical_dest = hard_smp_processor_id(); ++ entry1.delivery_mode = dest_ExtINT; ++ entry1.polarity = entry0.polarity; ++ entry1.trigger = 0; ++ entry1.vector = 0; ++ ++ ioapic_write_entry(apic, pin, entry1); ++ ++ save_control = CMOS_READ(RTC_CONTROL); ++ save_freq_select = CMOS_READ(RTC_FREQ_SELECT); ++ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, ++ RTC_FREQ_SELECT); ++ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); ++ ++ i = 100; ++ while (i-- > 0) { ++ mdelay(10); ++ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) ++ i -= 10; ++ } ++ ++ CMOS_WRITE(save_control, RTC_CONTROL); ++ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); ++ clear_IO_APIC_pin(apic, pin); ++ ++ ioapic_write_entry(apic, pin, entry0); ++} ++#endif /* !CONFIG_XEN */ ++ ++int timer_uses_ioapic_pin_0; ++ ++#ifndef CONFIG_XEN ++/* ++ * This code may look a bit paranoid, but it's supposed to cooperate with ++ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ ++ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast ++ * fanatically on his truly buggy board. ++ */ ++static inline void __init check_timer(void) ++{ ++ int apic1, pin1, apic2, pin2; ++ int vector; ++ ++ /* ++ * get/set the timer IRQ vector: ++ */ ++ disable_8259A_irq(0); ++ vector = assign_irq_vector(0); ++ set_intr_gate(vector, interrupt[0]); ++ ++ /* ++ * Subtle, code in do_timer_interrupt() expects an AEOI ++ * mode for the 8259A whenever interrupts are routed ++ * through I/O APICs. Also IRQ0 has to be enabled in ++ * the 8259A which implies the virtual wire has to be ++ * disabled in the local APIC. ++ */ ++ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); ++ init_8259A(1); ++ timer_ack = 1; ++ if (timer_over_8254 > 0) ++ enable_8259A_irq(0); ++ ++ pin1 = find_isa_irq_pin(0, mp_INT); ++ apic1 = find_isa_irq_apic(0, mp_INT); ++ pin2 = ioapic_i8259.pin; ++ apic2 = ioapic_i8259.apic; ++ ++ if (pin1 == 0) ++ timer_uses_ioapic_pin_0 = 1; ++ ++ printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", ++ vector, apic1, pin1, apic2, pin2); ++ ++ if (pin1 != -1) { ++ /* ++ * Ok, does IRQ0 through the IOAPIC work? ++ */ ++ unmask_IO_APIC_irq(0); ++ if (timer_irq_works()) { ++ if (nmi_watchdog == NMI_IO_APIC) { ++ disable_8259A_irq(0); ++ setup_nmi(); ++ enable_8259A_irq(0); ++ } ++ if (disable_timer_pin_1 > 0) ++ clear_IO_APIC_pin(0, pin1); ++ return; ++ } ++ clear_IO_APIC_pin(apic1, pin1); ++ printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " ++ "IO-APIC\n"); ++ } ++ ++ printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); ++ if (pin2 != -1) { ++ printk("\n..... (found pin %d) ...", pin2); ++ /* ++ * legacy devices should be connected to IO APIC #0 ++ */ ++ setup_ExtINT_IRQ0_pin(apic2, pin2, vector); ++ if (timer_irq_works()) { ++ printk("works.\n"); ++ if (pin1 != -1) ++ replace_pin_at_irq(0, apic1, pin1, apic2, pin2); ++ else ++ add_pin_to_irq(0, apic2, pin2); ++ if (nmi_watchdog == NMI_IO_APIC) { ++ setup_nmi(); ++ } ++ return; ++ } ++ /* ++ * Cleanup, just in case ... ++ */ ++ clear_IO_APIC_pin(apic2, pin2); ++ } ++ printk(" failed.\n"); ++ ++ if (nmi_watchdog == NMI_IO_APIC) { ++ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); ++ nmi_watchdog = 0; ++ } ++ ++ printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); ++ ++ disable_8259A_irq(0); ++ set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, ++ "fasteio"); ++ apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ ++ enable_8259A_irq(0); ++ ++ if (timer_irq_works()) { ++ printk(" works.\n"); ++ return; ++ } ++ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); ++ printk(" failed.\n"); ++ ++ printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); ++ ++ timer_ack = 0; ++ init_8259A(0); ++ make_8259A_irq(0); ++ apic_write_around(APIC_LVT0, APIC_DM_EXTINT); ++ ++ unlock_ExtINT_logic(); ++ ++ if (timer_irq_works()) { ++ printk(" works.\n"); ++ return; ++ } ++ printk(" failed :(.\n"); ++ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " ++ "report. Then try booting with the 'noapic' option"); ++} ++#else ++#define check_timer() ((void)0) ++#endif /* CONFIG_XEN */ ++ ++/* ++ * ++ * IRQ's that are handled by the PIC in the MPS IOAPIC case. ++ * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. ++ * Linux doesn't really care, as it's not actually used ++ * for any interrupt handling anyway. ++ */ ++#define PIC_IRQS (1 << PIC_CASCADE_IR) ++ ++void __init setup_IO_APIC(void) ++{ ++ enable_IO_APIC(); ++ ++ if (acpi_ioapic) ++ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ ++ else ++ io_apic_irqs = ~PIC_IRQS; ++ ++ printk("ENABLING IO-APIC IRQs\n"); ++ ++ /* ++ * Set up IO-APIC IRQ routing. ++ */ ++ if (!acpi_ioapic) ++ setup_ioapic_ids_from_mpc(); ++#ifndef CONFIG_XEN ++ sync_Arb_IDs(); ++#endif ++ setup_IO_APIC_irqs(); ++ init_IO_APIC_traps(); ++ check_timer(); ++ if (!acpi_ioapic) ++ print_IO_APIC(); ++} ++ ++static int __init setup_disable_8254_timer(char *s) ++{ ++ timer_over_8254 = -1; ++ return 1; ++} ++static int __init setup_enable_8254_timer(char *s) ++{ ++ timer_over_8254 = 2; ++ return 1; ++} ++ ++__setup("disable_8254_timer", setup_disable_8254_timer); ++__setup("enable_8254_timer", setup_enable_8254_timer); ++ ++/* ++ * Called after all the initialization is done. If we didnt find any ++ * APIC bugs then we can allow the modify fast path ++ */ ++ ++static int __init io_apic_bug_finalize(void) ++{ ++ if(sis_apic_bug == -1) ++ sis_apic_bug = 0; ++ if (is_initial_xendomain()) { ++ struct xen_platform_op op = { .cmd = XENPF_platform_quirk }; ++ op.u.platform_quirk.quirk_id = sis_apic_bug ? ++ QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL; ++ HYPERVISOR_platform_op(&op); ++ } ++ return 0; ++} ++ ++late_initcall(io_apic_bug_finalize); ++ ++struct sysfs_ioapic_data { ++ struct sys_device dev; ++ struct IO_APIC_route_entry entry[0]; ++}; ++static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; ++ ++static int ioapic_suspend(struct sys_device *dev, pm_message_t state) ++{ ++ struct IO_APIC_route_entry *entry; ++ struct sysfs_ioapic_data *data; ++ int i; ++ ++ data = container_of(dev, struct sysfs_ioapic_data, dev); ++ entry = data->entry; ++ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) ++ entry[i] = ioapic_read_entry(dev->id, i); ++ ++ return 0; ++} ++ ++static int ioapic_resume(struct sys_device *dev) ++{ ++ struct IO_APIC_route_entry *entry; ++ struct sysfs_ioapic_data *data; ++ unsigned long flags; ++ union IO_APIC_reg_00 reg_00; ++ int i; ++ ++ data = container_of(dev, struct sysfs_ioapic_data, dev); ++ entry = data->entry; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(dev->id, 0); ++ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { ++ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; ++ io_apic_write(dev->id, 0, reg_00.raw); ++ } ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) ++ ioapic_write_entry(dev->id, i, entry[i]); ++ ++ return 0; ++} ++ ++static struct sysdev_class ioapic_sysdev_class = { ++ set_kset_name("ioapic"), ++ .suspend = ioapic_suspend, ++ .resume = ioapic_resume, ++}; ++ ++static int __init ioapic_init_sysfs(void) ++{ ++ struct sys_device * dev; ++ int i, size, error = 0; ++ ++ error = sysdev_class_register(&ioapic_sysdev_class); ++ if (error) ++ return error; ++ ++ for (i = 0; i < nr_ioapics; i++ ) { ++ size = sizeof(struct sys_device) + nr_ioapic_registers[i] ++ * sizeof(struct IO_APIC_route_entry); ++ mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); ++ if (!mp_ioapic_data[i]) { ++ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); ++ continue; ++ } ++ memset(mp_ioapic_data[i], 0, size); ++ dev = &mp_ioapic_data[i]->dev; ++ dev->id = i; ++ dev->cls = &ioapic_sysdev_class; ++ error = sysdev_register(dev); ++ if (error) { ++ kfree(mp_ioapic_data[i]); ++ mp_ioapic_data[i] = NULL; ++ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); ++ continue; ++ } ++ } ++ ++ return 0; ++} ++ ++device_initcall(ioapic_init_sysfs); ++ ++/* ++ * Dynamic irq allocate and deallocation ++ */ ++int create_irq(void) ++{ ++ /* Allocate an unused irq */ ++ int irq, new, vector = 0; ++ unsigned long flags; ++ ++ irq = -ENOSPC; ++ spin_lock_irqsave(&vector_lock, flags); ++ for (new = (NR_IRQS - 1); new >= 0; new--) { ++ if (platform_legacy_irq(new)) ++ continue; ++ if (irq_vector[new] != 0) ++ continue; ++ vector = __assign_irq_vector(new); ++ if (likely(vector > 0)) ++ irq = new; ++ break; ++ } ++ spin_unlock_irqrestore(&vector_lock, flags); ++ ++ if (irq >= 0) { ++#ifndef CONFIG_XEN ++ set_intr_gate(vector, interrupt[irq]); ++#endif ++ dynamic_irq_init(irq); ++ } ++ return irq; ++} ++ ++void destroy_irq(unsigned int irq) ++{ ++ unsigned long flags; ++ ++ dynamic_irq_cleanup(irq); ++ ++ spin_lock_irqsave(&vector_lock, flags); ++ irq_vector[irq] = 0; ++ spin_unlock_irqrestore(&vector_lock, flags); ++} ++ ++/* ++ * MSI mesage composition ++ */ ++#ifdef CONFIG_PCI_MSI ++static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) ++{ ++ int vector; ++ unsigned dest; ++ ++ vector = assign_irq_vector(irq); ++ if (vector >= 0) { ++ dest = cpu_mask_to_apicid(TARGET_CPUS); ++ ++ msg->address_hi = MSI_ADDR_BASE_HI; ++ msg->address_lo = ++ MSI_ADDR_BASE_LO | ++ ((INT_DEST_MODE == 0) ? ++ MSI_ADDR_DEST_MODE_PHYSICAL: ++ MSI_ADDR_DEST_MODE_LOGICAL) | ++ ((INT_DELIVERY_MODE != dest_LowestPrio) ? ++ MSI_ADDR_REDIRECTION_CPU: ++ MSI_ADDR_REDIRECTION_LOWPRI) | ++ MSI_ADDR_DEST_ID(dest); ++ ++ msg->data = ++ MSI_DATA_TRIGGER_EDGE | ++ MSI_DATA_LEVEL_ASSERT | ++ ((INT_DELIVERY_MODE != dest_LowestPrio) ? ++ MSI_DATA_DELIVERY_FIXED: ++ MSI_DATA_DELIVERY_LOWPRI) | ++ MSI_DATA_VECTOR(vector); ++ } ++ return vector; ++} ++ ++#ifdef CONFIG_SMP ++static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) ++{ ++ struct msi_msg msg; ++ unsigned int dest; ++ cpumask_t tmp; ++ int vector; ++ ++ cpus_and(tmp, mask, cpu_online_map); ++ if (cpus_empty(tmp)) ++ tmp = TARGET_CPUS; ++ ++ vector = assign_irq_vector(irq); ++ if (vector < 0) ++ return; ++ ++ dest = cpu_mask_to_apicid(mask); ++ ++ read_msi_msg(irq, &msg); ++ ++ msg.data &= ~MSI_DATA_VECTOR_MASK; ++ msg.data |= MSI_DATA_VECTOR(vector); ++ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; ++ msg.address_lo |= MSI_ADDR_DEST_ID(dest); ++ ++ write_msi_msg(irq, &msg); ++ set_native_irq_info(irq, mask); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, ++ * which implement the MSI or MSI-X Capability Structure. ++ */ ++static struct irq_chip msi_chip = { ++ .name = "PCI-MSI", ++ .unmask = unmask_msi_irq, ++ .mask = mask_msi_irq, ++ .ack = ack_ioapic_irq, ++#ifdef CONFIG_SMP ++ .set_affinity = set_msi_irq_affinity, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev) ++{ ++ struct msi_msg msg; ++ int ret; ++ ret = msi_compose_msg(dev, irq, &msg); ++ if (ret < 0) ++ return ret; ++ ++ write_msi_msg(irq, &msg); ++ ++ set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, ++ "edge"); ++ ++ return 0; ++} ++ ++void arch_teardown_msi_irq(unsigned int irq) ++{ ++ return; ++} ++ ++#endif /* CONFIG_PCI_MSI */ ++ ++/* ++ * Hypertransport interrupt support ++ */ ++#ifdef CONFIG_HT_IRQ ++ ++#ifdef CONFIG_SMP ++ ++static void target_ht_irq(unsigned int irq, unsigned int dest) ++{ ++ struct ht_irq_msg msg; ++ fetch_ht_irq_msg(irq, &msg); ++ ++ msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK); ++ msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); ++ ++ msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest); ++ msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); ++ ++ write_ht_irq_msg(irq, &msg); ++} ++ ++static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) ++{ ++ unsigned int dest; ++ cpumask_t tmp; ++ ++ cpus_and(tmp, mask, cpu_online_map); ++ if (cpus_empty(tmp)) ++ tmp = TARGET_CPUS; ++ ++ cpus_and(mask, tmp, CPU_MASK_ALL); ++ ++ dest = cpu_mask_to_apicid(mask); ++ ++ target_ht_irq(irq, dest); ++ set_native_irq_info(irq, mask); ++} ++#endif ++ ++static struct irq_chip ht_irq_chip = { ++ .name = "PCI-HT", ++ .mask = mask_ht_irq, ++ .unmask = unmask_ht_irq, ++ .ack = ack_ioapic_irq, ++#ifdef CONFIG_SMP ++ .set_affinity = set_ht_irq_affinity, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) ++{ ++ int vector; ++ ++ vector = assign_irq_vector(irq); ++ if (vector >= 0) { ++ struct ht_irq_msg msg; ++ unsigned dest; ++ cpumask_t tmp; ++ ++ cpus_clear(tmp); ++ cpu_set(vector >> 8, tmp); ++ dest = cpu_mask_to_apicid(tmp); ++ ++ msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); ++ ++ msg.address_lo = ++ HT_IRQ_LOW_BASE | ++ HT_IRQ_LOW_DEST_ID(dest) | ++ HT_IRQ_LOW_VECTOR(vector) | ++ ((INT_DEST_MODE == 0) ? ++ HT_IRQ_LOW_DM_PHYSICAL : ++ HT_IRQ_LOW_DM_LOGICAL) | ++ HT_IRQ_LOW_RQEOI_EDGE | ++ ((INT_DELIVERY_MODE != dest_LowestPrio) ? ++ HT_IRQ_LOW_MT_FIXED : ++ HT_IRQ_LOW_MT_ARBITRATED) | ++ HT_IRQ_LOW_IRQ_MASKED; ++ ++ write_ht_irq_msg(irq, &msg); ++ ++ set_irq_chip_and_handler_name(irq, &ht_irq_chip, ++ handle_edge_irq, "edge"); ++ } ++ return vector; ++} ++#endif /* CONFIG_HT_IRQ */ ++ ++/* -------------------------------------------------------------------------- ++ ACPI-based IOAPIC Configuration ++ -------------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_ACPI ++ ++int __init io_apic_get_unique_id (int ioapic, int apic_id) ++{ ++#ifndef CONFIG_XEN ++ union IO_APIC_reg_00 reg_00; ++ static physid_mask_t apic_id_map = PHYSID_MASK_NONE; ++ physid_mask_t tmp; ++ unsigned long flags; ++ int i = 0; ++ ++ /* ++ * The P4 platform supports up to 256 APIC IDs on two separate APIC ++ * buses (one for LAPICs, one for IOAPICs), where predecessors only ++ * supports up to 16 on one shared APIC bus. ++ * ++ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full ++ * advantage of new APIC bus architecture. ++ */ ++ ++ if (physids_empty(apic_id_map)) ++ apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(ioapic, 0); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ if (apic_id >= get_physical_broadcast()) { ++ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " ++ "%d\n", ioapic, apic_id, reg_00.bits.ID); ++ apic_id = reg_00.bits.ID; ++ } ++ ++ /* ++ * Every APIC in a system must have a unique ID or we get lots of nice ++ * 'stuck on smp_invalidate_needed IPI wait' messages. ++ */ ++ if (check_apicid_used(apic_id_map, apic_id)) { ++ ++ for (i = 0; i < get_physical_broadcast(); i++) { ++ if (!check_apicid_used(apic_id_map, i)) ++ break; ++ } ++ ++ if (i == get_physical_broadcast()) ++ panic("Max apic_id exceeded!\n"); ++ ++ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " ++ "trying %d\n", ioapic, apic_id, i); ++ ++ apic_id = i; ++ } ++ ++ tmp = apicid_to_cpu_present(apic_id); ++ physids_or(apic_id_map, apic_id_map, tmp); ++ ++ if (reg_00.bits.ID != apic_id) { ++ reg_00.bits.ID = apic_id; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ io_apic_write(ioapic, 0, reg_00.raw); ++ reg_00.raw = io_apic_read(ioapic, 0); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ /* Sanity check */ ++ if (reg_00.bits.ID != apic_id) { ++ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); ++ return -1; ++ } ++ } ++ ++ apic_printk(APIC_VERBOSE, KERN_INFO ++ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); ++#endif /* !CONFIG_XEN */ ++ ++ return apic_id; ++} ++ ++ ++int __init io_apic_get_version (int ioapic) ++{ ++ union IO_APIC_reg_01 reg_01; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_01.raw = io_apic_read(ioapic, 1); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return reg_01.bits.version; ++} ++ ++ ++int __init io_apic_get_redir_entries (int ioapic) ++{ ++ union IO_APIC_reg_01 reg_01; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_01.raw = io_apic_read(ioapic, 1); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return reg_01.bits.entries; ++} ++ ++ ++int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) ++{ ++ struct IO_APIC_route_entry entry; ++ unsigned long flags; ++ ++ if (!IO_APIC_IRQ(irq)) { ++ printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ++ ioapic); ++ return -EINVAL; ++ } ++ ++ /* ++ * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. ++ * Note that we mask (disable) IRQs now -- these get enabled when the ++ * corresponding device driver registers for this IRQ. ++ */ ++ ++ memset(&entry,0,sizeof(entry)); ++ ++ entry.delivery_mode = INT_DELIVERY_MODE; ++ entry.dest_mode = INT_DEST_MODE; ++ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); ++ entry.trigger = edge_level; ++ entry.polarity = active_high_low; ++ entry.mask = 1; ++ ++ /* ++ * IRQs < 16 are already in the irq_2_pin[] map ++ */ ++ if (irq >= 16) ++ add_pin_to_irq(irq, ioapic, pin); ++ ++ entry.vector = assign_irq_vector(irq); ++ ++ apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " ++ "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, ++ mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, ++ edge_level, active_high_low); ++ ++ ioapic_register_intr(irq, entry.vector, edge_level); ++ ++ if (!ioapic && (irq < 16)) ++ disable_8259A_irq(irq); ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __ioapic_write_entry(ioapic, pin, entry); ++ set_native_irq_info(irq, TARGET_CPUS); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return 0; ++} ++ ++#endif /* CONFIG_ACPI */ ++ ++static int __init parse_disable_timer_pin_1(char *arg) ++{ ++ disable_timer_pin_1 = 1; ++ return 0; ++} ++early_param("disable_timer_pin_1", parse_disable_timer_pin_1); ++ ++static int __init parse_enable_timer_pin_1(char *arg) ++{ ++ disable_timer_pin_1 = -1; ++ return 0; ++} ++early_param("enable_timer_pin_1", parse_enable_timer_pin_1); ++ ++static int __init parse_noapic(char *arg) ++{ ++ /* disable IO-APIC */ ++ disable_ioapic_setup(); ++ return 0; ++} ++early_param("noapic", parse_noapic); +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/ioport-xen.c +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/arch/i386/kernel/ioport-xen.c Wed Aug 08 16:25:28 2007 -0300 +@@ -0,0 +1,122 @@ ++/* ++ * linux/arch/i386/kernel/ioport.c ++ * ++ * This contains the io-permission bitmap code - written by obz, with changes ++ * by Linus. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ ++static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) ++{ ++ unsigned long mask; ++ unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); ++ unsigned int low_index = base & (BITS_PER_LONG-1); ++ int length = low_index + extent; ++ ++ if (low_index != 0) { ++ mask = (~0UL << low_index); ++ if (length < BITS_PER_LONG) ++ mask &= ~(~0UL << length); ++ if (new_value) ++ *bitmap_base++ |= mask; ++ else ++ *bitmap_base++ &= ~mask; ++ length -= BITS_PER_LONG; ++ } ++ ++ mask = (new_value ? ~0UL : 0UL); ++ while (length >= BITS_PER_LONG) { ++ *bitmap_base++ = mask; ++ length -= BITS_PER_LONG; ++ } ++ ++ if (length > 0) { ++ mask = ~(~0UL << length); ++ if (new_value) ++ *bitmap_base++ |= mask; ++ else ++ *bitmap_base++ &= ~mask; ++ } ++} ++ ++ ++/* ++ * this changes the io permissions bitmap in the current task. ++ */ ++asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) ++{ ++ struct thread_struct * t = ¤t->thread; ++ unsigned long *bitmap; ++ struct physdev_set_iobitmap set_iobitmap; ++ ++ if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) ++ return -EINVAL; ++ if (turn_on && !capable(CAP_SYS_RAWIO)) ++ return -EPERM; ++ ++ /* ++ * If it's the first ioperm() call in this thread's lifetime, set the ++ * IO bitmap up. ioperm() is much less timing critical than clone(), ++ * this is why we delay this operation until now: ++ */ ++ if (!t->io_bitmap_ptr) { ++ bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); ++ if (!bitmap) ++ return -ENOMEM; ++ ++ memset(bitmap, 0xff, IO_BITMAP_BYTES); ++ t->io_bitmap_ptr = bitmap; ++ set_thread_flag(TIF_IO_BITMAP); ++ ++ set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); ++ set_iobitmap.nr_ports = IO_BITMAP_BITS; ++ HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap); ++ } ++ ++ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); ++ ++ return 0; ++} ++ ++/* ++ * sys_iopl has to be used when you want to access the IO ports ++ * beyond the 0x3ff range: to get the full 65536 ports bitmapped ++ * you'd need 8kB of bitmaps/process, which is a bit excessive. ++ * ++ * Here we just change the eflags value on the stack: we allow ++ * only the super-user to do it. This depends on the stack-layout ++ * on system-call entry - see also fork() and the signal handling ++ * code. ++ */ ++ ++asmlinkage long sys_iopl(unsigned long unused) ++{ ++ volatile struct pt_regs * regs = (struct pt_regs *) &unused; ++ unsigned int level = regs->ebx; ++ struct thread_struct *t = ¤t->thread; ++ unsigned int old = (t->iopl >> 12) & 3; ++ ++ if (level > 3) ++ return -EINVAL; ++ /* Trying to gain more privileges? */ ++ if (level > old) { ++ if (!capable(CAP_SYS_RAWIO)) ++ return -EPERM; ++ } ++ t->iopl = level << 12; ++ set_iopl_mask(t->iopl); ++ return 0; ++} +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/irq.c +--- a/arch/i386/kernel/irq.c Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/irq.c Wed Aug 08 16:25:28 2007 -0300 +@@ -288,7 +288,9 @@ skip: + } + + #ifdef CONFIG_HOTPLUG_CPU ++#ifndef CONFIG_XEN + #include ++#endif + + void fixup_irqs(cpumask_t map) + { +@@ -302,7 +304,9 @@ void fixup_irqs(cpumask_t map) + + cpus_and(mask, irq_desc[irq].affinity, map); + if (any_online_cpu(mask) == NR_CPUS) { ++#ifndef CONFIG_XEN + printk("Breaking affinity for irq %i\n", irq); ++#endif + mask = map; + } + if (irq_desc[irq].chip->set_affinity) +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/ldt-xen.c +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/arch/i386/kernel/ldt-xen.c Wed Aug 08 16:25:28 2007 -0300 +@@ -0,0 +1,268 @@ ++/* ++ * linux/arch/i386/kernel/ldt.c ++ * ++ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds ++ * Copyright (C) 1999 Ingo Molnar ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ ++static void flush_ldt(void *null) ++{ ++ if (current->active_mm) ++ load_LDT(¤t->active_mm->context); ++} ++#endif ++ ++static int alloc_ldt(mm_context_t *pc, int mincount, int reload) ++{ ++ void *oldldt; ++ void *newldt; ++ int oldsize; ++ ++ if (mincount <= pc->size) ++ return 0; ++ oldsize = pc->size; ++ mincount = (mincount+511)&(~511); ++ if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) ++ newldt = vmalloc(mincount*LDT_ENTRY_SIZE); ++ else ++ newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); ++ ++ if (!newldt) ++ return -ENOMEM; ++ ++ if (oldsize) ++ memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); ++ oldldt = pc->ldt; ++ memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); ++ pc->ldt = newldt; ++ wmb(); ++ pc->size = mincount; ++ wmb(); ++ ++ if (reload) { ++#ifdef CONFIG_SMP ++ cpumask_t mask; ++ preempt_disable(); ++#endif ++ make_pages_readonly( ++ pc->ldt, ++ (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, ++ XENFEAT_writable_descriptor_tables); ++ load_LDT(pc); ++#ifdef CONFIG_SMP ++ mask = cpumask_of_cpu(smp_processor_id()); ++ if (!cpus_equal(current->mm->cpu_vm_mask, mask)) ++ smp_call_function(flush_ldt, NULL, 1, 1); ++ preempt_enable(); ++#endif ++ } ++ if (oldsize) { ++ make_pages_writable( ++ oldldt, ++ (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, ++ XENFEAT_writable_descriptor_tables); ++ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) ++ vfree(oldldt); ++ else ++ kfree(oldldt); ++ } ++ return 0; ++} ++ ++static inline int copy_ldt(mm_context_t *new, mm_context_t *old) ++{ ++ int err = alloc_ldt(new, old->size, 0); ++ if (err < 0) ++ return err; ++ memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); ++ make_pages_readonly( ++ new->ldt, ++ (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, ++ XENFEAT_writable_descriptor_tables); ++ return 0; ++} ++ ++/* ++ * we do not have to muck with descriptors here, that is ++ * done in switch_mm() as needed. ++ */ ++int init_new_context(struct task_struct *tsk, struct mm_struct *mm) ++{ ++ struct mm_struct * old_mm; ++ int retval = 0; ++ ++ init_MUTEX(&mm->context.sem); ++ mm->context.size = 0; ++ mm->context.has_foreign_mappings = 0; ++ old_mm = current->mm; ++ if (old_mm && old_mm->context.size > 0) { ++ down(&old_mm->context.sem); ++ retval = copy_ldt(&mm->context, &old_mm->context); ++ up(&old_mm->context.sem); ++ } ++ return retval; ++} ++ ++/* ++ * No need to lock the MM as we are the last user ++ */ ++void destroy_context(struct mm_struct *mm) ++{ ++ if (mm->context.size) { ++ if (mm == current->active_mm) ++ clear_LDT(); ++ make_pages_writable( ++ mm->context.ldt, ++ (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, ++ XENFEAT_writable_descriptor_tables); ++ if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) ++ vfree(mm->context.ldt); ++ else ++ kfree(mm->context.ldt); ++ mm->context.size = 0; ++ } ++} ++ ++static int read_ldt(void __user * ptr, unsigned long bytecount) ++{ ++ int err; ++ unsigned long size; ++ struct mm_struct * mm = current->mm; ++ ++ if (!mm->context.size) ++ return 0; ++ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) ++ bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; ++ ++ down(&mm->context.sem); ++ size = mm->context.size*LDT_ENTRY_SIZE; ++ if (size > bytecount) ++ size = bytecount; ++ ++ err = 0; ++ if (copy_to_user(ptr, mm->context.ldt, size)) ++ err = -EFAULT; ++ up(&mm->context.sem); ++ if (err < 0) ++ goto error_return; ++ if (size != bytecount) { ++ /* zero-fill the rest */ ++ if (clear_user(ptr+size, bytecount-size) != 0) { ++ err = -EFAULT; ++ goto error_return; ++ } ++ } ++ return bytecount; ++error_return: ++ return err; ++} ++ ++static int read_default_ldt(void __user * ptr, unsigned long bytecount) ++{ ++ int err; ++ unsigned long size; ++ ++ err = 0; ++ size = 5*sizeof(struct desc_struct); ++ if (size > bytecount) ++ size = bytecount; ++ ++ err = size; ++ if (clear_user(ptr, size)) ++ err = -EFAULT; ++ ++ return err; ++} ++ ++static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) ++{ ++ struct mm_struct * mm = current->mm; ++ __u32 entry_1, entry_2; ++ int error; ++ struct user_desc ldt_info; ++ ++ error = -EINVAL; ++ if (bytecount != sizeof(ldt_info)) ++ goto out; ++ error = -EFAULT; ++ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) ++ goto out; ++ ++ error = -EINVAL; ++ if (ldt_info.entry_number >= LDT_ENTRIES) ++ goto out; ++ if (ldt_info.contents == 3) { ++ if (oldmode) ++ goto out; ++ if (ldt_info.seg_not_present == 0) ++ goto out; ++ } ++ ++ down(&mm->context.sem); ++ if (ldt_info.entry_number >= mm->context.size) { ++ error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); ++ if (error < 0) ++ goto out_unlock; ++ } ++ ++ /* Allow LDTs to be cleared by the user. */ ++ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { ++ if (oldmode || LDT_empty(&ldt_info)) { ++ entry_1 = 0; ++ entry_2 = 0; ++ goto install; ++ } ++ } ++ ++ entry_1 = LDT_entry_a(&ldt_info); ++ entry_2 = LDT_entry_b(&ldt_info); ++ if (oldmode) ++ entry_2 &= ~(1 << 20); ++ ++ /* Install the new entry ... */ ++install: ++ error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, ++ entry_1, entry_2); ++ ++out_unlock: ++ up(&mm->context.sem); ++out: ++ return error; ++} ++ ++asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) ++{ ++ int ret = -ENOSYS; ++ ++ switch (func) { ++ case 0: ++ ret = read_ldt(ptr, bytecount); ++ break; ++ case 1: ++ ret = write_ldt(ptr, bytecount, 1); ++ break; ++ case 2: ++ ret = read_default_ldt(ptr, bytecount); ++ break; ++ case 0x11: ++ ret = write_ldt(ptr, bytecount, 0); ++ break; ++ } ++ return ret; ++} +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/machine_kexec.c +--- a/arch/i386/kernel/machine_kexec.c Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/machine_kexec.c Wed Aug 08 16:25:28 2007 -0300 +@@ -20,6 +20,10 @@ + #include + #include + ++#ifdef CONFIG_XEN ++#include ++#endif ++ + #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) + static u32 kexec_pgd[1024] PAGE_ALIGNED; + #ifdef CONFIG_X86_PAE +@@ -28,6 +32,40 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED + #endif + static u32 kexec_pte0[1024] PAGE_ALIGNED; + static u32 kexec_pte1[1024] PAGE_ALIGNED; ++ ++#ifdef CONFIG_XEN ++ ++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) ++ ++#if PAGES_NR > KEXEC_XEN_NO_PAGES ++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break ++#endif ++ ++#if PA_CONTROL_PAGE != 0 ++#error PA_CONTROL_PAGE is non zero - Xen support will break ++#endif ++ ++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) ++{ ++ void *control_page; ++ ++ memset(xki->page_list, 0, sizeof(xki->page_list)); ++ ++ control_page = page_address(image->control_code_page); ++ memcpy(control_page, relocate_kernel, PAGE_SIZE); ++ ++ xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); ++ xki->page_list[PA_PGD] = __ma(kexec_pgd); ++#ifdef CONFIG_X86_PAE ++ xki->page_list[PA_PMD_0] = __ma(kexec_pmd0); ++ xki->page_list[PA_PMD_1] = __ma(kexec_pmd1); ++#endif ++ xki->page_list[PA_PTE_0] = __ma(kexec_pte0); ++ xki->page_list[PA_PTE_1] = __ma(kexec_pte1); ++ ++} ++ ++#else /* CONFIG_XEN */ + + static void set_idt(void *newidt, __u16 limit) + { +@@ -70,6 +108,7 @@ static void load_segments(void) + #undef STR + #undef __STR + } ++#endif /* !CONFIG_XEN */ + + /* + * A architecture hook called to validate the +@@ -97,6 +136,7 @@ void machine_kexec_cleanup(struct kimage + { + } + ++#ifndef CONFIG_XEN + /* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. +@@ -147,6 +187,7 @@ NORET_TYPE void machine_kexec(struct kim + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start, cpu_has_pae); + } ++#endif + + /* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/microcode-xen.c +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/arch/i386/kernel/microcode-xen.c Wed Aug 08 16:25:28 2007 -0300 +@@ -0,0 +1,144 @@ ++/* ++ * Intel CPU Microcode Update Driver for Linux ++ * ++ * Copyright (C) 2000-2004 Tigran Aivazian ++ * ++ * This driver allows to upgrade microcode on Intel processors ++ * belonging to IA-32 family - PentiumPro, Pentium II, ++ * Pentium III, Xeon, Pentium 4, etc. ++ * ++ * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, ++ * Order Number 245472 or free download from: ++ * ++ * http://developer.intel.com/design/pentium4/manuals/245472.htm ++ * ++ * For more information, go to http://www.urbanmyth.org/microcode ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++ ++//#define DEBUG /* pr_debug */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); ++MODULE_AUTHOR("Tigran Aivazian "); ++MODULE_LICENSE("GPL"); ++ ++static int verbose; ++module_param(verbose, int, 0644); ++ ++#define MICROCODE_VERSION "1.14a-xen" ++ ++#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ ++#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ ++#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ ++ ++/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ ++static DEFINE_MUTEX(microcode_mutex); ++ ++static int microcode_open (struct inode *unused1, struct file *unused2) ++{ ++ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; ++} ++ ++ ++static int do_microcode_update (const void __user *ubuf, size_t len) ++{ ++ int err; ++ void *kbuf; ++ ++ kbuf = vmalloc(len); ++ if (!kbuf) ++ return -ENOMEM; ++ ++ if (copy_from_user(kbuf, ubuf, len) == 0) { ++ struct xen_platform_op op; ++ ++ op.cmd = XENPF_microcode_update; ++ set_xen_guest_handle(op.u.microcode.data, kbuf); ++ op.u.microcode.length = len; ++ err = HYPERVISOR_platform_op(&op); ++ } else ++ err = -EFAULT; ++ ++ vfree(kbuf); ++ ++ return err; ++} ++ ++static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) ++{ ++ ssize_t ret; ++ ++ if (len < MC_HEADER_SIZE) { ++ printk(KERN_ERR "microcode: not enough data\n"); ++ return -EINVAL; ++ } ++ ++ mutex_lock(µcode_mutex); ++ ++ ret = do_microcode_update(buf, len); ++ if (!ret) ++ ret = (ssize_t)len; ++ ++ mutex_unlock(µcode_mutex); ++ ++ return ret; ++} ++ ++static struct file_operations microcode_fops = { ++ .owner = THIS_MODULE, ++ .write = microcode_write, ++ .open = microcode_open, ++}; ++ ++static struct miscdevice microcode_dev = { ++ .minor = MICROCODE_MINOR, ++ .name = "microcode", ++ .fops = µcode_fops, ++}; ++ ++static int __init microcode_init (void) ++{ ++ int error; ++ ++ error = misc_register(µcode_dev); ++ if (error) { ++ printk(KERN_ERR ++ "microcode: can't misc_register on minor=%d\n", ++ MICROCODE_MINOR); ++ return error; ++ } ++ ++ printk(KERN_INFO ++ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); ++ return 0; ++} ++ ++static void __exit microcode_exit (void) ++{ ++ misc_deregister(µcode_dev); ++} ++ ++module_init(microcode_init) ++module_exit(microcode_exit) ++MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/mpparse.c +--- a/arch/i386/kernel/mpparse.c Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/mpparse.c Wed Aug 08 16:25:28 2007 -0300 +@@ -106,6 +106,7 @@ static struct mpc_config_translation *tr + + static void __cpuinit MP_processor_info (struct mpc_config_processor *m) + { ++#ifndef CONFIG_XEN + int ver, apicid; + physid_mask_t phys_cpu; + +@@ -196,8 +197,9 @@ static void __cpuinit MP_processor_info + } + + cpu_set(num_processors, cpu_possible_map); ++#endif /* CONFIG_XEN */ + num_processors++; +- ++#ifndef CONFIG_XEN + /* + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y + * but we need to work other dependencies like SMP_SUSPEND etc +@@ -218,6 +220,7 @@ static void __cpuinit MP_processor_info + } + } + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; ++#endif /* CONFIG_XEN */ + } + + static void __init MP_bus_info (struct mpc_config_bus *m) +@@ -684,7 +687,11 @@ void __init get_smp_config (void) + * Read the physical hardware table. Anything here will + * override the defaults. + */ ++#ifdef CONFIG_XEN ++ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) { ++#else + if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) { ++#endif + smp_found_config = 0; + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); +@@ -719,7 +726,11 @@ void __init get_smp_config (void) + + static int __init smp_scan_config (unsigned long base, unsigned long length) + { ++#ifdef CONFIG_XEN ++ unsigned long *bp = isa_bus_to_virt(base); ++#else + unsigned long *bp = phys_to_virt(base); ++#endif + struct intel_mp_floating *mpf; + + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); +@@ -735,6 +746,7 @@ static int __init smp_scan_config (unsig + || (mpf->mpf_specification == 4)) ) { + + smp_found_config = 1; ++#ifndef CONFIG_XEN + printk(KERN_INFO "found SMP MP-table at %08lx\n", + virt_to_phys(mpf)); + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); +@@ -754,6 +766,10 @@ static int __init smp_scan_config (unsig + size = end - mpf->mpf_physptr; + reserve_bootmem(mpf->mpf_physptr, size); + } ++#else ++ printk(KERN_INFO "found SMP MP-table at %08lx\n", ++ ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base); ++#endif + + mpf_found = mpf; + return 1; +@@ -766,7 +782,9 @@ static int __init smp_scan_config (unsig + + void __init find_smp_config (void) + { ++#ifndef CONFIG_XEN + unsigned int address; ++#endif + + /* + * FIXME: Linux assumes you have 640K of base ram.. +@@ -797,9 +815,11 @@ void __init find_smp_config (void) + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. + */ + ++#ifndef CONFIG_XEN + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400); ++#endif + } + + int es7000_plat; +@@ -812,6 +832,7 @@ int es7000_plat; + + void __init mp_register_lapic_address(u64 address) + { ++#ifndef CONFIG_XEN + mp_lapic_addr = (unsigned long) address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); +@@ -820,6 +841,7 @@ void __init mp_register_lapic_address(u6 + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); + + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); ++#endif + } + + void __cpuinit mp_register_lapic (u8 id, u8 enabled) +@@ -836,6 +858,7 @@ void __cpuinit mp_register_lapic (u8 id, + if (id == boot_cpu_physical_apicid) + boot_cpu = 1; + ++#ifndef CONFIG_XEN + processor.mpc_type = MP_PROCESSOR; + processor.mpc_apicid = id; + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); +@@ -846,6 +869,7 @@ void __cpuinit mp_register_lapic (u8 id, + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; ++#endif + + MP_processor_info(&processor); + } +@@ -900,7 +924,9 @@ void __init mp_register_ioapic(u8 id, u3 + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mpc_apicaddr = address; + ++#ifndef CONFIG_XEN + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); ++#endif + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + tmpid = io_apic_get_unique_id(idx, id); +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/pci-dma-xen.c +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/arch/i386/kernel/pci-dma-xen.c Wed Aug 08 16:25:28 2007 -0300 +@@ -0,0 +1,378 @@ ++/* ++ * Dynamic DMA mapping support. ++ * ++ * On i386 there is no hardware dynamic DMA address translation, ++ * so consistent alloc/free are merely page allocation/freeing. ++ * The rest of the dynamic DMA mapping interface is implemented ++ * in asm/pci.h. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __x86_64__ ++#include ++ ++int iommu_merge __read_mostly = 0; ++EXPORT_SYMBOL(iommu_merge); ++ ++dma_addr_t bad_dma_address __read_mostly; ++EXPORT_SYMBOL(bad_dma_address); ++ ++/* This tells the BIO block layer to assume merging. Default to off ++ because we cannot guarantee merging later. */ ++int iommu_bio_merge __read_mostly = 0; ++EXPORT_SYMBOL(iommu_bio_merge); ++ ++int iommu_sac_force __read_mostly = 0; ++EXPORT_SYMBOL(iommu_sac_force); ++ ++int no_iommu __read_mostly; ++#ifdef CONFIG_IOMMU_DEBUG ++int panic_on_overflow __read_mostly = 1; ++int force_iommu __read_mostly = 1; ++#else ++int panic_on_overflow __read_mostly = 0; ++int force_iommu __read_mostly= 0; ++#endif ++ ++/* Set this to 1 if there is a HW IOMMU in the system */ ++int iommu_detected __read_mostly = 0; ++ ++void __init pci_iommu_alloc(void) ++{ ++ /* ++ * The order of these functions is important for ++ * fall-back/fail-over reasons ++ */ ++#ifdef CONFIG_IOMMU ++ iommu_hole_init(); ++#endif ++ ++#ifdef CONFIG_CALGARY_IOMMU ++#include ++ /* shut up compiler */ ++ use_calgary = use_calgary; ++ detect_calgary(); ++#endif ++ ++#ifdef CONFIG_SWIOTLB ++ pci_swiotlb_init(); ++#endif ++} ++ ++#endif ++ ++struct dma_coherent_mem { ++ void *virt_base; ++ u32 device_base; ++ int size; ++ int flags; ++ unsigned long *bitmap; ++}; ++ ++#define IOMMU_BUG_ON(test) \ ++do { \ ++ if (unlikely(test)) { \ ++ printk(KERN_ALERT "Fatal DMA error! " \ ++ "Please use 'swiotlb=force'\n"); \ ++ BUG(); \ ++ } \ ++} while (0) ++ ++int ++dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, ++ enum dma_data_direction direction) ++{ ++ int i, rc; ++ ++ BUG_ON(!valid_dma_direction(direction)); ++ WARN_ON(nents == 0 || sg[0].length == 0); ++ ++ if (swiotlb) { ++ rc = swiotlb_map_sg(hwdev, sg, nents, direction); ++ } else { ++ for (i = 0; i < nents; i++ ) { ++ sg[i].dma_address = ++ page_to_bus(sg[i].page) + sg[i].offset; ++ sg[i].dma_length = sg[i].length; ++ BUG_ON(!sg[i].page); ++ IOMMU_BUG_ON(address_needs_mapping( ++ hwdev, sg[i].dma_address)); ++ } ++ rc = nents; ++ } ++ ++ flush_write_buffers(); ++ return rc; ++} ++EXPORT_SYMBOL(dma_map_sg); ++ ++void ++dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, ++ enum dma_data_direction direction) ++{ ++ BUG_ON(!valid_dma_direction(direction)); ++ if (swiotlb) ++ swiotlb_unmap_sg(hwdev, sg, nents, direction); ++} ++EXPORT_SYMBOL(dma_unmap_sg); ++ ++#ifdef CONFIG_HIGHMEM ++dma_addr_t ++dma_map_page(struct device *dev, struct page *page, unsigned long offset, ++ size_t size, enum dma_data_direction direction) ++{ ++ dma_addr_t dma_addr; ++ ++ BUG_ON(!valid_dma_direction(direction)); ++ ++ if (swiotlb) { ++ dma_addr = swiotlb_map_page( ++ dev, page, offset, size, direction); ++ } else { ++ dma_addr = page_to_bus(page) + offset; ++ IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr)); ++ } ++ ++ return dma_addr; ++} ++EXPORT_SYMBOL(dma_map_page); ++ ++void ++dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, ++ enum dma_data_direction direction) ++{ ++ BUG_ON(!valid_dma_direction(direction)); ++ if (swiotlb) ++ swiotlb_unmap_page(dev, dma_address, size, direction); ++} ++EXPORT_SYMBOL(dma_unmap_page); ++#endif /* CONFIG_HIGHMEM */ ++ ++int ++dma_mapping_error(dma_addr_t dma_addr) ++{ ++ if (swiotlb) ++ return swiotlb_dma_mapping_error(dma_addr); ++ return 0; ++} ++EXPORT_SYMBOL(dma_mapping_error); ++ ++int ++dma_supported(struct device *dev, u64 mask) ++{ ++ if (swiotlb) ++ return swiotlb_dma_supported(dev, mask); ++ /* ++ * By default we'll BUG when an infeasible DMA is requested, and ++ * request swiotlb=force (see IOMMU_BUG_ON). ++ */ ++ return 1; ++} ++EXPORT_SYMBOL(dma_supported); ++ ++void *dma_alloc_coherent(struct device *dev, size_t size, ++ dma_addr_t *dma_handle, gfp_t gfp) ++{ ++ void *ret; ++ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; ++ unsigned int order = get_order(size); ++ unsigned long vstart; ++ u64 mask; ++ ++ /* ignore region specifiers */ ++ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); ++ ++ if (mem) { ++ int page = bitmap_find_free_region(mem->bitmap, mem->size, ++ order); ++ if (page >= 0) { ++ *dma_handle = mem->device_base + (page << PAGE_SHIFT); ++ ret = mem->virt_base + (page << PAGE_SHIFT); ++ memset(ret, 0, size); ++ return ret; ++ } ++ if (mem->flags & DMA_MEMORY_EXCLUSIVE) ++ return NULL; ++ } ++ ++ if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) ++ gfp |= GFP_DMA; ++ ++ vstart = __get_free_pages(gfp, order); ++ ret = (void *)vstart; ++ ++ if (dev != NULL && dev->coherent_dma_mask) ++ mask = dev->coherent_dma_mask; ++ else ++ mask = 0xffffffff; ++ ++ if (ret != NULL) { ++ if (xen_create_contiguous_region(vstart, order, ++ fls64(mask)) != 0) { ++ free_pages(vstart, order); ++ return NULL; ++ } ++ memset(ret, 0, size); ++ *dma_handle = virt_to_bus(ret); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(dma_alloc_coherent); ++ ++void dma_free_coherent(struct device *dev, size_t size, ++ void *vaddr, dma_addr_t dma_handle) ++{ ++ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; ++ int order = get_order(size); ++ ++ if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { ++ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; ++ ++ bitmap_release_region(mem->bitmap, page, order); ++ } else { ++ xen_destroy_contiguous_region((unsigned long)vaddr, order); ++ free_pages((unsigned long)vaddr, order); ++ } ++} ++EXPORT_SYMBOL(dma_free_coherent); ++ ++#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY ++int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, ++ dma_addr_t device_addr, size_t size, int flags) ++{ ++ void __iomem *mem_base = NULL; ++ int pages = size >> PAGE_SHIFT; ++ int bitmap_size = (pages + 31)/32; ++ ++ if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) ++ goto out; ++ if (!size) ++ goto out; ++ if (dev->dma_mem) ++ goto out; ++ ++ /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ ++ ++ mem_base = ioremap(bus_addr, size); ++ if (!mem_base) ++ goto out; ++ ++ dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); ++ if (!dev->dma_mem) ++ goto out; ++ dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); ++ if (!dev->dma_mem->bitmap) ++ goto free1_out; ++ ++ dev->dma_mem->virt_base = mem_base; ++ dev->dma_mem->device_base = device_addr; ++ dev->dma_mem->size = pages; ++ dev->dma_mem->flags = flags; ++ ++ if (flags & DMA_MEMORY_MAP) ++ return DMA_MEMORY_MAP; ++ ++ return DMA_MEMORY_IO; ++ ++ free1_out: ++ kfree(dev->dma_mem->bitmap); ++ out: ++ if (mem_base) ++ iounmap(mem_base); ++ return 0; ++} ++EXPORT_SYMBOL(dma_declare_coherent_memory); ++ ++void dma_release_declared_memory(struct device *dev) ++{ ++ struct dma_coherent_mem *mem = dev->dma_mem; ++ ++ if(!mem) ++ return; ++ dev->dma_mem = NULL; ++ iounmap(mem->virt_base); ++ kfree(mem->bitmap); ++ kfree(mem); ++} ++EXPORT_SYMBOL(dma_release_declared_memory); ++ ++void *dma_mark_declared_memory_occupied(struct device *dev, ++ dma_addr_t device_addr, size_t size) ++{ ++ struct dma_coherent_mem *mem = dev->dma_mem; ++ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ int pos, err; ++ ++ if (!mem) ++ return ERR_PTR(-EINVAL); ++ ++ pos = (device_addr - mem->device_base) >> PAGE_SHIFT; ++ err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); ++ if (err != 0) ++ return ERR_PTR(err); ++ return mem->virt_base + (pos << PAGE_SHIFT); ++} ++EXPORT_SYMBOL(dma_mark_declared_memory_occupied); ++#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ ++ ++dma_addr_t ++dma_map_single(struct device *dev, void *ptr, size_t size, ++ enum dma_data_direction direction) ++{ ++ dma_addr_t dma; ++ ++ BUG_ON(!valid_dma_direction(direction)); ++ WARN_ON(size == 0); ++ ++ if (swiotlb) { ++ dma = swiotlb_map_single(dev, ptr, size, direction); ++ } else { ++ dma = virt_to_bus(ptr); ++ IOMMU_BUG_ON(range_straddles_page_boundary(ptr, size)); ++ IOMMU_BUG_ON(address_needs_mapping(dev, dma)); ++ } ++ ++ flush_write_buffers(); ++ return dma; ++} ++EXPORT_SYMBOL(dma_map_single); ++ ++void ++dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, ++ enum dma_data_direction direction) ++{ ++ BUG_ON(!valid_dma_direction(direction)); ++ if (swiotlb) ++ swiotlb_unmap_single(dev, dma_addr, size, direction); ++} ++EXPORT_SYMBOL(dma_unmap_single); ++ ++void ++dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, ++ enum dma_data_direction direction) ++{ ++ if (swiotlb) ++ swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction); ++} ++EXPORT_SYMBOL(dma_sync_single_for_cpu); ++ ++void ++dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, ++ enum dma_data_direction direction) ++{ ++ if (swiotlb) ++ swiotlb_sync_single_for_device(dev, dma_handle, size, direction); ++} ++EXPORT_SYMBOL(dma_sync_single_for_device); +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/process-xen.c +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/arch/i386/kernel/process-xen.c Wed Aug 08 16:25:28 2007 -0300 +@@ -0,0 +1,884 @@ ++/* ++ * linux/arch/i386/kernel/process.c ++ * ++ * Copyright (C) 1995 Linus Torvalds ++ * ++ * Pentium III FXSR, SSE support ++ * Gareth Hughes , May 2000 ++ */ ++ ++/* ++ * This file handles the architecture-dependent parts of process handling.. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_MATH_EMULATION ++#include ++#endif ++ ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++ ++asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); ++ ++static int hlt_counter; ++ ++unsigned long boot_option_idle_override = 0; ++EXPORT_SYMBOL(boot_option_idle_override); ++ ++/* ++ * Return saved PC of a blocked thread. ++ */ ++unsigned long thread_saved_pc(struct task_struct *tsk) ++{ ++ return ((unsigned long *)tsk->thread.esp)[3]; ++} ++ ++/* ++ * Powermanagement idle function, if any.. ++ */ ++void (*pm_idle)(void); ++EXPORT_SYMBOL(pm_idle); ++static DEFINE_PER_CPU(unsigned int, cpu_idle_state); ++ ++void disable_hlt(void) ++{ ++ hlt_counter++; ++} ++ ++EXPORT_SYMBOL(disable_hlt); ++ ++void enable_hlt(void) ++{ ++ hlt_counter--; ++} ++ ++EXPORT_SYMBOL(enable_hlt); ++ ++/* ++ * On SMP it's slightly faster (but much more power-consuming!) ++ * to poll the ->work.need_resched flag instead of waiting for the ++ * cross-CPU IPI to arrive. Use this option with caution. ++ */ ++static void poll_idle (void) ++{ ++ local_irq_enable(); ++ ++ asm volatile( ++ "2:" ++ "testl %0, %1;" ++ "rep; nop;" ++ "je 2b;" ++ : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); ++} ++ ++static void xen_idle(void) ++{ ++ current_thread_info()->status &= ~TS_POLLING; ++ /* ++ * TS_POLLING-cleared state must be visible before we ++ * test NEED_RESCHED: ++ */ ++ smp_mb(); ++ ++ local_irq_disable(); ++ if (!need_resched()) ++ safe_halt(); /* enables interrupts racelessly */ ++ else ++ local_irq_enable(); ++ current_thread_info()->status |= TS_POLLING; ++} ++#ifdef CONFIG_APM_MODULE ++EXPORT_SYMBOL(default_idle); ++#endif ++ ++#ifdef CONFIG_HOTPLUG_CPU ++extern cpumask_t cpu_initialized; ++static inline void play_dead(void) ++{ ++ idle_task_exit(); ++ local_irq_disable(); ++ cpu_clear(smp_processor_id(), cpu_initialized); ++ preempt_enable_no_resched(); ++ HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); ++ cpu_bringup(); ++} ++#else ++static inline void play_dead(void) ++{ ++ BUG(); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++/* ++ * The idle thread. There's no useful work to be ++ * done, so just try to conserve power and have a ++ * low exit latency (ie sit in a loop waiting for ++ * somebody to say that they'd like to reschedule) ++ */ ++void cpu_idle(void) ++{ ++ int cpu = smp_processor_id(); ++ ++ current_thread_info()->status |= TS_POLLING; ++ ++ /* endless idle loop with no priority at all */ ++ while (1) { ++ while (!need_resched()) { ++ void (*idle)(void); ++ ++ if (__get_cpu_var(cpu_idle_state)) ++ __get_cpu_var(cpu_idle_state) = 0; ++ ++ rmb(); ++ idle = xen_idle; /* no alternatives */ ++ ++ if (cpu_is_offline(cpu)) ++ play_dead(); ++ ++ __get_cpu_var(irq_stat).idle_timestamp = jiffies; ++ idle(); ++ } ++ preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++ } ++} ++ ++void cpu_idle_wait(void) ++{ ++ unsigned int cpu, this_cpu = get_cpu(); ++ cpumask_t map, tmp = current->cpus_allowed; ++ ++ set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); ++ put_cpu(); ++ ++ cpus_clear(map); ++ for_each_online_cpu(cpu) { ++ per_cpu(cpu_idle_state, cpu) = 1; ++ cpu_set(cpu, map); ++ } ++ ++ __get_cpu_var(cpu_idle_state) = 0; ++ ++ wmb(); ++ do { ++ ssleep(1); ++ for_each_online_cpu(cpu) { ++ if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) ++ cpu_clear(cpu, map); ++ } ++ cpus_and(map, map, cpu_online_map); ++ } while (!cpus_empty(map)); ++ ++ set_cpus_allowed(current, tmp); ++} ++EXPORT_SYMBOL_GPL(cpu_idle_wait); ++ ++/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */ ++/* Always use xen_idle() instead. */ ++void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) {} ++ ++void __devinit select_idle_routine(const struct cpuinfo_x86 *c) ++{ ++} ++ ++static int __init idle_setup (char *str) ++{ ++ if (!strncmp(str, "poll", 4)) { ++ printk("using polling idle threads.\n"); ++ pm_idle = poll_idle; ++ } ++ ++ boot_option_idle_override = 1; ++ return 1; ++} ++ ++__setup("idle=", idle_setup); ++ ++void show_regs(struct pt_regs * regs) ++{ ++ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; ++ ++ printk("\n"); ++ printk("Pid: %d, comm: %20s\n", current->pid, current->comm); ++ printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); ++ print_symbol("EIP is at %s\n", regs->eip); ++ ++ if (user_mode_vm(regs)) ++ printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); ++ printk(" EFLAGS: %08lx %s (%s %.*s)\n", ++ regs->eflags, print_tainted(), init_utsname()->release, ++ (int)strcspn(init_utsname()->version, " "), ++ init_utsname()->version); ++ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", ++ regs->eax,regs->ebx,regs->ecx,regs->edx); ++ printk("ESI: %08lx EDI: %08lx EBP: %08lx", ++ regs->esi, regs->edi, regs->ebp); ++ printk(" DS: %04x ES: %04x GS: %04x\n", ++ 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs); ++ ++ cr0 = read_cr0(); ++ cr2 = read_cr2(); ++ cr3 = read_cr3(); ++ cr4 = read_cr4_safe(); ++ printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); ++ show_trace(NULL, regs, ®s->esp); ++} ++ ++/* ++ * This gets run with %ebx containing the ++ * function to call, and %edx containing ++ * the "args". ++ */ ++extern void kernel_thread_helper(void); ++ ++/* ++ * Create a kernel thread ++ */ ++int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) ++{ ++ struct pt_regs regs; ++ ++ memset(®s, 0, sizeof(regs)); ++ ++ regs.ebx = (unsigned long) fn; ++ regs.edx = (unsigned long) arg; ++ ++ regs.xds = __USER_DS; ++ regs.xes = __USER_DS; ++ regs.xgs = __KERNEL_PDA; ++ regs.orig_eax = -1; ++ regs.eip = (unsigned long) kernel_thread_helper; ++ regs.xcs = __KERNEL_CS | get_kernel_rpl(); ++ regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; ++ ++ /* Ok, create the new process.. */ ++ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); ++} ++EXPORT_SYMBOL(kernel_thread); ++ ++/* ++ * Free current thread data structures etc.. ++ */ ++void exit_thread(void) ++{ ++ /* The process may have allocated an io port bitmap... nuke it. */ ++ if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { ++ struct task_struct *tsk = current; ++ struct thread_struct *t = &tsk->thread; ++ struct physdev_set_iobitmap set_iobitmap; ++ memset(&set_iobitmap, 0, sizeof(set_iobitmap)); ++ HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap); ++ kfree(t->io_bitmap_ptr); ++ t->io_bitmap_ptr = NULL; ++ clear_thread_flag(TIF_IO_BITMAP); ++ } ++} ++ ++void flush_thread(void) ++{ ++ struct task_struct *tsk = current; ++ ++ memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); ++ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); ++ clear_tsk_thread_flag(tsk, TIF_DEBUG); ++ /* ++ * Forget coprocessor state.. ++ */ ++ clear_fpu(tsk); ++ clear_used_math(); ++} ++ ++void release_thread(struct task_struct *dead_task) ++{ ++ BUG_ON(dead_task->mm); ++ release_vm86_irqs(dead_task); ++} ++ ++/* ++ * This gets called before we allocate a new thread and copy ++ * the current task into it. ++ */ ++void prepare_to_copy(struct task_struct *tsk) ++{ ++ unlazy_fpu(tsk); ++} ++ ++int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, ++ unsigned long unused, ++ struct task_struct * p, struct pt_regs * regs) ++{ ++ struct pt_regs * childregs; ++ struct task_struct *tsk; ++ int err; ++ ++ childregs = task_pt_regs(p); ++ *childregs = *regs; ++ childregs->eax = 0; ++ childregs->esp = esp; ++ ++ p->thread.esp = (unsigned long) childregs; ++ p->thread.esp0 = (unsigned long) (childregs+1); ++ ++ p->thread.eip = (unsigned long) ret_from_fork; ++ ++ savesegment(fs,p->thread.fs); ++ ++ tsk = current; ++ if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { ++ p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, ++ IO_BITMAP_BYTES, GFP_KERNEL); ++ if (!p->thread.io_bitmap_ptr) { ++ p->thread.io_bitmap_max = 0; ++ return -ENOMEM; ++ } ++ set_tsk_thread_flag(p, TIF_IO_BITMAP); ++ } ++ ++ /* ++ * Set a new TLS for the child thread? ++ */ ++ if (clone_flags & CLONE_SETTLS) { ++ struct desc_struct *desc; ++ struct user_desc info; ++ int idx; ++ ++ err = -EFAULT; ++ if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) ++ goto out; ++ err = -EINVAL; ++ if (LDT_empty(&info)) ++ goto out; ++ ++ idx = info.entry_number; ++ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) ++ goto out; ++ ++ desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; ++ desc->a = LDT_entry_a(&info); ++ desc->b = LDT_entry_b(&info); ++ } ++ ++ p->thread.iopl = current->thread.iopl; ++ ++ err = 0; ++ out: ++ if (err && p->thread.io_bitmap_ptr) { ++ kfree(p->thread.io_bitmap_ptr); ++ p->thread.io_bitmap_max = 0; ++ } ++ return err; ++} ++ ++/* ++ * fill in the user structure for a core dump.. ++ */ ++void dump_thread(struct pt_regs * regs, struct user * dump) ++{ ++ int i; ++ ++/* changed the size calculations - should hopefully work better. lbt */ ++ dump->magic = CMAGIC; ++ dump->start_code = 0; ++ dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); ++ dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; ++ dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; ++ dump->u_dsize -= dump->u_tsize; ++ dump->u_ssize = 0; ++ for (i = 0; i < 8; i++) ++ dump->u_debugreg[i] = current->thread.debugreg[i]; ++ ++ if (dump->start_stack < TASK_SIZE) ++ dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; ++ ++ dump->regs.ebx = regs->ebx; ++ dump->regs.ecx = regs->ecx; ++ dump->regs.edx = regs->edx; ++ dump->regs.esi = regs->esi; ++ dump->regs.edi = regs->edi; ++ dump->regs.ebp = regs->ebp; ++ dump->regs.eax = regs->eax; ++ dump->regs.ds = regs->xds; ++ dump->regs.es = regs->xes; ++ savesegment(fs,dump->regs.fs); ++ dump->regs.gs = regs->xgs; ++ dump->regs.orig_eax = regs->orig_eax; ++ dump->regs.eip = regs->eip; ++ dump->regs.cs = regs->xcs; ++ dump->regs.eflags = regs->eflags; ++ dump->regs.esp = regs->esp; ++ dump->regs.ss = regs->xss; ++ ++ dump->u_fpvalid = dump_fpu (regs, &dump->i387); ++} ++EXPORT_SYMBOL(dump_thread); ++ ++/* ++ * Capture the user space registers if the task is not running (in user space) ++ */ ++int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) ++{ ++ struct pt_regs ptregs = *task_pt_regs(tsk); ++ ptregs.xcs &= 0xffff; ++ ptregs.xds &= 0xffff; ++ ptregs.xes &= 0xffff; ++ ptregs.xss &= 0xffff; ++ ++ elf_core_copy_regs(regs, &ptregs); ++ ++ return 1; ++} ++ ++static noinline void __switch_to_xtra(struct task_struct *next_p) ++{ ++ struct thread_struct *next; ++ ++ next = &next_p->thread; ++ ++ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { ++ set_debugreg(next->debugreg[0], 0); ++ set_debugreg(next->debugreg[1], 1); ++ set_debugreg(next->debugreg[2], 2); ++ set_debugreg(next->debugreg[3], 3); ++ /* no 4 and 5 */ ++ set_debugreg(next->debugreg[6], 6); ++ set_debugreg(next->debugreg[7], 7); ++ } ++#ifndef CONFIG_XEN ++ if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { ++ /* ++ * Disable the bitmap via an invalid offset. We still cache ++ * the previous bitmap owner and the IO bitmap contents: ++ */ ++ tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; ++ return; ++ } ++ ++ if (likely(next == tss->io_bitmap_owner)) { ++ /* ++ * Previous owner of the bitmap (hence the bitmap content) ++ * matches the next task, we dont have to do anything but ++ * to set a valid offset in the TSS: ++ */ ++ tss->io_bitmap_base = IO_BITMAP_OFFSET; ++ return; ++ } ++ /* ++ * Lazy TSS's I/O bitmap copy. We set an invalid offset here ++ * and we let the task to get a GPF in case an I/O instruction ++ * is performed. The handler of the GPF will verify that the ++ * faulting task has a valid I/O bitmap and, it true, does the ++ * real copy and restart the instruction. This will save us ++ * redundant copies when the currently switched task does not ++ * perform any I/O during its timeslice. ++ */ ++ tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; ++#endif /* !CONFIG_XEN */ ++} ++ ++/* ++ * This function selects if the context switch from prev to next ++ * has to tweak the TSC disable bit in the cr4. ++ */ ++static inline void disable_tsc(struct task_struct *prev_p, ++ struct task_struct *next_p) ++{ ++ struct thread_info *prev, *next; ++ ++ /* ++ * gcc should eliminate the ->thread_info dereference if ++ * has_secure_computing returns 0 at compile time (SECCOMP=n). ++ */ ++ prev = task_thread_info(prev_p); ++ next = task_thread_info(next_p); ++ ++ if (has_secure_computing(prev) || has_secure_computing(next)) { ++ /* slow path here */ ++ if (has_secure_computing(prev) && ++ !has_secure_computing(next)) { ++ write_cr4(read_cr4() & ~X86_CR4_TSD); ++ } else if (!has_secure_computing(prev) && ++ has_secure_computing(next)) ++ write_cr4(read_cr4() | X86_CR4_TSD); ++ } ++} ++ ++/* ++ * switch_to(x,yn) should switch tasks from x to y. ++ * ++ * We fsave/fwait so that an exception goes off at the right time ++ * (as a call from the fsave or fwait in effect) rather than to ++ * the wrong process. Lazy FP saving no longer makes any sense ++ * with modern CPU's, and this simplifies a lot of things (SMP ++ * and UP become the same). ++ * ++ * NOTE! We used to use the x86 hardware context switching. The ++ * reason for not using it any more becomes apparent when you ++ * try to recover gracefully from saved state that is no longer ++ * valid (stale segment register values in particular). With the ++ * hardware task-switch, there is no way to fix up bad state in ++ * a reasonable manner. ++ * ++ * The fact that Intel documents the hardware task-switching to ++ * be slow is a fairly red herring - this code is not noticeably ++ * faster. However, there _is_ some room for improvement here, ++ * so the performance issues may eventually be a valid point. ++ * More important, however, is the fact that this allows us much ++ * more flexibility. ++ * ++ * The return value (in %eax) will be the "prev" task after ++ * the task-switch, and shows up in ret_from_fork in entry.S, ++ * for example. ++ */ ++struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) ++{ ++ struct thread_struct *prev = &prev_p->thread, ++ *next = &next_p->thread; ++ int cpu = smp_processor_id(); ++#ifndef CONFIG_X86_NO_TSS ++ struct tss_struct *tss = &per_cpu(init_tss, cpu); ++#endif ++ struct physdev_set_iobitmap iobmp_op; ++ multicall_entry_t _mcl[8], *mcl = _mcl; ++ ++ /* XEN NOTE: FS/GS saved in switch_mm(), not here. */ ++ ++ /* ++ * This is basically '__unlazy_fpu', except that we queue a ++ * multicall to indicate FPU task switch, rather than ++ * synchronously trapping to Xen. ++ */ ++ if (prev_p->thread_info->status & TS_USEDFPU) { ++ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ ++ mcl->op = __HYPERVISOR_fpu_taskswitch; ++ mcl->args[0] = 1; ++ mcl++; ++ } ++#if 0 /* lazy fpu sanity check */ ++ else BUG_ON(!(read_cr0() & 8)); ++#endif ++ ++ /* ++ * Reload esp0. ++ * This is load_esp0(tss, next) with a multicall. ++ */ ++ mcl->op = __HYPERVISOR_stack_switch; ++ mcl->args[0] = __KERNEL_DS; ++ mcl->args[1] = next->esp0; ++ mcl++; ++ ++ /* ++ * Load the per-thread Thread-Local Storage descriptor. ++ * This is load_TLS(next, cpu) with multicalls. ++ */ ++#define C(i) do { \ ++ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ ++ next->tls_array[i].b != prev->tls_array[i].b)) { \ ++ mcl->op = __HYPERVISOR_update_descriptor; \ ++ *(u64 *)&mcl->args[0] = virt_to_machine( \ ++ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ ++ *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \ ++ mcl++; \ ++ } \ ++} while (0) ++ C(0); C(1); C(2); ++#undef C ++ ++ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { ++ set_xen_guest_handle(iobmp_op.bitmap, ++ (char *)next->io_bitmap_ptr); ++ iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; ++ mcl->op = __HYPERVISOR_physdev_op; ++ mcl->args[0] = PHYSDEVOP_set_iobitmap; ++ mcl->args[1] = (unsigned long)&iobmp_op; ++ mcl++; ++ } ++ ++ (void)HYPERVISOR_multicall(_mcl, mcl - _mcl); ++ ++ /* ++ * Restore %fs if needed. ++ * ++ * Glibc normally makes %fs be zero. ++ */ ++ if (unlikely(next->fs)) ++ loadsegment(fs, next->fs); ++ ++ write_pda(pcurrent, next_p); ++ ++ /* we're going to use this soon, after a few expensive things */ ++ if (next_p->fpu_counter > 5) ++ prefetch(&next->i387.fxsave); ++ ++ /* ++ * Now maybe handle debug registers ++ */ ++ if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) ++ __switch_to_xtra(next_p); ++ ++ disable_tsc(prev_p, next_p); ++ ++ /* If the task has used fpu the last 5 timeslices, just do a full ++ * restore of the math state immediately to avoid the trap; the ++ * chances of needing FPU soon are obviously high now ++ */ ++ if (next_p->fpu_counter > 5) ++ math_state_restore(); ++ ++ return prev_p; ++} ++ ++asmlinkage int sys_fork(struct pt_regs regs) ++{ ++ return do_fork(SIGCHLD, regs.esp, ®s, 0, NULL, NULL); ++} ++ ++asmlinkage int sys_clone(struct pt_regs regs) ++{ ++ unsigned long clone_flags; ++ unsigned long newsp; ++ int __user *parent_tidptr, *child_tidptr; ++ ++ clone_flags = regs.ebx; ++ newsp = regs.ecx; ++ parent_tidptr = (int __user *)regs.edx; ++ child_tidptr = (int __user *)regs.edi; ++ if (!newsp) ++ newsp = regs.esp; ++ return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); ++} ++ ++/* ++ * This is trivial, and on the face of it looks like it ++ * could equally well be done in user mode. ++ * ++ * Not so, for quite unobvious reasons - register pressure. ++ * In user mode vfork() cannot have a stack frame, and if ++ * done by calling the "clone()" system call directly, you ++ * do not have enough call-clobbered registers to hold all ++ * the information you need. ++ */ ++asmlinkage int sys_vfork(struct pt_regs regs) ++{ ++ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0, NULL, NULL); ++} ++ ++/* ++ * sys_execve() executes a new program. ++ */ ++asmlinkage int sys_execve(struct pt_regs regs) ++{ ++ int error; ++ char * filename; ++ ++ filename = getname((char __user *) regs.ebx); ++ error = PTR_ERR(filename); ++ if (IS_ERR(filename)) ++ goto out; ++ error = do_execve(filename, ++ (char __user * __user *) regs.ecx, ++ (char __user * __user *) regs.edx, ++ ®s); ++ if (error == 0) { ++ task_lock(current); ++ current->ptrace &= ~PT_DTRACE; ++ task_unlock(current); ++ /* Make sure we don't return using sysenter.. */ ++ set_thread_flag(TIF_IRET); ++ } ++ putname(filename); ++out: ++ return error; ++} ++ ++#define top_esp (THREAD_SIZE - sizeof(unsigned long)) ++#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) ++ ++unsigned long get_wchan(struct task_struct *p) ++{ ++ unsigned long ebp, esp, eip; ++ unsigned long stack_page; ++ int count = 0; ++ if (!p || p == current || p->state == TASK_RUNNING) ++ return 0; ++ stack_page = (unsigned long)task_stack_page(p); ++ esp = p->thread.esp; ++ if (!stack_page || esp < stack_page || esp > top_esp+stack_page) ++ return 0; ++ /* include/asm-i386/system.h:switch_to() pushes ebp last. */ ++ ebp = *(unsigned long *) esp; ++ do { ++ if (ebp < stack_page || ebp > top_ebp+stack_page) ++ return 0; ++ eip = *(unsigned long *) (ebp+4); ++ if (!in_sched_functions(eip)) ++ return eip; ++ ebp = *(unsigned long *) ebp; ++ } while (count++ < 16); ++ return 0; ++} ++ ++/* ++ * sys_alloc_thread_area: get a yet unused TLS descriptor index. ++ */ ++static int get_free_idx(void) ++{ ++ struct thread_struct *t = ¤t->thread; ++ int idx; ++ ++ for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) ++ if (desc_empty(t->tls_array + idx)) ++ return idx + GDT_ENTRY_TLS_MIN; ++ return -ESRCH; ++} ++ ++/* ++ * Set a given TLS descriptor: ++ */ ++asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) ++{ ++ struct thread_struct *t = ¤t->thread; ++ struct user_desc info; ++ struct desc_struct *desc; ++ int cpu, idx; ++ ++ if (copy_from_user(&info, u_info, sizeof(info))) ++ return -EFAULT; ++ idx = info.entry_number; ++ ++ /* ++ * index -1 means the kernel should try to find and ++ * allocate an empty descriptor: ++ */ ++ if (idx == -1) { ++ idx = get_free_idx(); ++ if (idx < 0) ++ return idx; ++ if (put_user(idx, &u_info->entry_number)) ++ return -EFAULT; ++ } ++ ++ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) ++ return -EINVAL; ++ ++ desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; ++ ++ /* ++ * We must not get preempted while modifying the TLS. ++ */ ++ cpu = get_cpu(); ++ ++ if (LDT_empty(&info)) { ++ desc->a = 0; ++ desc->b = 0; ++ } else { ++ desc->a = LDT_entry_a(&info); ++ desc->b = LDT_entry_b(&info); ++ } ++ load_TLS(t, cpu); ++ ++ put_cpu(); ++ ++ return 0; ++} ++ ++/* ++ * Get the current Thread-Local Storage area: ++ */ ++ ++#define GET_BASE(desc) ( \ ++ (((desc)->a >> 16) & 0x0000ffff) | \ ++ (((desc)->b << 16) & 0x00ff0000) | \ ++ ( (desc)->b & 0xff000000) ) ++ ++#define GET_LIMIT(desc) ( \ ++ ((desc)->a & 0x0ffff) | \ ++ ((desc)->b & 0xf0000) ) ++ ++#define GET_32BIT(desc) (((desc)->b >> 22) & 1) ++#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) ++#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) ++#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) ++#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) ++#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) ++ ++asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) ++{ ++ struct user_desc info; ++ struct desc_struct *desc; ++ int idx; ++ ++ if (get_user(idx, &u_info->entry_number)) ++ return -EFAULT; ++ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) ++ return -EINVAL; ++ ++ memset(&info, 0, sizeof(info)); ++ ++ desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; ++ ++ info.entry_number = idx; ++ info.base_addr = GET_BASE(desc); ++ info.limit = GET_LIMIT(desc); ++ info.seg_32bit = GET_32BIT(desc); ++ info.contents = GET_CONTENTS(desc); ++ info.read_exec_only = !GET_WRITABLE(desc); ++ info.limit_in_pages = GET_LIMIT_PAGES(desc); ++ info.seg_not_present = !GET_PRESENT(desc); ++ info.useable = GET_USEABLE(desc); ++ ++ if (copy_to_user(u_info, &info, sizeof(info))) ++ return -EFAULT; ++ return 0; ++} ++ ++unsigned long arch_align_stack(unsigned long sp) ++{ ++ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) ++ sp -= get_random_int() % 8192; ++ return sp & ~0xf; ++} +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/quirks.c +--- a/arch/i386/kernel/quirks.c Wed Jul 18 12:23:24 2007 -0300 ++++ b/arch/i386/kernel/quirks.c Wed Aug 08 16:25:28 2007 -0300 +@@ -7,7 +7,7 @@ + #include + #include + +-#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) ++#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI) + static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) + { + u8 config, rev; +@@ -68,11 +68,19 @@ void __init quirk_intel_irqbalance(void) + word = read_pci_config_16(0, 0, 0x40, 0x4c); + + if (!(word & (1 << 13))) { ++#ifdef CONFIG_XEN ++ struct xen_platform_op op; ++ printk(KERN_INFO "Disabling irq balancing and affinity\n"); ++ op.cmd = XENPF_platform_quirk; ++ op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; ++ (void)HYPERVISOR_platform_op(&op); ++#else + printk(KERN_INFO "Disabling irq balancing and affinity\n"); + #ifdef CONFIG_IRQBALANCE + irqbalance_disable(""); + #endif + noirqdebug_setup(""); ++#endif /* CONFIG_XEN */ + #ifdef CONFIG_PROC_FS + no_irq_affinity = 1; + #endif +@@ -80,12 +88,12 @@ void __init quirk_intel_irqbalance(void) + printk(KERN_INFO "Disabling cpu hotplug control\n"); + enable_cpu_hotplug = 0; + #endif +-#ifdef CONFIG_X86_64 ++#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) + /* force the genapic selection to flat mode so that + * interrupts can be redirected to more than one CPU. + */ + genapic_force = &apic_flat; +-#endif ++#endif /* CONFIG_XEN */ + } + + /* put back the original value for config space */ +diff -r 4a9ef6a03fd9 -r 85b796b085e5 arch/i386/kernel/setup-xen.c +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/arch/i386/kernel/setup-xen.c Wed Aug 08 16:25:28 2007 -0300 +@@ -0,0 +1,845 @@ ++/* ++ * linux/arch/i386/kernel/setup.c ++ * ++ * Copyright (C) 1995 Linus Torvalds ++ * ++ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 ++ * ++ * Memory region support ++ * David Parsons , July-August 1999 ++ * ++ * Added E820 sanitization routine (removes overlapping memory regions); ++ * Brian Moyle , February 2001 ++ * ++ * Moved CPU detection code to cpu/${cpu}.c ++ * Patrick Mochel , March 2002 ++ * ++ * Provisions for empty E820 memory regions (reported by certain BIOSes). ++ * Alex Achenbach , December 2002. ++ * ++ */ ++ ++/* ++ * This file handles the architecture-dependent parts of initialization ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include