From: jbeulich@novell.com Subject: allow use of split page table locks Patch-mainline: obsolete When split page table locks are in use (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS), mm_pin() and mm_unpin() now go through the new _pin_lock() helper, which takes mm->page_table_lock and additionally the pte lock of every populated user pmd entry of the mm. The Xen-specific default of 4096 for SPLIT_PTLOCK_CPUS is removed from mm/Kconfig. --- arch/i386/mm/pgtable-xen.c | 66 +++++++++++++++++++++++++++++++++++++++--- arch/x86_64/mm/pageattr-xen.c | 66 +++++++++++++++++++++++++++++++++++++++--- mm/Kconfig | 3 - 3 files changed, 124 insertions(+), 11 deletions(-) --- a/arch/i386/mm/pgtable-xen.c 2007-08-27 14:01:27.000000000 -0400 +++ b/arch/i386/mm/pgtable-xen.c 2007-08-27 14:01:27.000000000 -0400 @@ -658,6 +658,64 @@ void make_pages_writable(void *va, unsig } } +static void _pin_lock(struct mm_struct *mm, int lock) { + if (lock) + spin_lock(&mm->page_table_lock); +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + /* While mm->page_table_lock protects us against insertions and + * removals of higher level page table pages, it doesn't protect + * against updates of pte-s. Such updates, however, require the + * pte pages to be in consistent state (unpinned+writable or + * pinned+readonly). The pinning and attribute changes, however + * cannot be done atomically, which is why such updates must be + * prevented from happening concurrently. + * Note that no pte lock can ever elsewhere be acquired nesting + * with an already acquired one in the same mm, or with the mm's + * page_table_lock already acquired, as that would break in the + * non-split case (where all these are actually resolving to the + * one page_table_lock). Thus acquiring all of them here is not + * going to result in deadlocks, and the order of acquires + * doesn't matter. 
+ */ + { + pgd_t *pgd = mm->pgd; + unsigned g; + + for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { + pud_t *pud; + unsigned u; + + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + pmd_t *pmd; + unsigned m; + + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + spinlock_t *ptl; + + if (pmd_none(*pmd)) + continue; + ptl = pte_lockptr(0, pmd); + if (lock) + spin_lock(ptl); + else + spin_unlock(ptl); + } + } + } + } +#endif + if (!lock) + spin_unlock(&mm->page_table_lock); +} +#define pin_lock(mm) _pin_lock(mm, 1) +#define pin_unlock(mm) _pin_lock(mm, 0) + static inline void pgd_walk_set_prot(struct page *page, pgprot_t flags) { unsigned long pfn = page_to_pfn(page); @@ -740,18 +798,18 @@ void mm_pin(struct mm_struct *mm) { if (xen_feature(XENFEAT_writable_page_tables)) return; - spin_lock(&mm->page_table_lock); + pin_lock(mm); __pgd_pin(mm->pgd); - spin_unlock(&mm->page_table_lock); + pin_unlock(mm); } void mm_unpin(struct mm_struct *mm) { if (xen_feature(XENFEAT_writable_page_tables)) return; - spin_lock(&mm->page_table_lock); + pin_lock(mm); __pgd_unpin(mm->pgd); - spin_unlock(&mm->page_table_lock); + pin_unlock(mm); } void mm_pin_all(void) --- a/arch/x86_64/mm/pageattr-xen.c 2007-08-27 14:01:27.000000000 -0400 +++ b/arch/x86_64/mm/pageattr-xen.c 2007-08-27 14:01:27.000000000 -0400 @@ -20,6 +20,64 @@ LIST_HEAD(mm_unpinned); DEFINE_SPINLOCK(mm_unpinned_lock); +static void _pin_lock(struct mm_struct *mm, int lock) { + if (lock) + spin_lock(&mm->page_table_lock); +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + /* While mm->page_table_lock protects us against insertions and + * removals of higher level page table pages, it doesn't protect + * against updates of pte-s. Such updates, however, require the + * pte pages to be in consistent state (unpinned+writable or + * pinned+readonly). 
The pinning and attribute changes, however + * cannot be done atomically, which is why such updates must be + * prevented from happening concurrently. + * Note that no pte lock can ever elsewhere be acquired nesting + * with an already acquired one in the same mm, or with the mm's + * page_table_lock already acquired, as that would break in the + * non-split case (where all these are actually resolving to the + * one page_table_lock). Thus acquiring all of them here is not + * going to result in deadlocks, and the order of acquires + * doesn't matter. + */ + { + pgd_t *pgd = mm->pgd; + unsigned g; + + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { + pud_t *pud; + unsigned u; + + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + pmd_t *pmd; + unsigned m; + + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + spinlock_t *ptl; + + if (pmd_none(*pmd)) + continue; + ptl = pte_lockptr(0, pmd); + if (lock) + spin_lock(ptl); + else + spin_unlock(ptl); + } + } + } + } +#endif + if (!lock) + spin_unlock(&mm->page_table_lock); +} +#define pin_lock(mm) _pin_lock(mm, 1) +#define pin_unlock(mm) _pin_lock(mm, 0) + static inline void mm_walk_set_prot(void *pt, pgprot_t flags) { struct page *page = virt_to_page(pt); @@ -76,7 +134,7 @@ void mm_pin(struct mm_struct *mm) if (xen_feature(XENFEAT_writable_page_tables)) return; - spin_lock(&mm->page_table_lock); + pin_lock(mm); mm_walk(mm, PAGE_KERNEL_RO); if (HYPERVISOR_update_va_mapping( @@ -97,7 +155,7 @@ void mm_pin(struct mm_struct *mm) list_del(&mm->context.unpinned); spin_unlock(&mm_unpinned_lock); - spin_unlock(&mm->page_table_lock); + pin_unlock(mm); } void mm_unpin(struct mm_struct *mm) @@ -105,7 +163,7 @@ void mm_unpin(struct mm_struct *mm) if (xen_feature(XENFEAT_writable_page_tables)) return; - spin_lock(&mm->page_table_lock); + pin_lock(mm); xen_pgd_unpin(__pa(mm->pgd)); 
xen_pgd_unpin(__pa(__user_pgd(mm->pgd))); @@ -125,7 +183,7 @@ void mm_unpin(struct mm_struct *mm) list_add(&mm->context.unpinned, &mm_unpinned); spin_unlock(&mm_unpinned_lock); - spin_unlock(&mm->page_table_lock); + pin_unlock(mm); } void mm_pin_all(void) --- a/mm/Kconfig 2007-08-27 14:01:25.000000000 -0400 +++ b/mm/Kconfig 2007-08-27 14:01:27.000000000 -0400 @@ -132,14 +132,11 @@ config MEMORY_HOTPLUG_SPARSE # Default to 4 for wider testing, though 8 might be more appropriate. # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. -# XEN on x86 architecture uses the mapping field on pagetable pages to store a -# pointer to the destructor. This conflicts with pte_lock_deinit(). # config SPLIT_PTLOCK_CPUS int default "4096" if ARM && !CPU_CACHE_VIPT default "4096" if PARISC && !PA20 - default "4096" if X86_XEN || X86_64_XEN default "4" #