From b3b9c2932c32e0692018ed5f12f3fd8c70eea8ce Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:34 -0700 Subject: mm, x86, pat: rework linear pfn-mmap tracking Replace the generic vma-flag VM_PFN_AT_MMAP with x86-only VM_PAT. We can toss mapping address from remap_pfn_range() into track_pfn_vma_new(), and collect all PAT-related logic together in arch/x86/. This patch also restores orignal frustration-free is_cow_mapping() check in remap_pfn_range(), as it was before commit v2.6.28-rc8-88-g3c8bb73 ("x86: PAT: store vm_pgoff for all linear_over_vma_region mappings - v3") is_linear_pfn_mapping() checks can be removed from mm/huge_memory.c, because it already handled by VM_PFNMAP in VM_NO_THP bit-mask. [suresh.b.siddha@intel.com: Reset the VM_PAT flag as part of untrack_pfn_vma()] Signed-off-by: Konstantin Khlebnikov Signed-off-by: Suresh Siddha Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Hugh Dickins Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 311be906b57d..75d1632d3477 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -117,7 +117,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ -#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ +#define VM_PAT 0x40000000 /* PAT reserves whole VMA at once (x86) */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ /* Bits set in the VMA until the stack is in its final location */ @@ -158,24 +158,6 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ -/* - * This interface is used by x86 PAT code to identify a pfn mapping that is - * linear over entire vma. This is to optimize PAT code that deals with - * marking the physical region with a particular prot. This is not for generic - * mm use. Note also that this check will not work if the pfn mapping is - * linear for a vma starting at physical address 0. In which case PAT code - * falls back to slow path of reserving physical range page by page. - */ -static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) -{ - return !!(vma->vm_flags & VM_PFN_AT_MMAP); -} - -static inline int is_pfn_mapping(struct vm_area_struct *vma) -{ - return !!(vma->vm_flags & VM_PFNMAP); -} - /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask -- cgit v1.2.3 From cc2383ec06be093789469852e1fe96e1148e9a2c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:37 -0700 Subject: mm: introduce arch-specific vma flag VM_ARCH_1 Combine several arch-specific vma flags into one. before patch: 0x00000200 0x01000000 0x20000000 0x40000000 x86 VM_NOHUGEPAGE VM_HUGEPAGE - VM_PAT powerpc - - VM_SAO - parisc VM_GROWSUP - - - ia64 VM_GROWSUP - - - nommu - VM_MAPPED_COPY - - others - - - - after patch: 0x00000200 0x01000000 0x20000000 0x40000000 x86 - VM_PAT VM_HUGEPAGE VM_NOHUGEPAGE powerpc - VM_SAO - - parisc - VM_GROWSUP - - ia64 - VM_GROWSUP - - nommu - VM_MAPPED_COPY - - others - VM_ARCH_1 - - And voila! One completely free bit. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 75d1632d3477..9c039f84b63c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -70,6 +70,8 @@ extern unsigned int kobjsize(const void *objp); /* * vm_flags in vm_area_struct, see mm_types.h. */ +#define VM_NONE 0x00000000 + #define VM_READ 0x00000001 /* currently active flags */ #define VM_WRITE 0x00000002 #define VM_EXEC 0x00000004 @@ -82,12 +84,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ -#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) -#define VM_GROWSUP 0x00000200 -#else -#define VM_GROWSUP 0x00000000 -#define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */ -#endif #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ @@ -106,20 +102,32 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ -#ifndef CONFIG_TRANSPARENT_HUGEPAGE -#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ -#else -#define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */ -#endif +#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_NODUMP 0x04000000 /* Do not include in the core dump */ #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ -#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ -#define VM_PAT 0x40000000 /* PAT reserves whole VMA at once (x86) */ +#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ +#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ +#if defined(CONFIG_X86) +# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ +#elif defined(CONFIG_PPC) +# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ +#elif defined(CONFIG_PARISC) +# define VM_GROWSUP VM_ARCH_1 +#elif defined(CONFIG_IA64) +# define VM_GROWSUP VM_ARCH_1 +#elif !defined(CONFIG_MMU) +# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ +#endif + +#ifndef VM_GROWSUP +# define VM_GROWSUP VM_NONE +#endif + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) -- cgit v1.2.3 From 4b6e1e37026ec7dae9b23d78ffcebdd5ddb1bfa1 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:40 -0700 Subject: mm: kill vma flag VM_INSERTPAGE Merge VM_INSERTPAGE into VM_MIXEDMAP. VM_MIXEDMAP VMA can mix pure-pfn ptes, special ptes and normal ptes. Now copy_page_range() always copies VM_MIXEDMAP VMA on fork like VM_PFNMAP. If driver populates whole VMA at mmap() it probably not expects page-faults. This patch removes special check from vma_wants_writenotify() which disables pages write tracking for VMA populated via vm_instert_page(). BDI below mapped file should not use dirty-accounting, moreover do_wp_page() can handle this. vm_insert_page() still marks vma after first usage. Usually it is called from f_op->mmap() handler under mm->mmap_sem write-lock, so it able to change vma->vm_flags. Caller must set VM_MIXEDMAP at mmap time if it wants to call this function from other places, for example from page-fault handler. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c039f84b63c..fb0685b17914 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -103,7 +103,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_NODUMP 0x04000000 /* Do not include in the core dump */ #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ -- cgit v1.2.3 From 0b173bc4daa8f8ec03a85abf5e47b23502ff80af Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:46 -0700 Subject: mm: kill vma flag VM_CAN_NONLINEAR Move actual pte filling for non-linear file mappings into the new special vma operation: ->remap_pages(). Filesystems must implement this method to get non-linear mapping support, if it uses filemap_fault() then generic_file_remap_pages() can be used. Now device drivers can implement this method and obtain nonlinear vma support. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf #arch/tile Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index fb0685b17914..44d3fc25f556 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -105,7 +105,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_NODUMP 0x04000000 /* Do not include in the core dump */ -#define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ @@ -171,8 +170,7 @@ extern pgprot_t protection_map[16]; * of VM_FAULT_xxx flags that give details about how the fault was handled. * * pgoff should be used in favour of virtual_address, if possible. If pgoff - * is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear - * mapping support. + * is used, one may implement ->remap_pages to get nonlinear mapping support. */ struct vm_fault { unsigned int flags; /* FAULT_FLAG_xxx flags */ @@ -230,6 +228,9 @@ struct vm_operations_struct { int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from, const nodemask_t *to, unsigned long flags); #endif + /* called by sys_remap_file_pages() to populate non-linear mapping */ + int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, + unsigned long size, pgoff_t pgoff); }; struct mmu_gather; -- cgit v1.2.3 From e9714acf8c439688884234dcac2bfc38bb607d38 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:54 -0700 Subject: mm: kill vma flag VM_EXECUTABLE and mm->num_exe_file_vmas Currently the kernel sets mm->exe_file during sys_execve() and then tracks number of vmas with VM_EXECUTABLE flag in mm->num_exe_file_vmas, as soon as this counter drops to zero kernel resets mm->exe_file to NULL. Plus it resets mm->exe_file at last mmput() when mm->mm_users drops to zero. VMA with VM_EXECUTABLE flag appears after mapping file with flag MAP_EXECUTABLE, such vmas can appears only at sys_execve() or after vma splitting, because sys_mmap ignores this flag. Usually binfmt module sets mm->exe_file and mmaps executable vmas with this file, they hold mm->exe_file while task is running. comment from v2.6.25-6245-g925d1c4 ("procfs task exe symlink"), where all this stuff was introduced: > The kernel implements readlink of /proc/pid/exe by getting the file from > the first executable VMA. Then the path to the file is reconstructed and > reported as the result. > > Because of the VMA walk the code is slightly different on nommu systems. > This patch avoids separate /proc/pid/exe code on nommu systems. Instead of > walking the VMAs to find the first executable file-backed VMA we store a > reference to the exec'd file in the mm_struct. > > That reference would prevent the filesystem holding the executable file > from being unmounted even after unmapping the VMAs. So we track the number > of VM_EXECUTABLE VMAs and drop the new reference when the last one is > unmapped. This avoids pinning the mounted filesystem. exe_file's vma accounting is hooked into every file mmap/unmmap and vma split/merge just to fix some hypothetical pinning fs from umounting by mm, which already unmapped all its executable files, but still alive. Seems like currently nobody depends on this behaviour. We can try to remove this logic and keep mm->exe_file until final mmput(). mm->exe_file is still protected with mm->mmap_sem, because we want to change it via new sys_prctl(PR_SET_MM_EXE_FILE). Also via this syscall task can change its mm->exe_file and unpin mountpoint explicitly. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 44d3fc25f556..25ef49c1f2bd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -87,7 +87,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ -#define VM_EXECUTABLE 0x00001000 #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ @@ -1396,9 +1395,6 @@ extern void exit_mmap(struct mm_struct *); extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); -/* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */ -extern void added_exe_file_vma(struct mm_struct *mm); -extern void removed_exe_file_vma(struct mm_struct *mm); extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); -- cgit v1.2.3 From 0103bd16fb90bc741c7a03fd1ea4e8a505abad23 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:59 -0700 Subject: mm: prepare VM_DONTDUMP for using in drivers Rename VM_NODUMP into VM_DONTDUMP: this name matches other negative flags: VM_DONTEXPAND, VM_DONTCOPY. Currently this flag used only for sys_madvise. The next patch will use it for replacing the outdated flag VM_RESERVED. Also forbid madvise(MADV_DODUMP) for special kernel mappings VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 25ef49c1f2bd..dc08d558e058 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -102,7 +102,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_NODUMP 0x04000000 /* Do not include in the core dump */ +#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ -- cgit v1.2.3 From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:29:02 -0700 Subject: mm: kill vma flag VM_RESERVED and mm->reserved_vm counter A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA, currently it lost original meaning but still has some effects: | effect | alternative flags -+------------------------+--------------------------------------------- 1| account as reserved_vm | VM_IO 2| skip in core dump | VM_IO, VM_DONTDUMP 3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP 4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP This patch removes reserved_vm counter from mm_struct. Seems like nobody cares about it, it does not exported into userspace directly, it only reduces total_vm showed in proc. Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP. remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP. remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP. [akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup] Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index dc08d558e058..0514fe9d3c84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -96,7 +96,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ @@ -148,7 +147,7 @@ extern unsigned int kobjsize(const void *objp); * Special vmas that are non-mergable, non-mlock()able. * Note: mm/huge_memory.c VM_NO_THP depends on this definition. */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP) /* * mapping from the currently active vm_flags protection bits (the -- cgit v1.2.3 From 1fb3f8ca0e9222535a39b884cb67a34628411b9f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:12 -0700 Subject: mm: compaction: capture a suitable high-order page immediately when it is made available While compaction is migrating pages to free up large contiguous blocks for allocation it races with other allocation requests that may steal these blocks or break them up. This patch alters direct compaction to capture a suitable free page as soon as it becomes available to reduce this race. It uses similar logic to split_free_page() to ensure that watermarks are still obeyed. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0514fe9d3c84..5ddb11b2b4bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -442,6 +442,7 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); int split_free_page(struct page *page); +int capture_free_page(struct page *page, int alloc_order, int migratetype); /* * Compound pages have a destructor function. Provide a -- cgit v1.2.3 From 6b2dbba8b6ac4df26f72eda1e5ea7bab9f950e08 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:25 -0700 Subject: mm: replace vma prio_tree with an interval tree Implement an interval tree as a replacement for the VMA prio_tree. The algorithms are similar to lib/interval_tree.c; however that code can't be directly reused as the interval endpoints are not explicitly stored in the VMA. So instead, the common algorithm is moved into a template and the details (node type, how to get interval endpoints from the node, etc) are filled in using the C preprocessor. Once the interval tree functions are available, using them as a replacement to the VMA prio tree is a relatively simple, mechanical job. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5ddb11b2b4bb..0f671ef09eba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -1355,22 +1354,27 @@ extern void zone_pcp_reset(struct zone *zone); extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); -/* prio_tree.c */ -void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); -void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); -void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *); -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter); - -#define vma_prio_tree_foreach(vma, iter, root, begin, end) \ - for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \ - (vma = vma_prio_tree_next(vma, iter)); ) +/* interval_tree.c */ +void vma_interval_tree_add(struct vm_area_struct *vma, + struct vm_area_struct *old, + struct address_space *mapping); +void vma_interval_tree_insert(struct vm_area_struct *node, + struct rb_root *root); +void vma_interval_tree_remove(struct vm_area_struct *node, + struct rb_root *root); +struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root, + unsigned long start, unsigned long last); +struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, + unsigned long start, unsigned long last); + +#define vma_interval_tree_foreach(vma, root, start, last) \ + for (vma = vma_interval_tree_iter_first(root, start, last); \ + vma; vma = vma_interval_tree_iter_next(vma, start, last)) static inline void vma_nonlinear_insert(struct vm_area_struct *vma, struct list_head *list) { - vma->shared.vm_set.parent = NULL; - list_add_tail(&vma->shared.vm_set.list, list); + list_add_tail(&vma->shared.nonlinear, list); } /* mmap.c */ -- cgit v1.2.3 From 9826a516ff77c5820e591211e4f3e58ff36f46be Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:35 -0700 Subject: mm: interval tree updates Update the generic interval tree code that was introduced in "mm: replace vma prio_tree with an interval tree". Changes: - fixed 'endpoing' typo noticed by Andrew Morton - replaced include/linux/interval_tree_tmpl.h, which was used as a template (including it automatically defined the interval tree functions) with include/linux/interval_tree_generic.h, which only defines a preprocessor macro INTERVAL_TREE_DEFINE(), which itself defines the interval tree functions when invoked. Now that is a very long macro which is unfortunate, but it does make the usage sites (lib/interval_tree.c and mm/interval_tree.c) a bit nicer than previously. - make use of RB_DECLARE_CALLBACKS() in the INTERVAL_TREE_DEFINE() macro, instead of duplicating that code in the interval tree template. - replaced vma_interval_tree_add(), which was actually handling the nonlinear and interval tree cases, with vma_interval_tree_insert_after() which handles only the interval tree case and has an API that is more consistent with the other interval tree handling functions. The nonlinear case is now handled explicitly in kernel/fork.c dup_mmap(). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f671ef09eba..f1d9aaadb566 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1355,11 +1355,11 @@ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); /* interval_tree.c */ -void vma_interval_tree_add(struct vm_area_struct *vma, - struct vm_area_struct *old, - struct address_space *mapping); void vma_interval_tree_insert(struct vm_area_struct *node, struct rb_root *root); +void vma_interval_tree_insert_after(struct vm_area_struct *node, + struct vm_area_struct *prev, + struct rb_root *root); void vma_interval_tree_remove(struct vm_area_struct *node, struct rb_root *root); struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root, -- cgit v1.2.3 From bf181b9f9d8dfbba58b23441ad60d0bc33806d64 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:39 -0700 Subject: mm anon rmap: replace same_anon_vma linked list with an interval tree. When a large VMA (anon or private file mapping) is first touched, which will populate its anon_vma field, and then split into many regions through the use of mprotect(), the original anon_vma ends up linking all of the vmas on a linked list. This can cause rmap to become inefficient, as we have to walk potentially thousands of irrelevent vmas before finding the one a given anon page might fall into. By replacing the same_anon_vma linked list with an interval tree (where each avc's interval is determined by its vma's start and last pgoffs), we can make rmap efficient for this use case again. While the change is large, all of its pieces are fairly simple. Most places that were walking the same_anon_vma list were looking for a known pgoff, so they can just use the anon_vma_interval_tree_foreach() interval tree iterator instead. The exception here is ksm, where the page's index is not known. It would probably be possible to rework ksm so that the index would be known, but for now I have decided to keep things simple and just walk the entirety of the interval tree there. When updating vma's that already have an anon_vma assigned, we must take care to re-index the corresponding avc's on their interval tree. This is done through the use of anon_vma_interval_tree_pre_update_vma() and anon_vma_interval_tree_post_update_vma(), which remove the avc's from their interval tree before the update and re-insert them after the update. The anon_vma stays locked during the update, so there is no chance that rmap would miss the vmas that are being updated. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index f1d9aaadb566..0cdab4e0f814 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -20,6 +20,7 @@ struct mempolicy; struct anon_vma; +struct anon_vma_chain; struct file_ra_state; struct user_struct; struct writeback_control; @@ -1377,6 +1378,19 @@ static inline void vma_nonlinear_insert(struct vm_area_struct *vma, list_add_tail(&vma->shared.nonlinear, list); } +void anon_vma_interval_tree_insert(struct anon_vma_chain *node, + struct rb_root *root); +void anon_vma_interval_tree_remove(struct anon_vma_chain *node, + struct rb_root *root); +struct anon_vma_chain *anon_vma_interval_tree_iter_first( + struct rb_root *root, unsigned long start, unsigned long last); +struct anon_vma_chain *anon_vma_interval_tree_iter_next( + struct anon_vma_chain *node, unsigned long start, unsigned long last); + +#define anon_vma_interval_tree_foreach(avc, root, start, last) \ + for (avc = anon_vma_interval_tree_iter_first(root, start, last); \ + avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) + /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, -- cgit v1.2.3 From ed8ea8150182f8d715fceb3b175ef0a9ebacd872 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:45 -0700 Subject: mm: add CONFIG_DEBUG_VM_RB build option Add a CONFIG_DEBUG_VM_RB build option for the previously existing DEBUG_MM_RB code. Now that Andi Kleen modified it to avoid using recursive algorithms, we can expose it a bit more. Also extend this code to validate_mm() after stack expansion, and to check that the vma's start and last pgoffs have not changed since the nodes were inserted on the anon vma interval tree (as it is important that the nodes be reindexed after each such update). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0cdab4e0f814..0e6f9c9f2123 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1386,6 +1386,9 @@ struct anon_vma_chain *anon_vma_interval_tree_iter_first( struct rb_root *root, unsigned long start, unsigned long last); struct anon_vma_chain *anon_vma_interval_tree_iter_next( struct anon_vma_chain *node, unsigned long start, unsigned long last); +#ifdef CONFIG_DEBUG_VM_RB +void anon_vma_interval_tree_verify(struct anon_vma_chain *node); +#endif #define anon_vma_interval_tree_foreach(avc, root, start, last) \ for (avc = anon_vma_interval_tree_iter_first(root, start, last); \ -- cgit v1.2.3 From 38a76013ad809beb0b52f60d365c960d035bd83c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:50 -0700 Subject: mm: avoid taking rmap locks in move_ptes() During mremap(), the destination VMA is generally placed after the original vma in rmap traversal order: in move_vma(), we always have new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >= vma->vm_pgoff unless vma_merge() merged the new vma with an adjacent one. When the destination VMA is placed after the original in rmap traversal order, we can avoid taking the rmap locks in move_ptes(). Essentially, this reintroduces the optimization that had been disabled in "mm anon rmap: remove anon_vma_moveto_tail". The difference is that we don't try to impose the rmap traversal order; instead we just rely on things being in the desired order in the common case and fall back to taking locks in the uncommon case. Also we skip the i_mmap_mutex in addition to the anon_vma lock: in both cases, the vmas are traversed in increasing vm_pgoff order with ties resolved in tree insertion order. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e6f9c9f2123..0d5f823ce3fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1060,7 +1060,8 @@ vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len); + unsigned long new_addr, unsigned long len, + bool need_rmap_locks); extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr); @@ -1410,7 +1411,8 @@ extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, struct rb_node **, struct rb_node *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, - unsigned long addr, unsigned long len, pgoff_t pgoff); + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); extern int mm_take_all_locks(struct mm_struct *mm); -- cgit v1.2.3 From b12c4ad14ee0232ad47c2bef404b6d42a3578332 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:08 -0700 Subject: mm: page_alloc: use get_freepage_migratetype() instead of page_private() The page allocator uses set_page_private and page_private for handling migratetype when it frees page. Let's replace them with [set|get] _freepage_migratetype to make it more clear. Signed-off-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d5f823ce3fc..4ed5c7367b9b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -237,6 +237,18 @@ struct inode; #define page_private(page) ((page)->private) #define set_page_private(page, v) ((page)->private = (v)) +/* It's valid only if the page is free path or free_list */ +static inline void set_freepage_migratetype(struct page *page, int migratetype) +{ + set_page_private(page, migratetype); +} + +/* It's valid only if the page is free path or free_list */ +static inline int get_freepage_migratetype(struct page *page) +{ + return page_private(page); +} + /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) -- cgit v1.2.3 From 95e3441248053fc06bbb1dbbd34409a84211619e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:11 -0700 Subject: mm: remain migratetype in freed page The page allocator caches the pageblock information in page->private while it is in the PCP freelists but this is overwritten with the order of the page when freed to the buddy allocator. This patch stores the migratetype of the page in the page->index field so that it is available at all times when the page remain in free_list. This patch adds a new call site in __free_pages_ok so it might be overhead a bit but it's for high order allocation. So I believe damage isn't hurt. Signed-off-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ed5c7367b9b..b01e585ab4b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -240,13 +240,13 @@ struct inode; /* It's valid only if the page is free path or free_list */ static inline void set_freepage_migratetype(struct page *page, int migratetype) { - set_page_private(page, migratetype); + page->index = migratetype; } /* It's valid only if the page is free path or free_list */ static inline int get_freepage_migratetype(struct page *page) { - return page_private(page); + return page->index; } /* -- cgit v1.2.3 From 45cac65b0fcd287ebb877b141d40ba9bbe8e5da7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Oct 2012 16:32:19 -0700 Subject: readahead: fault retry breaks mmap file read random detection .fault now can retry. The retry can break state machine of .fault. In filemap_fault, if page is miss, ra->mmap_miss is increased. In the second try, since the page is in page cache now, ra->mmap_miss is decreased. And these are done in one fault, so we can't detect random mmap file access. Add a new flag to indicate .fault is tried once. In the second try, skip ra->mmap_miss decreasing. The filemap_fault state machine is ok with it. I only tested x86, didn't test other archs, but looks the change for other archs is obvious, but who knows :) Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index b01e585ab4b5..bcaab4e6fe91 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -161,6 +161,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ +#define FAULT_FLAG_TRIED 0x40 /* second try */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's -- cgit v1.2.3 From 7f1290f2f2a4d2c3f1b7ce8e87256e052ca23125 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Mon, 8 Oct 2012 16:33:06 -0700 Subject: mm: fix-up zone present pages I think zone->present_pages indicates pages that buddy system can management, it should be: zone->present_pages = spanned pages - absent pages - bootmem pages, but is now: zone->present_pages = spanned pages - absent pages - memmap pages. spanned pages: total size, including holes. absent pages: holes. bootmem pages: pages used in system boot, managed by bootmem allocator. memmap pages: pages used by page structs. This may cause zone->present_pages less than it should be. For example, numa node 1 has ZONE_NORMAL and ZONE_MOVABLE, it's memmap and other bootmem will be allocated from ZONE_MOVABLE, so ZONE_NORMAL's present_pages should be spanned pages - absent pages, but now it also minus memmap pages(free_area_init_core), which are actually allocated from ZONE_MOVABLE. When offlining all memory of a zone, this will cause zone->present_pages less than 0, because present_pages is unsigned long type, it is actually a very large integer, it indirectly caused zone->watermark[WMARK_MIN] becomes a large integer(setup_per_zone_wmarks()), than cause totalreserve_pages become a large integer(calculate_totalreserve_pages()), and finally cause memory allocating failure when fork process(__vm_enough_memory()). [root@localhost ~]# dmesg -bash: fork: Cannot allocate memory I think the bug described in http://marc.info/?l=linux-mm&m=134502182714186&w=2 is also caused by wrong zone present pages. This patch intends to fix-up zone->present_pages when memory are freed to buddy system on x86_64 and IA64 platforms. Signed-off-by: Jianguo Wu Signed-off-by: Jiang Liu Reported-by: Petr Tesarik Tested-by: Petr Tesarik Cc: "Luck, Tony" Cc: Mel Gorman Cc: Yinghai Lu Cc: Minchan Kim Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index bcaab4e6fe91..fa0680402738 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1684,5 +1684,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool page_is_guard(struct page *page) { return false; } #endif /* CONFIG_DEBUG_PAGEALLOC */ +extern void reset_zone_present_pages(void); +extern void fixup_zone_present_pages(int nid, unsigned long start_pfn, + unsigned long end_pfn); + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ -- cgit v1.2.3