From c7277090927a5e71871e799a355ed2940f6c8fc6 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 2 Dec 2013 11:24:19 +0000 Subject: security: shmem: implement kernel private shmem inodes We have a problem where the big_key key storage implementation uses a shmem backed inode to hold the key contents. Because of this detail of implementation LSM checks are being done between processes trying to read the keys and the tmpfs backed inode. The LSM checks are already being handled on the key interface level and should not be enforced at the inode level (since the inode is an implementation detail, not a part of the security model) This patch implements a new function shmem_kernel_file_setup() which returns the equivalent to shmem_file_setup() only the underlying inode has S_PRIVATE set. This means that all LSM checks for the inode in question are skipped. It should only be used for kernel internal operations where the inode is not exposed to userspace without proper LSM checking. It is possible that some other users of shmem_file_setup() should use the new interface, but this has not been explored. Reproducing this bug is a little bit difficult. The steps I used on Fedora are: (1) Turn off selinux enforcing: setenforce 0 (2) Create a huge key k=`dd if=/dev/zero bs=8192 count=1 | keyctl padd big_key test-key @s` (3) Access the key in another context: runcon system_u:system_r:httpd_t:s0-s0:c0.c1023 keyctl print $k >/dev/null (4) Examine the audit logs: ausearch -m AVC -i --subject httpd_t | audit2allow If the last command's output includes a line that looks like: allow httpd_t user_tmpfs_t:file { open read }; There was an inode check between httpd and the tmpfs filesystem. With this patch no such denial will be seen. (NOTE! you should clear your audit log if you have tested for this previously) (Please return you box to enforcing) Signed-off-by: Eric Paris Signed-off-by: David Howells cc: Hugh Dickins cc: linux-mm@kvack.org --- mm/shmem.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 8297623fcaed..902a14842b74 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2918,13 +2918,8 @@ static struct dentry_operations anon_ops = { .d_dname = simple_dname }; -/** - * shmem_file_setup - get an unlinked file living in tmpfs - * @name: name for dentry (to be seen in /proc//maps - * @size: size to be set for the file - * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size - */ -struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +static struct file *__shmem_file_setup(const char *name, loff_t size, + unsigned long flags, unsigned int i_flags) { struct file *res; struct inode *inode; @@ -2957,6 +2952,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags if (!inode) goto put_dentry; + inode->i_flags |= i_flags; d_instantiate(path.dentry, inode); inode->i_size = size; clear_nlink(inode); /* It is unlinked */ @@ -2977,6 +2973,32 @@ put_memory: shmem_unacct_size(flags, size); return res; } + +/** + * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be + * kernel internal. There will be NO LSM permission checks against the + * underlying inode. So users of this interface must do LSM checks at a + * higher layer. The one user is the big_key implementation. LSM checks + * are provided at the key level rather than the inode level. + * @name: name for dentry (to be seen in /proc//maps + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(name, size, flags, S_PRIVATE); +} + +/** + * shmem_file_setup - get an unlinked file living in tmpfs + * @name: name for dentry (to be seen in /proc//maps + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(name, size, flags, 0); +} EXPORT_SYMBOL_GPL(shmem_file_setup); /** -- cgit v1.2.3 From a0d8b00a3381f9d75764b3377590451cb0b4fe41 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Dec 2013 17:12:20 -0800 Subject: mm: memcg: do not declare OOM from __GFP_NOFAIL allocations Commit 84235de394d9 ("fs: buffer: move allocation failure loop into the allocator") started recognizing __GFP_NOFAIL in memory cgroups but forgot to disable the OOM killer. Any task that does not fail allocation will also not enter the OOM completion path. So don't declare an OOM state in this case or it'll be leaked and the task be able to bypass the limit until the next userspace-triggered page fault cleans up the OOM state. Reported-by: William Dauchy Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: David Rientjes Cc: [3.12.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f1a0ae6e11b8..e3aff0175d4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2696,6 +2696,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (unlikely(task_in_memcg_oom(current))) goto bypass; + if (gfp_mask & __GFP_NOFAIL) + oom = false; + /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the -- cgit v1.2.3 From 3592806cfa08b7cca968f793c33f8e9460bab395 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 12 Dec 2013 17:12:33 -0800 Subject: thp: move preallocated PTE page table on move_huge_pmd() Andrey Wagin reported crash on VM_BUG_ON() in pgtable_pmd_page_dtor() with fallowing backtrace: free_pgd_range+0x2bf/0x410 free_pgtables+0xce/0x120 unmap_region+0xe0/0x120 do_munmap+0x249/0x360 move_vma+0x144/0x270 SyS_mremap+0x3b9/0x510 system_call_fastpath+0x16/0x1b The crash can be reproduce with this test case: #define _GNU_SOURCE #include #include #include #define MB (1024 * 1024UL) #define GB (1024 * MB) int main(int argc, char **argv) { char *p; int i; p = mmap((void *) GB, 10 * MB, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); for (i = 0; i < 10 * MB; i += 4096) p[i] = 1; mremap(p, 10 * MB, 10 * MB, MREMAP_FIXED | MREMAP_MAYMOVE, 2 * GB); return 0; } Due to split PMD lock, we now store preallocated PTE tables for THP pages per-PMD table. It means we need to move them to other PMD table if huge PMD moved there. Signed-off-by: Kirill A. Shutemov Reported-by: Andrey Vagin Tested-by: Andrey Vagin Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bccd5a628ea6..33a5dc492810 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1481,8 +1481,18 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - if (new_ptl != old_ptl) + if (new_ptl != old_ptl) { + pgtable_t pgtable; + + /* + * Move preallocated PTE page table if new_pmd is on + * different PMD page table. + */ + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); + spin_unlock(new_ptl); + } spin_unlock(old_ptl); } out: -- cgit v1.2.3 From 96f1c58d853497a757463e0b57fed140d6858f3a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Dec 2013 17:12:34 -0800 Subject: mm: memcg: fix race condition between memcg teardown and swapin There is a race condition between a memcg being torn down and a swapin triggered from a different memcg of a page that was recorded to belong to the exiting memcg on swapout (with CONFIG_MEMCG_SWAP extension). The result is unreclaimable pages pointing to dead memcgs, which can lead to anything from endless loops in later memcg teardown (the page is charged to all hierarchical parents but is not on any LRU list) or crashes from following the dangling memcg pointer. Memcgs with tasks in them can not be torn down and usually charges don't show up in memcgs without tasks. Swapin with the CONFIG_MEMCG_SWAP extension is the notable exception because it charges the cgroup that was recorded as owner during swapout, which may be empty and in the process of being torn down when a task in another memcg triggers the swapin: teardown: swapin: lookup_swap_cgroup_id() rcu_read_lock() mem_cgroup_lookup() css_tryget() rcu_read_unlock() disable css_tryget() call_rcu() offline_css() reparent_charges() res_counter_charge() (hierarchical!) css_put() css_free() pc->mem_cgroup = dead memcg add page to dead lru Add a final reparenting step into css_free() to make sure any such raced charges are moved out of the memcg before it's finally freed. In the longer term it would be cleaner to have the css_tryget() and the res_counter charge under the same RCU lock section so that the charge reparenting is deferred until the last charge whose tryget succeeded is visible. But this will require more invasive changes that will be harder to evaluate and backport into stable, so better defer them to a separate change set. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e3aff0175d4c..f6a63f5b3827 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6355,6 +6355,42 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + /* + * XXX: css_offline() would be where we should reparent all + * memory to prepare the cgroup for destruction. However, + * memcg does not do css_tryget() and res_counter charging + * under the same RCU lock region, which means that charging + * could race with offlining. Offlining only happens to + * cgroups with no tasks in them but charges can show up + * without any tasks from the swapin path when the target + * memcg is looked up from the swapout record and not from the + * current task as it usually is. A race like this can leak + * charges and put pages with stale cgroup pointers into + * circulation: + * + * #0 #1 + * lookup_swap_cgroup_id() + * rcu_read_lock() + * mem_cgroup_lookup() + * css_tryget() + * rcu_read_unlock() + * disable css_tryget() + * call_rcu() + * offline_css() + * reparent_charges() + * res_counter_charge() + * css_put() + * css_free() + * pc->mem_cgroup = dead memcg + * add page to lru + * + * The bulk of the charges are still moved in offline_css() to + * avoid pinning a lot of pages in case a long-term reference + * like a swapout record is deferring the css_free() to long + * after offlining. But this makes sure we catch any charges + * made after offlining: + */ + mem_cgroup_reparent_charges(memcg); memcg_destroy_kmem(memcg); __mem_cgroup_free(memcg); -- cgit v1.2.3 From 1f14c1ac19aa45118054b6d5425873c5c7fc23a1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Dec 2013 17:12:35 -0800 Subject: mm: memcg: do not allow task about to OOM kill to bypass the limit Commit 4942642080ea ("mm: memcg: handle non-error OOM situations more gracefully") allowed tasks that already entered a memcg OOM condition to bypass the memcg limit on subsequent allocation attempts hoping this would expedite finishing the page fault and executing the kill. David Rientjes is worried that this breaks memcg isolation guarantees and since there is no evidence that the bypass actually speeds up fault processing just change it so that these subsequent charge attempts fail outright. The notable exception being __GFP_NOFAIL charges which are required to bypass the limit regardless. Signed-off-by: Johannes Weiner Reported-by: David Rientjes Acked-by: Michal Hocko Acked-bt: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f6a63f5b3827..bf5e89457149 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2694,7 +2694,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, goto bypass; if (unlikely(task_in_memcg_oom(current))) - goto bypass; + goto nomem; if (gfp_mask & __GFP_NOFAIL) oom = false; -- cgit v1.2.3