diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/cma.c | 10 | ||||
-rw-r--r-- | mm/debug.c | 7 | ||||
-rw-r--r-- | mm/filemap.c | 220 | ||||
-rw-r--r-- | mm/hmm.c | 2 | ||||
-rw-r--r-- | mm/kasan/init.c | 10 | ||||
-rw-r--r-- | mm/kasan/kasan.h | 5 | ||||
-rw-r--r-- | mm/memblock.c | 371 | ||||
-rw-r--r-- | mm/memory.c | 11 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 57 | ||||
-rw-r--r-- | mm/mempolicy.c | 40 | ||||
-rw-r--r-- | mm/migrate.c | 11 | ||||
-rw-r--r-- | mm/page_alloc.c | 12 | ||||
-rw-r--r-- | mm/page_ext.c | 2 | ||||
-rw-r--r-- | mm/page_isolation.c | 51 | ||||
-rw-r--r-- | mm/percpu.c | 84 | ||||
-rw-r--r-- | mm/slab.c | 2 | ||||
-rw-r--r-- | mm/slab.h | 3 | ||||
-rw-r--r-- | mm/slab_common.c | 2 | ||||
-rw-r--r-- | mm/slub.c | 5 | ||||
-rw-r--r-- | mm/sparse.c | 29 |
20 files changed, 526 insertions, 408 deletions
@@ -327,16 +327,14 @@ int __init cma_declare_contiguous(phys_addr_t base, * memory in case of failure. */ if (base < highmem_start && limit > highmem_start) { - addr = memblock_alloc_range(size, alignment, - highmem_start, limit, - MEMBLOCK_NONE); + addr = memblock_phys_alloc_range(size, alignment, + highmem_start, limit); limit = highmem_start; } if (!addr) { - addr = memblock_alloc_range(size, alignment, base, - limit, - MEMBLOCK_NONE); + addr = memblock_phys_alloc_range(size, alignment, base, + limit); if (!addr) { ret = -ENOMEM; goto err; diff --git a/mm/debug.c b/mm/debug.c index 1611cf00a137..eee9c221280c 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -79,7 +79,7 @@ void __dump_page(struct page *page, const char *reason) pr_warn("ksm "); else if (mapping) { pr_warn("%ps ", mapping->a_ops); - if (mapping->host->i_dentry.first) { + if (mapping->host && mapping->host->i_dentry.first) { struct dentry *dentry; dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); pr_warn("name:\"%pd\" ", dentry); @@ -137,7 +137,7 @@ void dump_mm(const struct mm_struct *mm) "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" - "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" + "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" @@ -168,7 +168,8 @@ void dump_mm(const struct mm_struct *mm) mm_pgtables_bytes(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, - mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, + (u64)atomic64_read(&mm->pinned_vm), + mm->data_vm, mm->exec_vm, mm->stack_vm, mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, diff --git a/mm/filemap.c b/mm/filemap.c index a3b4021c448f..d78f577baef2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1587,6 +1587,9 @@ EXPORT_SYMBOL(find_lock_entry); * @gfp_mask and added to the page cache and the VM's LRU * list. The page is returned locked and with an increased * refcount. + * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do + * its own locking dance if the page is already in cache, or unlock the page + * before returning if we had to add the page to pagecache. * * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. @@ -1641,7 +1644,7 @@ no_page: if (!page) return NULL; - if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) + if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; /* Init accessed so avoid atomic mark_page_accessed later */ @@ -1655,6 +1658,13 @@ no_page: if (err == -EEXIST) goto repeat; } + + /* + * add_to_page_cache_lru locks the page, and for mmap we expect + * an unlocked page. + */ + if (page && (fgp_flags & FGP_FOR_MMAP)) + unlock_page(page); } return page; @@ -2379,64 +2389,98 @@ out: EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU -/** - * page_cache_read - adds requested page to the page cache if not already there - * @file: file to read - * @offset: page index - * @gfp_mask: memory allocation flags - * - * This adds the requested page to the page cache if it isn't already there, - * and schedules an I/O to read in its contents from disk. - * - * Return: %0 on success, negative error code otherwise. - */ -static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) +#define MMAP_LOTSAMISS (100) +static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, + struct file *fpin) { - struct address_space *mapping = file->f_mapping; - struct page *page; - int ret; + int flags = vmf->flags; - do { - page = __page_cache_alloc(gfp_mask); - if (!page) - return -ENOMEM; + if (fpin) + return fpin; - ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask); - if (ret == 0) - ret = mapping->a_ops->readpage(file, page); - else if (ret == -EEXIST) - ret = 0; /* losing race to add is OK */ + /* + * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or + * anything, so we only pin the file and drop the mmap_sem if only + * FAULT_FLAG_ALLOW_RETRY is set. + */ + if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == + FAULT_FLAG_ALLOW_RETRY) { + fpin = get_file(vmf->vma->vm_file); + up_read(&vmf->vma->vm_mm->mmap_sem); + } + return fpin; +} - put_page(page); +/* + * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem + * @vmf - the vm_fault for this fault. + * @page - the page to lock. + * @fpin - the pointer to the file we may pin (or is already pinned). + * + * This works similar to lock_page_or_retry in that it can drop the mmap_sem. + * It differs in that it actually returns the page locked if it returns 1 and 0 + * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin + * will point to the pinned file and needs to be fput()'ed at a later point. + */ +static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, + struct file **fpin) +{ + if (trylock_page(page)) + return 1; - } while (ret == AOP_TRUNCATED_PAGE); + /* + * NOTE! This will make us return with VM_FAULT_RETRY, but with + * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT + * is supposed to work. We have way too many special cases.. + */ + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) + return 0; - return ret; + *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); + if (vmf->flags & FAULT_FLAG_KILLABLE) { + if (__lock_page_killable(page)) { + /* + * We didn't have the right flags to drop the mmap_sem, + * but all fault_handlers only check for fatal signals + * if we return VM_FAULT_RETRY, so we need to drop the + * mmap_sem here and return 0 if we don't have a fpin. + */ + if (*fpin == NULL) + up_read(&vmf->vma->vm_mm->mmap_sem); + return 0; + } + } else + __lock_page(page); + return 1; } -#define MMAP_LOTSAMISS (100) /* - * Synchronous readahead happens when we don't even find - * a page in the page cache at all. + * Synchronous readahead happens when we don't even find a page in the page + * cache at all. We don't want to perform IO under the mmap sem, so if we have + * to drop the mmap sem we return the file that was pinned in order for us to do + * that. If we didn't pin a file then we return NULL. The file that is + * returned needs to be fput()'ed when we're done with it. */ -static void do_sync_mmap_readahead(struct vm_area_struct *vma, - struct file_ra_state *ra, - struct file *file, - pgoff_t offset) +static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) { + struct file *file = vmf->vma->vm_file; + struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + struct file *fpin = NULL; + pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ - if (vma->vm_flags & VM_RAND_READ) - return; + if (vmf->vma->vm_flags & VM_RAND_READ) + return fpin; if (!ra->ra_pages) - return; + return fpin; - if (vma->vm_flags & VM_SEQ_READ) { + if (vmf->vma->vm_flags & VM_SEQ_READ) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); - return; + return fpin; } /* Avoid banging the cache line if not needed */ @@ -2448,37 +2492,44 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, * stop bothering with read-ahead. It will only hurt. */ if (ra->mmap_miss > MMAP_LOTSAMISS) - return; + return fpin; /* * mmap read-around */ + fpin = maybe_unlock_mmap_for_io(vmf, fpin); ra->start = max_t(long, 0, offset - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ra_submit(ra, mapping, file); + return fpin; } /* * Asynchronous readahead happens when we find the page and PG_readahead, - * so we want to possibly extend the readahead further.. + * so we want to possibly extend the readahead further. We return the file that + * was pinned if we have to drop the mmap_sem in order to do IO. */ -static void do_async_mmap_readahead(struct vm_area_struct *vma, - struct file_ra_state *ra, - struct file *file, - struct page *page, - pgoff_t offset) +static struct file *do_async_mmap_readahead(struct vm_fault *vmf, + struct page *page) { + struct file *file = vmf->vma->vm_file; + struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + struct file *fpin = NULL; + pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ - if (vma->vm_flags & VM_RAND_READ) - return; + if (vmf->vma->vm_flags & VM_RAND_READ) + return fpin; if (ra->mmap_miss > 0) ra->mmap_miss--; - if (PageReadahead(page)) + if (PageReadahead(page)) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_readahead(mapping, ra, file, page, offset, ra->ra_pages); + } + return fpin; } /** @@ -2510,6 +2561,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; + struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; @@ -2531,23 +2583,26 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - do_async_mmap_readahead(vmf->vma, ra, file, page, offset); + fpin = do_async_mmap_readahead(vmf, page); } else if (!page) { /* No page in the page cache at all */ - do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; + fpin = do_sync_mmap_readahead(vmf); retry_find: - page = find_get_page(mapping, offset); - if (!page) - goto no_cached_page; + page = pagecache_get_page(mapping, offset, + FGP_CREAT|FGP_FOR_MMAP, + vmf->gfp_mask); + if (!page) { + if (fpin) + goto out_retry; + return vmf_error(-ENOMEM); + } } - if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { - put_page(page); - return ret | VM_FAULT_RETRY; - } + if (!lock_page_maybe_drop_mmap(vmf, page, &fpin)) + goto out_retry; /* Did it get truncated? */ if (unlikely(page->mapping != mapping)) { @@ -2565,6 +2620,16 @@ retry_find: goto page_not_uptodate; /* + * We've made it this far and we had to drop our mmap_sem, now is the + * time to return to the upper layer and have it re-find the vma and + * redo the fault. + */ + if (fpin) { + unlock_page(page); + goto out_retry; + } + + /* * Found the page and have a reference on it. * We must recheck i_size under page lock. */ @@ -2578,28 +2643,6 @@ retry_find: vmf->page = page; return ret | VM_FAULT_LOCKED; -no_cached_page: - /* - * We're only likely to ever get here if MADV_RANDOM is in - * effect. - */ - error = page_cache_read(file, offset, vmf->gfp_mask); - - /* - * The page we want has now been added to the page cache. - * In the unlikely event that someone removed it in the - * meantime, we'll just come back here and read it again. - */ - if (error >= 0) - goto retry_find; - - /* - * An error return from page_cache_read can result if the - * system is low on memory, or a problem occurs while trying - * to schedule I/O. - */ - return vmf_error(error); - page_not_uptodate: /* * Umm, take care of errors if the page isn't up-to-date. @@ -2608,12 +2651,15 @@ page_not_uptodate: * and we need to check for errors. */ ClearPageError(page); + fpin = maybe_unlock_mmap_for_io(vmf, fpin); error = mapping->a_ops->readpage(file, page); if (!error) { wait_on_page_locked(page); if (!PageUptodate(page)) error = -EIO; } + if (fpin) + goto out_retry; put_page(page); if (!error || error == AOP_TRUNCATED_PAGE) @@ -2622,6 +2668,18 @@ page_not_uptodate: /* Things didn't work out. Return zero to tell the mm layer so. */ shrink_readahead_size_eio(file, ra); return VM_FAULT_SIGBUS; + +out_retry: + /* + * We dropped the mmap_sem, we need to return to the fault handler to + * re-find the vma and come back and find our hopefully still populated + * page. + */ + if (page) + put_page(page); + if (fpin) + fput(fpin); + return ret | VM_FAULT_RETRY; } EXPORT_SYMBOL(filemap_fault); @@ -990,7 +990,7 @@ static void hmm_devmem_ref_kill(struct percpu_ref *ref) percpu_ref_kill(ref); } -static int hmm_devmem_fault(struct vm_area_struct *vma, +static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma, unsigned long addr, const struct page *page, unsigned int flags, diff --git a/mm/kasan/init.c b/mm/kasan/init.c index fcaa1ca03175..ce45c491ebcd 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -83,8 +83,14 @@ static inline bool kasan_early_shadow_page_entry(pte_t pte) static __init void *early_alloc(size_t size, int node) { - return memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), - MEMBLOCK_ALLOC_ACCESSIBLE, node); + void *ptr = memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), + MEMBLOCK_ALLOC_ACCESSIBLE, node); + + if (!ptr) + panic("%s: Failed to allocate %zu bytes align=%zx nid=%d from=%llx\n", + __func__, size, size, node, (u64)__pa(MAX_DMA_ADDRESS)); + + return ptr; } static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 3e0c11f7d7a1..3ce956efa0cb 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -163,7 +163,10 @@ static inline u8 random_tag(void) #endif #ifndef arch_kasan_set_tag -#define arch_kasan_set_tag(addr, tag) ((void *)(addr)) +static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) +{ + return addr; +} #endif #ifndef arch_kasan_reset_tag #define arch_kasan_reset_tag(addr) ((void *)(addr)) diff --git a/mm/memblock.c b/mm/memblock.c index 470601115892..e7665cf914b1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -76,8 +76,19 @@ * :c:func:`memblock_set_node`. The :c:func:`memblock_add_node` * performs such an assignment directly. * - * Once memblock is setup the memory can be allocated using either - * memblock or bootmem APIs. + * Once memblock is setup the memory can be allocated using one of the + * API variants: + * + * * :c:func:`memblock_phys_alloc*` - these functions return the + * **physical** address of the allocated memory + * * :c:func:`memblock_alloc*` - these functions return the **virtual** + * address of the allocated memory. + * + * Note, that both API variants use implict assumptions about allowed + * memory ranges and the fallback methods. Consult the documentation + * of :c:func:`memblock_alloc_internal` and + * :c:func:`memblock_alloc_range_nid` functions for more elaboarte + * description. * * As the system boot progresses, the architecture specific * :c:func:`mem_init` function frees all the memory to the buddy page @@ -132,7 +143,7 @@ static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; -enum memblock_flags __init_memblock choose_memblock_flags(void) +static enum memblock_flags __init_memblock choose_memblock_flags(void) { return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } @@ -261,7 +272,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * Return: * Found address on success, 0 on failure. */ -phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, +static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, int nid, enum memblock_flags flags) @@ -435,17 +446,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, else in_slab = &memblock_reserved_in_slab; - /* Try to find some space for it. - * - * WARNING: We assume that either slab_is_available() and we use it or - * we use MEMBLOCK for allocations. That means that this is unsafe to - * use when bootmem is currently active (unless bootmem itself is - * implemented on top of MEMBLOCK which isn't the case yet) - * - * This should however not be an issue for now, as we currently only - * call into MEMBLOCK while it's still active, or much later when slab - * is active for memory hotplug operations - */ + /* Try to find some space for it */ if (use_slab) { new_array = kmalloc(new_size, GFP_KERNEL); addr = new_array ? __pa(new_array) : 0; @@ -858,11 +859,14 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base, if (ret) return ret; - for (i = start_rgn; i < end_rgn; i++) + for (i = start_rgn; i < end_rgn; i++) { + struct memblock_region *r = &type->regions[i]; + if (set) - memblock_set_region_flags(&type->regions[i], flag); + r->flags |= flag; else - memblock_clear_region_flags(&type->regions[i], flag); + r->flags &= ~flag; + } memblock_merge_regions(type); return 0; @@ -962,8 +966,31 @@ void __init_memblock __next_reserved_mem_region(u64 *idx, *idx = ULLONG_MAX; } +static bool should_skip_region(struct memblock_region *m, int nid, int flags) +{ + int m_nid = memblock_get_region_node(m); + + /* only memory regions are associated with nodes, check it */ + if (nid != NUMA_NO_NODE && nid != m_nid) + return true; + + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + return true; + + /* if we want mirror memory skip non-mirror memory regions */ + if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) + return true; + + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + return true; + + return false; +} + /** - * __next__mem_range - next function for for_each_free_mem_range() etc. + * __next_mem_range - next function for for_each_free_mem_range() etc. * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes * @flags: pick from blocks based on memory attributes @@ -1009,20 +1036,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; int m_nid = memblock_get_region_node(m); - /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != m_nid) - continue; - - /* skip hotpluggable memory regions if needed */ - if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) - continue; - - /* if we want mirror memory skip non-mirror memory regions */ - if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) - continue; - - /* skip nomap memory unless we were asked for it explicitly */ - if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + if (should_skip_region(m, nid, flags)) continue; if (!type_b) { @@ -1126,20 +1140,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; int m_nid = memblock_get_region_node(m); - /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != m_nid) - continue; - - /* skip hotpluggable memory regions if needed */ - if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) - continue; - - /* if we want mirror memory skip non-mirror memory regions */ - if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) - continue; - - /* skip nomap memory unless we were asked for it explicitly */ - if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + if (should_skip_region(m, nid, flags)) continue; if (!type_b) { @@ -1255,94 +1256,123 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +/** + * memblock_alloc_range_nid - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @start: the lower bound of the memory region to allocate (phys address) + * @end: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * The allocation is performed from memory region limited by + * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE. + * + * If the specified node can not hold the requested memory the + * allocation falls back to any node in the system + * + * For systems with memory mirroring, the allocation is attempted first + * from the regions with mirroring enabled and then retried from any + * memory region. + * + * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for + * allocated boot memory block, so that it is never reported as leaks. + * + * Return: + * Physical address of allocated memory block on success, %0 on failure. + */ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid, - enum memblock_flags flags) + phys_addr_t end, int nid) { + enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + if (!align) { /* Can't use WARNs this early in boot on powerpc */ dump_stack(); align = SMP_CACHE_BYTES; } + if (end > memblock.current_limit) + end = memblock.current_limit; + +again: found = memblock_find_in_range_node(size, align, start, end, nid, flags); - if (found && !memblock_reserve(found, size)) { - /* - * The min_count is set to 0 so that memblock allocations are - * never reported as leaks. - */ - kmemleak_alloc_phys(found, size, 0, 0); - return found; - } - return 0; -} - -phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, - phys_addr_t start, phys_addr_t end, - enum memblock_flags flags) -{ - return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, - flags); -} - -phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, - phys_addr_t align, phys_addr_t max_addr, - int nid, enum memblock_flags flags) -{ - return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags); -} - -phys_addr_t __init memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) -{ - enum memblock_flags flags = choose_memblock_flags(); - phys_addr_t ret; + if (found && !memblock_reserve(found, size)) + goto done; -again: - ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, - nid, flags); + if (nid != NUMA_NO_NODE) { + found = memblock_find_in_range_node(size, align, start, + end, NUMA_NO_NODE, + flags); + if (found && !memblock_reserve(found, size)) + goto done; + } - if (!ret && (flags & MEMBLOCK_MIRROR)) { + if (flags & MEMBLOCK_MIRROR) { flags &= ~MEMBLOCK_MIRROR; + pr_warn("Could not allocate %pap bytes of mirrored memory\n", + &size); goto again; } - return ret; -} - -phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) -{ - return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE, - MEMBLOCK_NONE); -} -phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) -{ - phys_addr_t alloc; - - alloc = __memblock_alloc_base(size, align, max_addr); + return 0; - if (alloc == 0) - panic("ERROR: Failed to allocate %pa bytes below %pa.\n", - &size, &max_addr); +done: + /* Skip kmemleak for kasan_init() due to high volume. */ + if (end != MEMBLOCK_ALLOC_KASAN) + /* + * The min_count is set to 0 so that memblock allocated + * blocks are never reported as leaks. This is because many + * of these blocks are only referred via the physical + * address which is not looked up by kmemleak. + */ + kmemleak_alloc_phys(found, size, 0, 0); - return alloc; + return found; } -phys_addr_t __init memblock_phys_alloc(phys_addr_t size, phys_addr_t align) +/** + * memblock_phys_alloc_range - allocate a memory block inside specified range + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @start: the lower bound of the memory region to allocate (physical address) + * @end: the upper bound of the memory region to allocate (physical address) + * + * Allocate @size bytes in the between @start and @end. + * + * Return: physical address of the allocated memory block on success, + * %0 on failure. + */ +phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size, + phys_addr_t align, + phys_addr_t start, + phys_addr_t end) { - return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); } +/** + * memblock_phys_alloc_try_nid - allocate a memory block from specified MUMA node + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Allocates memory block from the specified NUMA node. If the node + * has no available memory, attempts to allocated from any node in the + * system. + * + * Return: physical address of the allocated memory block on success, + * %0 on failure. + */ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) { - phys_addr_t res = memblock_phys_alloc_nid(size, align, nid); - - if (res) - return res; - return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); + return memblock_alloc_range_nid(size, align, 0, + MEMBLOCK_ALLOC_ACCESSIBLE, nid); } /** @@ -1353,19 +1383,13 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali * @max_addr: the upper bound of the memory region to allocate (phys address) * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * The @min_addr limit is dropped if it can not be satisfied and the allocation - * will fall back to memory below @min_addr. Also, allocation may fall back - * to any node in the system if the specified node can not - * hold the requested memory. + * Allocates memory block using memblock_alloc_range_nid() and + * converts the returned physical address to virtual. * - * The allocation is performed from memory region limited by - * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE. - * - * The phys address of allocated boot memory block is converted to virtual and - * allocated memory is reset to 0. - * - * In addition, function sets the min_count to 0 using kmemleak_alloc for - * allocated boot memory block, so that it is never reported as leaks. + * The @min_addr limit is dropped if it can not be satisfied and the allocation + * will fall back to memory below @min_addr. Other constraints, such + * as node and mirrored memory will be handled again in + * memblock_alloc_range_nid(). * * Return: * Virtual address of allocated memory block on success, NULL on failure. @@ -1376,11 +1400,6 @@ static void * __init memblock_alloc_internal( int nid) { phys_addr_t alloc; - void *ptr; - enum memblock_flags flags = choose_memblock_flags(); - - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; /* * Detect any accidental use of these APIs after slab is ready, as at @@ -1390,54 +1409,16 @@ static void * __init memblock_alloc_internal( if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, nid); - if (!align) { - dump_stack(); - align = SMP_CACHE_BYTES; - } + alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid); - if (max_addr > memblock.current_limit) - max_addr = memblock.current_limit; -again: - alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, - nid, flags); - if (alloc && !memblock_reserve(alloc, size)) - goto done; + /* retry allocation without lower limit */ + if (!alloc && min_addr) + alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid); - if (nid != NUMA_NO_NODE) { - alloc = memblock_find_in_range_node(size, align, min_addr, - max_addr, NUMA_NO_NODE, - flags); - if (alloc && !memblock_reserve(alloc, size)) - goto done; - } - - if (min_addr) { - min_addr = 0; - goto again; - } + if (!alloc) + return NULL; - if (flags & MEMBLOCK_MIRROR) { - flags &= ~MEMBLOCK_MIRROR; - pr_warn("Could not allocate %pap bytes of mirrored memory\n", - &size); - goto again; - } - - return NULL; -done: - ptr = phys_to_virt(alloc); - - /* Skip kmemleak for kasan_init() due to high volume. */ - if (max_addr != MEMBLOCK_ALLOC_KASAN) - /* - * The min_count is set to 0 so that bootmem allocated - * blocks are never reported as leaks. This is because many - * of these blocks are only referred via the physical - * address which is not looked up by kmemleak. - */ - kmemleak_alloc(ptr, size, 0, 0); - - return ptr; + return phys_to_virt(alloc); } /** @@ -1479,7 +1460,7 @@ void * __init memblock_alloc_try_nid_raw( } /** - * memblock_alloc_try_nid_nopanic - allocate boot memory block + * memblock_alloc_try_nid - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation @@ -1495,42 +1476,6 @@ void * __init memblock_alloc_try_nid_raw( * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -void * __init memblock_alloc_try_nid_nopanic( - phys_addr_t size, phys_addr_t align, - phys_addr_t min_addr, phys_addr_t max_addr, - int nid) -{ - void *ptr; - - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", - __func__, (u64)size, (u64)align, nid, &min_addr, - &max_addr, (void *)_RET_IP_); - - ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid); - if (ptr) - memset(ptr, 0, size); - return ptr; -} - -/** - * memblock_alloc_try_nid - allocate boot memory block with panicking - * @size: size of memory block to be allocated in bytes - * @align: alignment of the region and block's size - * @min_addr: the lower bound of the memory region from where the allocation - * is preferred (phys address) - * @max_addr: the upper bound of the memory region from where the allocation - * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to - * allocate only from memory limited by memblock.current_limit value - * @nid: nid of the free area to find, %NUMA_NO_NODE for any node - * - * Public panicking version of memblock_alloc_try_nid_nopanic() - * which provides debug information (including caller info), if enabled, - * and panics if the request can not be satisfied. - * - * Return: - * Virtual address of allocated memory block on success, NULL on failure. - */ void * __init memblock_alloc_try_nid( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, @@ -1543,24 +1488,20 @@ void * __init memblock_alloc_try_nid( &max_addr, (void *)_RET_IP_); ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); - if (ptr) { + if (ptr) memset(ptr, 0, size); - return ptr; - } - panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa\n", - __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr); - return NULL; + return ptr; } /** - * __memblock_free_late - free bootmem block pages directly to buddy allocator + * __memblock_free_late - free pages directly to buddy allocator * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * - * This is only useful when the bootmem allocator has already been torn + * This is only useful when the memblock allocator has already been torn * down, but we are still initializing the system. Pages are released directly - * to the buddy allocator, no bootmem metadata is updated because it is gone. + * to the buddy allocator. */ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) { diff --git a/mm/memory.c b/mm/memory.c index 47fe250307c7..ab650c21bccd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1549,10 +1549,12 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); goto out_unlock; } - entry = *pte; - goto out_mkwrite; - } else - goto out_unlock; + entry = pte_mkyoung(*pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (ptep_set_access_flags(vma, addr, pte, entry, 1)) + update_mmu_cache(vma, addr, pte); + } + goto out_unlock; } /* Ok, finally just insert the thing.. */ @@ -1561,7 +1563,6 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, else entry = pte_mkspecial(pfn_t_pte(pfn, prot)); -out_mkwrite: if (mkwrite) { entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6b05576fb4ec..0082d699be94 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -96,27 +96,29 @@ void mem_hotplug_done(void) cpus_read_unlock(); } +u64 max_mem_size = U64_MAX; + /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { - struct resource *res, *conflict; - res = kzalloc(sizeof(struct resource), GFP_KERNEL); - if (!res) - return ERR_PTR(-ENOMEM); - - res->name = "System RAM"; - res->start = start; - res->end = start + size - 1; - res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - conflict = request_resource_conflict(&iomem_resource, res); - if (conflict) { - if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { - pr_debug("Device unaddressable memory block " - "memory hotplug at %#010llx !\n", - (unsigned long long)start); - } - pr_debug("System RAM resource %pR cannot be added\n", res); - kfree(res); + struct resource *res; + unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + char *resource_name = "System RAM"; + + if (start + size > max_mem_size) + return ERR_PTR(-E2BIG); + + /* + * Request ownership of the new memory range. This might be + * a child of an existing resource that was present but + * not marked as busy. + */ + res = __request_region(&iomem_resource, start, size, + resource_name, flags); + + if (!res) { + pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n", + start, start + size); return ERR_PTR(-EEXIST); } return res; @@ -1574,7 +1576,7 @@ static int __ref __offline_pages(unsigned long start_pfn, { unsigned long pfn, nr_pages; long offlined_pages; - int ret, node; + int ret, node, nr_isolate_pageblock; unsigned long flags; unsigned long valid_start, valid_end; struct zone *zone; @@ -1600,10 +1602,11 @@ static int __ref __offline_pages(unsigned long start_pfn, ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, SKIP_HWPOISON | REPORT_FAILURE); - if (ret) { + if (ret < 0) { reason = "failure to isolate range"; goto failed_removal; } + nr_isolate_pageblock = ret; arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; @@ -1655,8 +1658,16 @@ static int __ref __offline_pages(unsigned long start_pfn, /* Ok, all of our target is isolated. We cannot do rollback at this point. */ offline_isolated_pages(start_pfn, end_pfn); - /* reset pagetype flags and makes migrate type to be MOVABLE */ - undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + + /* + * Onlining will reset pagetype flags and makes migrate type + * MOVABLE, so just need to decrease the number of isolated + * pageblocks zone counter here. + */ + spin_lock_irqsave(&zone->lock, flags); + zone->nr_isolate_pageblock -= nr_isolate_pageblock; + spin_unlock_irqrestore(&zone->lock, flags); + /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); zone->present_pages -= offlined_pages; @@ -1688,12 +1699,12 @@ static int __ref __offline_pages(unsigned long start_pfn, failed_removal_isolated: undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + memory_notify(MEM_CANCEL_OFFLINE, &arg); failed_removal: pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", (unsigned long long) start_pfn << PAGE_SHIFT, ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, reason); - memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ mem_hotplug_done(); return ret; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index af171ccb56a2..2219e747df49 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -428,6 +428,13 @@ static inline bool queue_pages_required(struct page *page, return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } +/* + * queue_pages_pmd() has three possible return values: + * 1 - pages are placed on the right node or queued successfully. + * 0 - THP was split. + * -EIO - is migration entry or MPOL_MF_STRICT was specified and an existing + * page was already on a node that does not follow the policy. + */ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -437,7 +444,7 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long flags; if (unlikely(is_pmd_migration_entry(*pmd))) { - ret = 1; + ret = -EIO; goto unlock; } page = pmd_page(*pmd); @@ -454,8 +461,15 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, ret = 1; flags = qp->flags; /* go to thp migration */ - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + if (!vma_migratable(walk->vma)) { + ret = -EIO; + goto unlock; + } + migrate_page_add(page, qp->pagelist, flags); + } else + ret = -EIO; unlock: spin_unlock(ptl); out: @@ -480,8 +494,10 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { ret = queue_pages_pmd(pmd, ptl, addr, end, walk); - if (ret) + if (ret > 0) return 0; + else if (ret < 0) + return ret; } if (pmd_trans_unstable(pmd)) @@ -502,11 +518,16 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, continue; if (!queue_pages_required(page, qp)) continue; - migrate_page_add(page, qp->pagelist, flags); + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + if (!vma_migratable(vma)) + break; + migrate_page_add(page, qp->pagelist, flags); + } else + break; } pte_unmap_unlock(pte - 1, ptl); cond_resched(); - return 0; + return addr != end ? -EIO : 0; } static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, @@ -576,7 +597,12 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; - if (!vma_migratable(vma)) + /* + * Need check MPOL_MF_STRICT to return -EIO if possible + * regardless of vma_migratable + */ + if (!vma_migratable(vma) && + !(flags & MPOL_MF_STRICT)) return 1; if (endvma > end) @@ -603,7 +629,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, } /* queue pages from current vma */ - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & MPOL_MF_VALID) return 0; return 1; } diff --git a/mm/migrate.c b/mm/migrate.c index ac6f4939bb59..663a5449367a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -248,10 +248,8 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, pte = swp_entry_to_pte(entry); } else if (is_device_public_page(new)) { pte = pte_mkdevmap(pte); - flush_dcache_page(new); } - } else - flush_dcache_page(new); + } #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { @@ -995,6 +993,13 @@ static int move_to_new_page(struct page *newpage, struct page *page, */ if (!PageMappingFlags(page)) page->mapping = NULL; + + if (unlikely(is_zone_device_page(newpage))) { + if (is_device_public_page(newpage)) + flush_dcache_page(newpage); + } else + flush_dcache_page(newpage); + } out: return rc; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3eb01dedfb50..d96ca5bc555b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6445,8 +6445,8 @@ static void __ref setup_usemap(struct pglist_data *pgdat, zone->pageblock_flags = NULL; if (usemapsize) { zone->pageblock_flags = - memblock_alloc_node_nopanic(usemapsize, - pgdat->node_id); + memblock_alloc_node(usemapsize, SMP_CACHE_BYTES, + pgdat->node_id); if (!zone->pageblock_flags) panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", usemapsize, zone->name, pgdat->node_id); @@ -6679,7 +6679,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = memblock_alloc_node_nopanic(size, pgdat->node_id); + map = memblock_alloc_node(size, SMP_CACHE_BYTES, + pgdat->node_id); if (!map) panic("Failed to allocate %ld bytes for node %d memory map\n", size, pgdat->node_id); @@ -7959,8 +7960,7 @@ void *__init alloc_large_system_hash(const char *tablename, size = bucketsize << log2qty; if (flags & HASH_EARLY) { if (flags & HASH_ZERO) - table = memblock_alloc_nopanic(size, - SMP_CACHE_BYTES); + table = memblock_alloc(size, SMP_CACHE_BYTES); else table = memblock_alloc_raw(size, SMP_CACHE_BYTES); @@ -8233,7 +8233,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = start_isolate_page_range(pfn_max_align_down(start), pfn_max_align_up(end), migratetype, 0); - if (ret) + if (ret < 0) return ret; /* diff --git a/mm/page_ext.c b/mm/page_ext.c index ab4244920e0f..d8f1aca4ad43 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -161,7 +161,7 @@ static int __init alloc_node_page_ext(int nid) table_size = get_entry_size() * nr_pages; - base = memblock_alloc_try_nid_nopanic( + base = memblock_alloc_try_nid( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!base) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index ce323e56b34d..019280712e1b 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -59,7 +59,8 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. */ - if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, flags)) + if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, + isol_flags)) ret = 0; /* @@ -160,27 +161,36 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) return NULL; } -/* - * start_isolate_page_range() -- make page-allocation-type of range of pages - * to be MIGRATE_ISOLATE. - * @start_pfn: The lower PFN of the range to be isolated. - * @end_pfn: The upper PFN of the range to be isolated. - * @migratetype: migrate type to set in error recovery. +/** + * start_isolate_page_range() - make page-allocation-type of range of pages to + * be MIGRATE_ISOLATE. + * @start_pfn: The lower PFN of the range to be isolated. + * @end_pfn: The upper PFN of the range to be isolated. + * start_pfn/end_pfn must be aligned to pageblock_order. + * @migratetype: Migrate type to set in error recovery. + * @flags: The following flags are allowed (they can be combined in + * a bit mask) + * SKIP_HWPOISON - ignore hwpoison pages + * REPORT_FAILURE - report details about the failure to + * isolate the range * * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in * the range will never be allocated. Any free pages and pages freed in the - * future will not be allocated again. - * - * start_pfn/end_pfn must be aligned to pageblock_order. - * Return 0 on success and -EBUSY if any part of range cannot be isolated. + * future will not be allocated again. If specified range includes migrate types + * other than MOVABLE or CMA, this will fail with -EBUSY. For isolating all + * pages in the range finally, the caller have to free all pages in the range. + * test_page_isolated() can be used for test it. * * There is no high level synchronization mechanism that prevents two threads - * from trying to isolate overlapping ranges. If this happens, one thread + * from trying to isolate overlapping ranges. If this happens, one thread * will notice pageblocks in the overlapping range already set to isolate. * This happens in set_migratetype_isolate, and set_migratetype_isolate - * returns an error. We then clean up by restoring the migration type on - * pageblocks we may have modified and return -EBUSY to caller. This + * returns an error. We then clean up by restoring the migration type on + * pageblocks we may have modified and return -EBUSY to caller. This * prevents two threads from simultaneously working on overlapping ranges. + * + * Return: the number of isolated pageblocks on success and -EBUSY if any part + * of range cannot be isolated. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype, int flags) @@ -188,6 +198,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn; unsigned long undo_pfn; struct page *page; + int nr_isolate_pageblock = 0; BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); @@ -196,13 +207,15 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && - set_migratetype_isolate(page, migratetype, flags)) { - undo_pfn = pfn; - goto undo; + if (page) { + if (set_migratetype_isolate(page, migratetype, flags)) { + undo_pfn = pfn; + goto undo; + } + nr_isolate_pageblock++; } } - return 0; + return nr_isolate_pageblock; undo: for (pfn = start_pfn; pfn < undo_pfn; diff --git a/mm/percpu.c b/mm/percpu.c index c5c750781628..2e6fc8d552c9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1086,6 +1086,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, struct pcpu_chunk *chunk; unsigned long aligned_addr, lcm_align; int start_offset, offset_bits, region_size, region_bits; + size_t alloc_size; /* region calculations */ aligned_addr = tmp_addr & PAGE_MASK; @@ -1101,9 +1102,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_size = ALIGN(start_offset + map_size, lcm_align); /* allocate chunk */ - chunk = memblock_alloc(sizeof(struct pcpu_chunk) + - BITS_TO_LONGS(region_size >> PAGE_SHIFT), - SMP_CACHE_BYTES); + alloc_size = sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(region_size >> PAGE_SHIFT); + chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!chunk) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); INIT_LIST_HEAD(&chunk->list); @@ -1114,12 +1118,25 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); - chunk->alloc_map = memblock_alloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]), - SMP_CACHE_BYTES); - chunk->bound_map = memblock_alloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]), - SMP_CACHE_BYTES); - chunk->md_blocks = memblock_alloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]), - SMP_CACHE_BYTES); + alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]); + chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!chunk->alloc_map) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = + BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]); + chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!chunk->bound_map) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]); + chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!chunk->md_blocks) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ @@ -1888,7 +1905,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); - ptr = memblock_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE); + ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE); if (!ptr) return NULL; ai = ptr; @@ -2044,6 +2061,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, int group, unit, i; int map_size; unsigned long tmp_addr; + size_t alloc_size; #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ @@ -2075,14 +2093,29 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ - group_offsets = memblock_alloc(ai->nr_groups * sizeof(group_offsets[0]), - SMP_CACHE_BYTES); - group_sizes = memblock_alloc(ai->nr_groups * sizeof(group_sizes[0]), - SMP_CACHE_BYTES); - unit_map = memblock_alloc(nr_cpu_ids * sizeof(unit_map[0]), - SMP_CACHE_BYTES); - unit_off = memblock_alloc(nr_cpu_ids * sizeof(unit_off[0]), - SMP_CACHE_BYTES); + alloc_size = ai->nr_groups * sizeof(group_offsets[0]); + group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!group_offsets) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = ai->nr_groups * sizeof(group_sizes[0]); + group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!group_sizes) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = nr_cpu_ids * sizeof(unit_map[0]); + unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!unit_map) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = nr_cpu_ids * sizeof(unit_off[0]); + unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!unit_off) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -2148,6 +2181,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]), SMP_CACHE_BYTES); + if (!pcpu_slot) + panic("%s: Failed to allocate %zu bytes\n", __func__, + pcpu_nr_slots * sizeof(pcpu_slot[0])); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); @@ -2460,7 +2496,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - areas = memblock_alloc_nopanic(areas_size, SMP_CACHE_BYTES); + areas = memblock_alloc(areas_size, SMP_CACHE_BYTES); if (!areas) { rc = -ENOMEM; goto out_free; @@ -2602,6 +2638,9 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); + if (!pages) + panic("%s: Failed to allocate %zu bytes\n", __func__, + pages_size); /* allocate pages */ j = 0; @@ -2690,8 +2729,7 @@ EXPORT_SYMBOL(__per_cpu_offset); static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return memblock_alloc_from_nopanic( - size, align, __pa(MAX_DMA_ADDRESS)); + return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS)); } static void __init pcpu_dfl_fc_free(void *ptr, size_t size) @@ -2739,9 +2777,7 @@ void __init setup_per_cpu_areas(void) void *fc; ai = pcpu_alloc_alloc_info(1, 1); - fc = memblock_alloc_from_nopanic(unit_size, - PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); + fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); /* kmemleak tracks the percpu allocations separately */ diff --git a/mm/slab.c b/mm/slab.c index 28652e4218e0..329bfe67f2ca 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2115,6 +2115,8 @@ done: cachep->allocflags = __GFP_COMP; if (flags & SLAB_CACHE_DMA) cachep->allocflags |= GFP_DMA; + if (flags & SLAB_CACHE_DMA32) + cachep->allocflags |= GFP_DMA32; if (flags & SLAB_RECLAIM_ACCOUNT) cachep->allocflags |= __GFP_RECLAIMABLE; cachep->size = size; diff --git a/mm/slab.h b/mm/slab.h index e5e6658eeacc..43ac818b8592 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -127,7 +127,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, /* Legal flag mask for kmem_cache_create(), for various configurations */ -#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA32 | SLAB_PANIC | \ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) #if defined(CONFIG_DEBUG_SLAB) diff --git a/mm/slab_common.c b/mm/slab_common.c index 03eeb8b7b4b1..58251ba63e4a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -53,7 +53,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ - SLAB_ACCOUNT) + SLAB_CACHE_DMA32 | SLAB_ACCOUNT) /* * Merge control. If this is set then no merging of slab caches will occur. diff --git a/mm/slub.c b/mm/slub.c index 1b08fbcb7e61..d30ede89f4a6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3589,6 +3589,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (s->flags & SLAB_CACHE_DMA) s->allocflags |= GFP_DMA; + if (s->flags & SLAB_CACHE_DMA32) + s->allocflags |= GFP_DMA32; + if (s->flags & SLAB_RECLAIM_ACCOUNT) s->allocflags |= __GFP_RECLAIMABLE; @@ -5679,6 +5682,8 @@ static char *create_unique_id(struct kmem_cache *s) */ if (s->flags & SLAB_CACHE_DMA) *p++ = 'd'; + if (s->flags & SLAB_CACHE_DMA32) + *p++ = 'D'; if (s->flags & SLAB_RECLAIM_ACCOUNT) *p++ = 'a'; if (s->flags & SLAB_CONSISTENCY_CHECKS) diff --git a/mm/sparse.c b/mm/sparse.c index 77a0554fa5bd..56e057c432f9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -65,11 +65,15 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - if (slab_is_available()) + if (slab_is_available()) { section = kzalloc_node(array_size, GFP_KERNEL, nid); - else + } else { section = memblock_alloc_node(array_size, SMP_CACHE_BYTES, nid); + if (!section) + panic("%s: Failed to allocate %lu bytes nid=%d\n", + __func__, array_size, nid); + } return section; } @@ -218,6 +222,9 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) size = sizeof(struct mem_section*) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); mem_section = memblock_alloc(size, align); + if (!mem_section) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, size, align); } #endif @@ -323,9 +330,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, limit = goal + (1UL << PA_SECTION_SHIFT); nid = early_pfn_to_nid(goal >> PAGE_SHIFT); again: - p = memblock_alloc_try_nid_nopanic(size, - SMP_CACHE_BYTES, goal, limit, - nid); + p = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid); if (!p && limit) { limit = 0; goto again; @@ -379,7 +384,7 @@ static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { - return memblock_alloc_node_nopanic(size, pgdat->node_id); + return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -404,13 +409,18 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, { unsigned long size = section_map_size(); struct page *map = sparse_buffer_alloc(size); + phys_addr_t addr = __pa(MAX_DMA_ADDRESS); if (map) return map; map = memblock_alloc_try_nid(size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + PAGE_SIZE, addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + if (!map) + panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", + __func__, size, PAGE_SIZE, nid, &addr); + return map; } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ @@ -420,10 +430,11 @@ static void *sparsemap_buf_end __meminitdata; static void __init sparse_buffer_init(unsigned long size, int nid) { + phys_addr_t addr = __pa(MAX_DMA_ADDRESS); WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */ sparsemap_buf = memblock_alloc_try_nid_raw(size, PAGE_SIZE, - __pa(MAX_DMA_ADDRESS), + addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); sparsemap_buf_end = sparsemap_buf + size; } @@ -556,7 +567,7 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) } #ifdef CONFIG_MEMORY_HOTREMOVE -/* Mark all memory sections within the pfn range as online */ +/* Mark all memory sections within the pfn range as offline */ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; |