diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 8f1b15213f61..4d5bb67dc4ec 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -433,9 +433,8 @@ change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags,
 		     int advice);
-int madvise_collapse(struct vm_area_struct *vma,
-		     struct vm_area_struct **prev,
-		     unsigned long start, unsigned long end);
+int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end, bool *lock_dropped);
 void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
 			   unsigned long end, struct vm_area_struct *next);
 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
@@ -596,8 +595,8 @@ static inline int hugepage_madvise(struct vm_area_struct *vma,
 }
 
 static inline int madvise_collapse(struct vm_area_struct *vma,
-				   struct vm_area_struct **prev,
-				   unsigned long start, unsigned long end)
+				   unsigned long start,
+				   unsigned long end, bool *lock_dropped)
 {
 	return -EINVAL;
 }
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 3495a20cef5e..1aa7ca67c756 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2727,8 +2727,8 @@ static int madvise_collapse_errno(enum scan_result r)
 	}
 }
 
-int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
-		     unsigned long start, unsigned long end)
+int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end, bool *lock_dropped)
 {
 	struct collapse_control *cc;
 	struct mm_struct *mm = vma->vm_mm;
@@ -2739,8 +2739,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	BUG_ON(vma->vm_start > start);
 	BUG_ON(vma->vm_end < end);
 
-	*prev = vma;
-
 	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
 		return -EINVAL;
 
@@ -2788,7 +2786,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 							 &mmap_locked, cc);
 		}
 		if (!mmap_locked)
-			*prev = NULL; /* Tell caller we dropped mmap_lock */
+			*lock_dropped = true;
 
 handle_result:
 		switch (result) {
@@ -2798,7 +2796,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 			break;
 		case SCAN_PTE_MAPPED_HUGEPAGE:
 			BUG_ON(mmap_locked);
-			BUG_ON(*prev);
 			mmap_read_lock(mm);
 			result = collapse_pte_mapped_thp(mm, addr, true);
 			mmap_read_unlock(mm);
diff --git a/mm/madvise.c b/mm/madvise.c
index f04b8165e2ab..c467ee42596f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -75,7 +75,9 @@ struct madvise_behavior {
 	 */
	struct madvise_behavior_range range;
 	/* The VMA and VMA preceding it (if applicable) currently targeted. */
-	struct vm_area_struct *prev, *vma;
+	struct vm_area_struct *prev;
+	struct vm_area_struct *vma;
+	bool lock_dropped;
 };
 
 #ifdef CONFIG_ANON_VMA_NAME
@@ -188,10 +190,8 @@ static int madvise_update_vma(vm_flags_t new_flags,
 	struct anon_vma_name *anon_name = madv_behavior->anon_name;
 	VMA_ITERATOR(vmi, madv_behavior->mm, range->start);
 
-	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
-		madv_behavior->prev = vma;
+	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name))
 		return 0;
-	}
 
 	vma = vma_modify_flags_name(&vmi, madv_behavior->prev, vma, range->start,
 			range->end, new_flags, anon_name);
@@ -199,7 +199,6 @@ static int madvise_update_vma(vm_flags_t new_flags,
 		return PTR_ERR(vma);
 
 	madv_behavior->vma = vma;
-	madv_behavior->prev = vma;
 
 	/* vm_flags is protected by the mmap_lock held in write mode. */
 	vma_start_write(vma);
@@ -301,6 +300,12 @@ static void shmem_swapin_range(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_SWAP */
 
+static void mark_mmap_lock_dropped(struct madvise_behavior *madv_behavior)
+{
+	VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK);
+	madv_behavior->lock_dropped = true;
+}
+
 /*
  * Schedule all required I/O operations. Do not wait for completion.
  */
@@ -313,7 +318,6 @@ static long madvise_willneed(struct madvise_behavior *madv_behavior)
 	unsigned long end = madv_behavior->range.end;
 	loff_t offset;
 
-	madv_behavior->prev = vma;
 #ifdef CONFIG_SWAP
 	if (!file) {
 		walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma);
@@ -342,7 +346,7 @@ static long madvise_willneed(struct madvise_behavior *madv_behavior)
 	 * vma's reference to the file) can go away as soon as we drop
 	 * mmap_lock.
 	 */
-	madv_behavior->prev = NULL; /* tell sys_madvise we drop mmap_lock */
+	mark_mmap_lock_dropped(madv_behavior);
 	get_file(file);
 	offset = (loff_t)(start - vma->vm_start)
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
@@ -633,7 +637,6 @@ static long madvise_cold(struct madvise_behavior *madv_behavior)
 	struct vm_area_struct *vma = madv_behavior->vma;
 	struct mmu_gather tlb;
 
-	madv_behavior->prev = vma;
 	if (!can_madv_lru_vma(vma))
 		return -EINVAL;
 
@@ -665,7 +668,6 @@ static long madvise_pageout(struct madvise_behavior *madv_behavior)
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma = madv_behavior->vma;
 
-	madv_behavior->prev = vma;
 	if (!can_madv_lru_vma(vma))
 		return -EINVAL;
 
@@ -954,7 +956,6 @@ static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
 	struct madvise_behavior_range *range = &madv_behavior->range;
 	int behavior = madv_behavior->behavior;
 
-	madv_behavior->prev = madv_behavior->vma;
 	if (!madvise_dontneed_free_valid_vma(madv_behavior))
 		return -EINVAL;
 
@@ -964,8 +965,7 @@ static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
 	if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) {
 		struct vm_area_struct *vma;
 
-		madv_behavior->prev = NULL; /* mmap_lock has been dropped, prev is stale */
-
+		mark_mmap_lock_dropped(madv_behavior);
 		mmap_read_lock(mm);
 		madv_behavior->vma = vma = vma_lookup(mm, range->start);
 		if (!vma)
@@ -1064,7 +1064,7 @@ static long madvise_remove(struct madvise_behavior *madv_behavior)
 	unsigned long start = madv_behavior->range.start;
 	unsigned long end = madv_behavior->range.end;
 
-	madv_behavior->prev = NULL; /* tell sys_madvise we drop mmap_lock */
+	mark_mmap_lock_dropped(madv_behavior);
 
 	if (vma->vm_flags & VM_LOCKED)
 		return -EINVAL;
@@ -1183,7 +1183,6 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
 	long err;
 	int i;
 
-	madv_behavior->prev = vma;
 	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
 		return -EINVAL;
 
@@ -1293,7 +1292,6 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
 	struct vm_area_struct *vma = madv_behavior->vma;
 	struct madvise_behavior_range *range = &madv_behavior->range;
 
-	madv_behavior->prev = vma;
 	/*
 	 * We're ok with removing guards in mlock()'d ranges, as this is a
 	 * non-destructive action.
@@ -1336,8 +1334,8 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
 	case MADV_DONTNEED_LOCKED:
 		return madvise_dontneed_free(madv_behavior);
 	case MADV_COLLAPSE:
-		return madvise_collapse(vma, &madv_behavior->prev,
-					range->start, range->end);
+		return madvise_collapse(vma, range->start, range->end,
+					&madv_behavior->lock_dropped);
 	case MADV_GUARD_INSTALL:
 		return madvise_guard_install(madv_behavior);
 	case MADV_GUARD_REMOVE:
@@ -1589,7 +1587,6 @@ static bool try_vma_read_lock(struct madvise_behavior *madv_behavior)
 		vma_end_read(vma);
 		goto take_mmap_read_lock;
 	}
-	madv_behavior->prev = vma; /* Not currently required. */
 	madv_behavior->vma = vma;
 	return true;
 
@@ -1617,7 +1614,7 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
 	unsigned long last_end = range->end;
 	int unmapped_error = 0;
 	int error;
-	struct vm_area_struct *vma;
+	struct vm_area_struct *prev, *vma;
 
 	/*
 	 * If VMA read lock is supported, apply madvise to a single VMA
@@ -1630,24 +1627,23 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
 			return error;
 	}
 
-	/*
-	 * If the interval [start,end) covers some unmapped address
-	 * ranges, just ignore them, but return -ENOMEM at the end.
-	 * - different from the way of handling in mlock etc.
-	 */
-	vma = find_vma_prev(mm, range->start, &madv_behavior->prev);
+	vma = find_vma_prev(mm, range->start, &prev);
 	if (vma && range->start > vma->vm_start)
-		madv_behavior->prev = vma;
+		prev = vma;
 
 	for (;;) {
-		struct vm_area_struct *prev;
-
 		/* Still start < end. */
 		if (!vma)
 			return -ENOMEM;
 
 		/* Here start < (last_end|vma->vm_end). */
 		if (range->start < vma->vm_start) {
+			/*
+			 * This indicates a gap between VMAs in the input
+			 * range. This does not cause the operation to abort,
+			 * rather we simply return -ENOMEM to indicate that this
+			 * has happened, but carry on.
+			 */
 			unmapped_error = -ENOMEM;
 			range->start = vma->vm_start;
 			if (range->start >= last_end)
@@ -1658,21 +1654,28 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
 		range->end = min(vma->vm_end, last_end);
 
 		/* Here vma->vm_start <= range->start < range->end <= (last_end|vma->vm_end). */
+		madv_behavior->prev = prev;
 		madv_behavior->vma = vma;
 		error = madvise_vma_behavior(madv_behavior);
 		if (error)
 			return error;
-		prev = madv_behavior->prev;
+		if (madv_behavior->lock_dropped) {
+			/* We dropped the mmap lock, we can't ref the VMA. */
+			prev = NULL;
+			vma = NULL;
+			madv_behavior->lock_dropped = false;
+		} else {
+			vma = madv_behavior->vma;
+			prev = vma;
+		}
 
-		range->start = range->end;
-		if (prev && range->start < prev->vm_end)
-			range->start = prev->vm_end;
-		if (range->start >= last_end)
+		if (vma && range->end < vma->vm_end)
+			range->end = vma->vm_end;
+		if (range->end >= last_end)
 			break;
-		if (prev)
-			vma = find_vma(mm, prev->vm_end);
-		else /* madvise_remove dropped mmap_lock */
-			vma = find_vma(mm, range->start);
+
+		vma = find_vma(mm, vma ? vma->vm_end : range->end);
+		range->start = range->end;
 	}
 
 	return unmapped_error;