Skip to content

Commit 5a6fe12

Browse files
gormanm, torvalds
authored and committed Feb 10, 2009
Do not account for the address space used by hugetlbfs using VM_ACCOUNT
When overcommit is disabled, the core VM accounts for pages used by anonymous shared, private mappings and special mappings. It keeps track of VMAs that should be accounted for with VM_ACCOUNT and VMAs that never had a reserve with VM_NORESERVE. Overcommit for hugetlbfs is much riskier than overcommit for base pages due to contiguity requirements. It avoids overcommitting on both shared and private mappings using reservation counters that are checked and updated during mmap(). This ensures (within limits) that hugepages exist in the future when faults occur; otherwise it is too easy for applications to be SIGKILLed. As hugetlbfs makes its own reservations in a different unit from the base page size, VM_ACCOUNT should never be set. Even if the units were correct, we would double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may be set because an application can request that no reserves be made for hugetlbfs at the risk of getting killed later. With commit fc8744a, VM_NORESERVE and VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This breaks the accounting for both the core VM and hugetlbfs, and can trigger an OOM storm when hugepage pools are too small, or lockups and corrupted counters otherwise. This patch brings hugetlbfs more in line with how the core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set. Signed-off-by: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 4c098bc commit 5a6fe12

File tree

8 files changed

+65
-43
lines changed

8 files changed

+65
-43
lines changed
 

‎fs/hugetlbfs/inode.c

+5-3
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
108108

109109
if (hugetlb_reserve_pages(inode,
110110
vma->vm_pgoff >> huge_page_order(h),
111-
len >> huge_page_shift(h), vma))
111+
len >> huge_page_shift(h), vma,
112+
vma->vm_flags))
112113
goto out;
113114

114115
ret = 0;
@@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void)
947948
can_do_mlock());
948949
}
949950

950-
struct file *hugetlb_file_setup(const char *name, size_t size)
951+
struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
951952
{
952953
int error = -ENOMEM;
953954
struct file *file;
@@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
981982

982983
error = -ENOMEM;
983984
if (hugetlb_reserve_pages(inode, 0,
984-
size >> huge_page_shift(hstate_inode(inode)), NULL))
985+
size >> huge_page_shift(hstate_inode(inode)), NULL,
986+
acctflag))
985987
goto out_inode;
986988

987989
d_instantiate(dentry, inode);

‎include/linux/hugetlb.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ unsigned long hugetlb_total_pages(void);
3333
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3434
unsigned long address, int write_access);
3535
int hugetlb_reserve_pages(struct inode *inode, long from, long to,
36-
struct vm_area_struct *vma);
36+
struct vm_area_struct *vma,
37+
int acctflags);
3738
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
3839

3940
extern unsigned long hugepages_treat_as_movable;
@@ -138,7 +139,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
138139

139140
extern const struct file_operations hugetlbfs_file_operations;
140141
extern struct vm_operations_struct hugetlb_vm_ops;
141-
struct file *hugetlb_file_setup(const char *name, size_t);
142+
struct file *hugetlb_file_setup(const char *name, size_t, int);
142143
int hugetlb_get_quota(struct address_space *mapping, long delta);
143144
void hugetlb_put_quota(struct address_space *mapping, long delta);
144145

‎include/linux/mm.h

+1-2
Original file line numberDiff line numberDiff line change
@@ -1129,8 +1129,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
11291129
unsigned long flag, unsigned long pgoff);
11301130
extern unsigned long mmap_region(struct file *file, unsigned long addr,
11311131
unsigned long len, unsigned long flags,
1132-
unsigned int vm_flags, unsigned long pgoff,
1133-
int accountable);
1132+
unsigned int vm_flags, unsigned long pgoff);
11341133

11351134
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
11361135
unsigned long len, unsigned long prot,

‎ipc/shm.c

+5-3
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
340340
struct file * file;
341341
char name[13];
342342
int id;
343+
int acctflag = 0;
343344

344345
if (size < SHMMIN || size > ns->shm_ctlmax)
345346
return -EINVAL;
@@ -364,11 +365,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
364365

365366
sprintf (name, "SYSV%08x", key);
366367
if (shmflg & SHM_HUGETLB) {
367-
/* hugetlb_file_setup takes care of mlock user accounting */
368-
file = hugetlb_file_setup(name, size);
368+
/* hugetlb_file_setup applies strict accounting */
369+
if (shmflg & SHM_NORESERVE)
370+
acctflag = VM_NORESERVE;
371+
file = hugetlb_file_setup(name, size, acctflag);
369372
shp->mlock_user = current_user();
370373
} else {
371-
int acctflag = 0;
372374
/*
373375
* Do not allow no accounting for OVERCOMMIT_NEVER, even
374376
* if it's asked for.

‎mm/fremap.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
198198
flags &= MAP_NONBLOCK;
199199
get_file(file);
200200
addr = mmap_region(file, start, size,
201-
flags, vma->vm_flags, pgoff, 1);
201+
flags, vma->vm_flags, pgoff);
202202
fput(file);
203203
if (IS_ERR_VALUE(addr)) {
204204
err = addr;

‎mm/hugetlb.c

+25-14
Original file line numberDiff line numberDiff line change
@@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
22692269

22702270
int hugetlb_reserve_pages(struct inode *inode,
22712271
long from, long to,
2272-
struct vm_area_struct *vma)
2272+
struct vm_area_struct *vma,
2273+
int acctflag)
22732274
{
2274-
long ret, chg;
2275+
long ret = 0, chg;
22752276
struct hstate *h = hstate_inode(inode);
22762277

2277-
if (vma && vma->vm_flags & VM_NORESERVE)
2278-
return 0;
2279-
22802278
/*
22812279
* Shared mappings base their reservation on the number of pages that
22822280
* are already allocated on behalf of the file. Private mappings need
@@ -2285,29 +2283,42 @@ int hugetlb_reserve_pages(struct inode *inode,
22852283
*/
22862284
if (!vma || vma->vm_flags & VM_SHARED)
22872285
chg = region_chg(&inode->i_mapping->private_list, from, to);
2288-
else {
2289-
struct resv_map *resv_map = resv_map_alloc();
2290-
if (!resv_map)
2291-
return -ENOMEM;
2292-
2286+
else
22932287
chg = to - from;
22942288

2295-
set_vma_resv_map(vma, resv_map);
2296-
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2297-
}
2298-
22992289
if (chg < 0)
23002290
return chg;
23012291

23022292
if (hugetlb_get_quota(inode->i_mapping, chg))
23032293
return -ENOSPC;
2294+
2295+
/*
2296+
* Only apply hugepage reservation if asked. We still have to
2297+
* take the filesystem quota because it is an upper limit
2298+
* defined for the mount and not necessarily memory as a whole
2299+
*/
2300+
if (acctflag & VM_NORESERVE) {
2301+
reset_vma_resv_huge_pages(vma);
2302+
return 0;
2303+
}
2304+
23042305
ret = hugetlb_acct_memory(h, chg);
23052306
if (ret < 0) {
23062307
hugetlb_put_quota(inode->i_mapping, chg);
23072308
return ret;
23082309
}
23092310
if (!vma || vma->vm_flags & VM_SHARED)
23102311
region_add(&inode->i_mapping->private_list, from, to);
2312+
else {
2313+
struct resv_map *resv_map = resv_map_alloc();
2314+
2315+
if (!resv_map)
2316+
return -ENOMEM;
2317+
2318+
set_vma_resv_map(vma, resv_map);
2319+
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2320+
}
2321+
23112322
return 0;
23122323
}
23132324

‎mm/mmap.c

+22-16
Original file line numberDiff line numberDiff line change
@@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
918918
struct inode *inode;
919919
unsigned int vm_flags;
920920
int error;
921-
int accountable = 1;
922921
unsigned long reqprot = prot;
923922

924923
/*
@@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
10191018
return -EPERM;
10201019
vm_flags &= ~VM_MAYEXEC;
10211020
}
1022-
if (is_file_hugepages(file))
1023-
accountable = 0;
10241021

10251022
if (!file->f_op || !file->f_op->mmap)
10261023
return -ENODEV;
@@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
10531050
if (error)
10541051
return error;
10551052

1056-
return mmap_region(file, addr, len, flags, vm_flags, pgoff,
1057-
accountable);
1053+
return mmap_region(file, addr, len, flags, vm_flags, pgoff);
10581054
}
10591055
EXPORT_SYMBOL(do_mmap_pgoff);
10601056

@@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
10921088

10931089
/*
10941090
* We account for memory if it's a private writeable mapping,
1095-
* and VM_NORESERVE wasn't set.
1091+
* not hugepages and VM_NORESERVE wasn't set.
10961092
*/
1097-
static inline int accountable_mapping(unsigned int vm_flags)
1093+
static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
10981094
{
1095+
/*
1096+
* hugetlb has its own accounting separate from the core VM
1097+
* VM_HUGETLB may not be set yet so we cannot check for that flag.
1098+
*/
1099+
if (file && is_file_hugepages(file))
1100+
return 0;
1101+
10991102
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
11001103
}
11011104

11021105
unsigned long mmap_region(struct file *file, unsigned long addr,
11031106
unsigned long len, unsigned long flags,
1104-
unsigned int vm_flags, unsigned long pgoff,
1105-
int accountable)
1107+
unsigned int vm_flags, unsigned long pgoff)
11061108
{
11071109
struct mm_struct *mm = current->mm;
11081110
struct vm_area_struct *vma, *prev;
@@ -1128,18 +1130,22 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
11281130

11291131
/*
11301132
* Set 'VM_NORESERVE' if we should not account for the
1131-
* memory use of this mapping. We only honor MAP_NORESERVE
1132-
* if we're allowed to overcommit memory.
1133+
* memory use of this mapping.
11331134
*/
1134-
if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1135-
vm_flags |= VM_NORESERVE;
1136-
if (!accountable)
1137-
vm_flags |= VM_NORESERVE;
1135+
if ((flags & MAP_NORESERVE)) {
1136+
/* We honor MAP_NORESERVE if allowed to overcommit */
1137+
if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1138+
vm_flags |= VM_NORESERVE;
1139+
1140+
/* hugetlb applies strict overcommit unless MAP_NORESERVE */
1141+
if (file && is_file_hugepages(file))
1142+
vm_flags |= VM_NORESERVE;
1143+
}
11381144

11391145
/*
11401146
* Private writable mapping: check memory availability
11411147
*/
1142-
if (accountable_mapping(vm_flags)) {
1148+
if (accountable_mapping(file, vm_flags)) {
11431149
charged = len >> PAGE_SHIFT;
11441150
if (security_vm_enough_memory(charged))
11451151
return -ENOMEM;

‎mm/mprotect.c

+3-2
Original file line numberDiff line numberDiff line change
@@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
151151
/*
152152
* If we make a private mapping writable we increase our commit;
153153
* but (without finer accounting) cannot reduce our commit if we
154-
* make it unwritable again.
154+
make it unwritable again. hugetlb mappings were accounted for
155+
* even if read-only so there is no need to account for them here
155156
*/
156157
if (newflags & VM_WRITE) {
157-
if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
158+
if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
158159
VM_SHARED|VM_NORESERVE))) {
159160
charged = nrpages;
160161
if (security_vm_enough_memory(charged))

0 commit comments

Comments
 (0)
Please sign in to comment.