Discussion:
[PATCH 2/2] x86/modules: Make x86 module allocations flush before freeing
Rick Edgecombe
2018-11-28 00:07:54 UTC
Permalink
Change the module allocations to flush before freeing the pages.

Signed-off-by: Rick Edgecombe <***@intel.com>
---
arch/x86/kernel/module.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index b052e883dd8c..1694daf256b3 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -87,8 +87,8 @@ void *module_alloc(unsigned long size)
p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
MODULES_END, GFP_KERNEL,
- PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
+ PAGE_KERNEL_EXEC, VM_IMMEDIATE_UNMAP,
+ NUMA_NO_NODE, __builtin_return_address(0));
if (p && (kasan_module_alloc(p, size) < 0)) {
vfree(p);
return NULL;
--
2.17.1
Rick Edgecombe
2018-11-28 00:07:53 UTC
Permalink
Since vfree will lazily flush the TLB, but not lazily free the underlying pages,
it often leaves stale TLB entries pointing to freed pages that could get re-used.
This is undesirable for cases where the memory being freed has special
permissions, such as executable.

Having callers flush the TLB after calling vfree still leaves a window where
the pages are freed but the TLB entries remain. Also, the entire operation can
be deferred if vfree is called from an interrupt, in which case a TLB flush
after calling vfree would miss the operation entirely. So, to support this use
case, a new flag VM_IMMEDIATE_UNMAP is added that causes the free operation
to take place like this:
1. Unmap
2. Flush TLB/Unmap aliases
3. Free pages
In the deferred case these steps are all done by the work queue.

This implementation derives from two sketches from Dave Hansen and
Andy Lutomirski.

Suggested-by: Dave Hansen <***@intel.com>
Suggested-by: Andy Lutomirski <***@kernel.org>
Suggested-by: Will Deacon <***@arm.com>
Signed-off-by: Rick Edgecombe <***@intel.com>
---
include/linux/vmalloc.h | 1 +
mm/vmalloc.c | 13 +++++++++++--
2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 398e9c95cd61..cca6b6b83cf0 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -21,6 +21,7 @@ struct notifier_block; /* in notifier.h */
#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
#define VM_NO_GUARD 0x00000040 /* don't add guard page */
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
+#define VM_IMMEDIATE_UNMAP 0x00000200 /* flush before releasing pages */
/* bits [20..32] reserved for arch specific ioremap internals */

/*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 97d4b25d0373..68766651b5a7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1516,6 +1516,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));

remove_vm_area(addr);
+
+ /*
+ * Need to flush the TLB before freeing pages in the case of this flag.
+ * As long as that's happening, unmap aliases.
+ */
+ if (area->flags & VM_IMMEDIATE_UNMAP)
+ vm_unmap_aliases();
+
if (deallocate_pages) {
int i;

@@ -1925,8 +1933,9 @@ EXPORT_SYMBOL(vzalloc_node);

void *vmalloc_exec(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
- NUMA_NO_NODE, __builtin_return_address(0));
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL, PAGE_KERNEL_EXEC, VM_IMMEDIATE_UNMAP,
+ NUMA_NO_NODE, __builtin_return_address(0));
}

#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
--
2.17.1
Edgecombe, Rick P
2018-12-04 00:04:21 UTC
Permalink
It looks like this new flag is in linux-next now. As I read it, these
architectures have a module_alloc() that uses some sort of executable page
protection and do not use the default module_alloc(), which is already covered,
so they may need the flag plugged in:
arm
arm64
parisc
s390
unicore32

Thanks,

Rick
Post by Rick Edgecombe
Since vfree will lazily flush the TLB, but not lazily free the underlying pages,
it often leaves stale TLB entries to freed pages that could get re-used. This is
undesirable for cases where the memory being freed has special permissions such
as executable.
Having callers flush the TLB after calling vfree still leaves a window where
the pages are freed, but the TLB entry remains. Also the entire operation can be
deferred if the vfree is called from an interrupt and so a TLB flush after
calling vfree would miss the entire operation. So in order to support this use
case, a new flag VM_IMMEDIATE_UNMAP is added, that will cause the free operation
to take place like this:
1. Unmap
2. Flush TLB/Unmap aliases
3. Free pages
In the deferred case these steps are all done by the work queue.
This implementation derives from two sketches from Dave Hansen and
Andy Lutomirski.
Nadav Amit
2018-12-04 01:43:11 UTC
Permalink
Post by Rick Edgecombe
Since vfree will lazily flush the TLB, but not lazily free the underlying pages,
it often leaves stale TLB entries to freed pages that could get re-used. This is
undesirable for cases where the memory being freed has special permissions such
as executable.
So I am trying to finish my patch-set for preventing transient W+X mappings
from taking place, by handling the kprobes & ftrace cases that I missed (thanks
again for pointing them out).

But all of a sudden, I don’t understand why we have the problem that this
(your) patch-set deals with at all. We already change the mappings to make
the memory writable before freeing it, so why can’t we make it
non-executable at the same time? Actually, why do we make the module memory,
including its data, executable before freeing it???

In other words: disable_ro_nx() is called by free_module() before freeing
the memory. Wouldn’t inverting the logic make much more sense? I am
confused.

-- >8 --

From: Nadav Amit <***@vmware.com>
Subject: [PATCH] modules: disable_ro_nx() should enable nx

---
kernel/module.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index 7cb207249437..e12d760ea3b0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2029,14 +2029,13 @@ void set_all_modules_text_ro(void)

static void disable_ro_nx(const struct module_layout *layout)
{
+ frob_text(layout, set_memory_nx);
+
if (rodata_enabled) {
frob_text(layout, set_memory_rw);
frob_rodata(layout, set_memory_rw);
frob_ro_after_init(layout, set_memory_rw);
}
- frob_rodata(layout, set_memory_x);
- frob_ro_after_init(layout, set_memory_x);
- frob_writable_data(layout, set_memory_x);
}

#else
--
2.17.1
Will Deacon
2018-12-04 16:03:04 UTC
Permalink
Post by Nadav Amit
So I am trying to finish my patch-set for preventing transient W+X mappings
from taking space, by handling kprobes & ftrace that I missed (thanks again for
pointing it out).
But all of the sudden, I don’t understand why we have the problem that this
(your) patch-set deals with at all. We already change the mappings to make
the memory writable before freeing the memory, so why can’t we make it
non-executable at the same time? Actually, why do we make the module memory,
including its data executable before freeing it???
Yeah, this is really confusing, but I have a suspicion it's a combination
of the various different configurations and hysterical raisins. We can't
rely on module_alloc() allocating from the vmalloc area (see nios2) nor
can we rely on disable_ro_nx() being available at build time.

If we *could* rely on module allocations always using vmalloc(), then
we could pass in Rick's new flag and drop disable_ro_nx() altogether
afaict -- who cares about the memory attributes of a mapping that's about
to disappear anyway?

Is it just nios2 that does something different?

Will
Edgecombe, Rick P
2018-12-04 20:02:03 UTC
Permalink
Post by Will Deacon
Yeah, this is really confusing, but I have a suspicion it's a combination
of the various different configurations and hysterical raisins. We can't
rely on module_alloc() allocating from the vmalloc area (see nios2) nor
can we rely on disable_ro_nx() being available at build time.
If we *could* rely on module allocations always using vmalloc(), then
we could pass in Rick's new flag and drop disable_ro_nx() altogether
afaict -- who cares about the memory attributes of a mapping that's about
to disappear anyway?
Is it just nios2 that does something different?
Will
Yeah, it is really intertwined. I think for x86, set_memory_nx everywhere would
solve it as well; in fact, that was what I first thought the solution should be,
until this was suggested. It's interesting that in the other thread Masami
Hiramatsu referenced, set_memory_nx was suggested last year and would have
inadvertently blocked this on x86. But on the other architectures, I have since
learned, it is a bit different.

It looks like most architectures don't actually re-define set_memory_*, and so
all of the frob_* functions are just no-ops. In that case, allocating RWX is
needed to make it work at all, because that is what the allocation is going to
stay at. So on those architectures, set_memory_nx won't solve it because it
will do nothing.

On x86 I think you cannot get rid of disable_ro_nx fully, because the
permissions on the direct map are changed as well. You don't want some other
caller getting a page that was left RO when freed and then trying to write to
it, if I understand this correctly.

The other reasoning was that calling set_memory_nx isn't doing what we are
actually trying to do, which is prevent the pages from getting released too
early.

A clearer solution for all of this might involve refactoring some of the
set_memory_ de-allocation logic out into __weak functions in either modules or
vmalloc. As Jessica points out in the other thread, though, modules does a lot
more there than the other module_alloc callers. I think it may take some
thought to centralize this AND make it optimal for every
module_alloc/vmalloc_exec user and architecture.

But for now, with the change in vmalloc, we can block the re-use of freed
pages that had executable mappings.
Andy Lutomirski
2018-12-04 20:09:49 UTC
Permalink
On Tue, Dec 4, 2018 at 12:02 PM Edgecombe, Rick P
Post by Edgecombe, Rick P
Yea it is really intertwined. I think for x86, set_memory_nx everywhere would
solve it as well, in fact that was what I first thought the solution should be
until this was suggested. It's interesting that from the other thread Masami
Hiramatsu referenced, set_memory_nx was suggested last year and would have
inadvertently blocked this on x86. But, on the other architectures I have since
learned it is a bit different.
It looks like actually most arch's don't re-define set_memory_*, and so all of
the frob_* functions are actually just noops. In which case allocating RWX is
needed to make it work at all, because that is what the allocation is going to
stay at. So in these archs, set_memory_nx won't solve it because it will do
nothing.
On x86 I think you cannot get rid of disable_ro_nx fully because there is the
changing of the permissions on the directmap as well. You don't want some other
caller getting a page that was left RO when freed and then trying to write to
it, if I understand this.
Exactly.

After slightly more thought, I suggest renaming VM_IMMEDIATE_UNMAP to
VM_MAY_ADJUST_PERMS or similar. It would have the semantics you want,
but it would also call some arch hooks to put back the direct map
permissions before the flush. Does that seem reasonable? It would
need to be hooked up on architectures that implement set_memory_ro(), but
that should be quite easy. If nothing else, it could fall back to
set_memory_ro() in the absence of a better implementation.
Edgecombe, Rick P
2018-12-04 23:52:58 UTC
Permalink
Post by Andy Lutomirski
Exactly.
After slightly more thought, I suggest renaming VM_IMMEDIATE_UNMAP to
VM_MAY_ADJUST_PERMS or similar. It would have the semantics you want,
but it would also call some arch hooks to put back the direct map
permissions before the flush. Does that seem reasonable? It would
need to be hooked up that implement set_memory_ro(), but that should
be quite easy. If nothing else, it could fall back to set_memory_ro()
in the absence of a better implementation.
With arch hooks, I guess we could remove disable_ro_nx then. I think you would
still have to flush twice on x86 to really have no W^X-violating window from
the direct map (I think x86 is the only one that sets permissions there?). But
this could be down from sometimes three flushes. You could also directly vfree
non-exec RO memory without set_memory_ calls, like in BPF.

The vfree deferred list would need to be moved, since it couldn't reuse the
allocations now that the vfreed memory might be RO. It could kmalloc, or look
up the vm_struct, so it would probably be a little slower in the interrupt
case. Is this ok?

Thanks,
Andy Lutomirski
2018-12-05 01:57:26 UTC
Permalink
Post by Edgecombe, Rick P
Post by Andy Lutomirski
On Tue, Dec 4, 2018 at 12:02 PM Edgecombe, Rick P
Post by Edgecombe, Rick P
Post by Will Deacon
Post by Nadav Amit
On Nov 27, 2018, at 4:07 PM, Rick Edgecombe <
Since vfree will lazily flush the TLB, but not lazily free the underlying
pages,
it often leaves stale TLB entries to freed pages that could get re-
used.
This is
undesirable for cases where the memory being freed has special
permissions
such
as executable.
So I am trying to finish my patch-set for preventing transient W+X mappings
from taking space, by handling kprobes & ftrace that I missed (thanks
again
for
pointing it out).
But all of the sudden, I don’t understand why we have the problem that this
(your) patch-set deals with at all. We already change the mappings to make
the memory writable before freeing the memory, so why can’t we make it
non-executable at the same time? Actually, why do we make the module memory,
including its data executable before freeing it???
Yeah, this is really confusing, but I have a suspicion it's a combination
of the various different configurations and hysterical raisins. We can't
rely on module_alloc() allocating from the vmalloc area (see nios2) nor
can we rely on disable_ro_nx() being available at build time.
If we *could* rely on module allocations always using vmalloc(), then
we could pass in Rick's new flag and drop disable_ro_nx() altogether
afaict -- who cares about the memory attributes of a mapping that's about
to disappear anyway?
Is it just nios2 that does something different?
Will
Yea it is really intertwined. I think for x86, set_memory_nx everywhere would
solve it as well, in fact that was what I first thought the solution should be
until this was suggested. It's interesting that from the other thread Masami
Hiramatsu referenced, set_memory_nx was suggested last year and would have
inadvertently blocked this on x86. But, on the other architectures I have since
learned it is a bit different.
It looks like actually most arch's don't re-define set_memory_*, and so all of
the frob_* functions are actually just noops. In which case allocating RWX is
needed to make it work at all, because that is what the allocation is going to
stay at. So in these archs, set_memory_nx won't solve it because it will do
nothing.
On x86 I think you cannot get rid of disable_ro_nx fully because there is the
changing of the permissions on the directmap as well. You don't want some other
caller getting a page that was left RO when freed and then trying to write to
it, if I understand this.
Exactly.
After slightly more thought, I suggest renaming VM_IMMEDIATE_UNMAP to
VM_MAY_ADJUST_PERMS or similar. It would have the semantics you want,
but it would also call some arch hooks to put back the direct map
permissions before the flush. Does that seem reasonable? It would
need to be hooked up that implement set_memory_ro(), but that should
be quite easy. If nothing else, it could fall back to set_memory_ro()
in the absence of a better implementation.
With arch hooks, I guess we could remove disable_ro_nx then. I think you would
still have to flush twice on x86 to really have no W^X violating window from the
direct map (I think x86 is the only one that sets permissions there?). But this
could be down from sometimes 3. You could also directly vfree non exec RO memory
without set_memory_, like in BPF.
Just one flush if you’re careful. Set the memory not-present in the direct map and zap it from the vmap area, then flush, then set it RW in the direct map.
Post by Edgecombe, Rick P
The vfree deferred list would need to be moved since it then couldn't reuse the
allocations since now the vfreed memory might be RO. It could kmalloc, or lookup
the vm_struct. So would probably be a little slower in the interrupt case. Is
this ok?
I’m fine with that. For eBPF, we should really have a lookaside list for small allocations.
Will Deacon
2018-12-05 11:41:49 UTC
Permalink
Post by Andy Lutomirski
Post by Edgecombe, Rick P
On x86 I think you cannot get rid of disable_ro_nx fully because there is the
changing of the permissions on the directmap as well. You don't want some other
caller getting a page that was left RO when freed and then trying to write to
it, if I understand this.
Exactly.
Of course, I forgot about the linear mapping. On arm64, we've just queued
support for reflecting changes to read-only permissions in the linear map
[1]. So, whilst the linear map is always non-executable, we will need to
make parts of it writable again when freeing the module.
Post by Andy Lutomirski
After slightly more thought, I suggest renaming VM_IMMEDIATE_UNMAP to
VM_MAY_ADJUST_PERMS or similar. It would have the semantics you want,
but it would also call some arch hooks to put back the direct map
permissions before the flush. Does that seem reasonable? It would
need to be hooked up that implement set_memory_ro(), but that should
be quite easy. If nothing else, it could fall back to set_memory_ro()
in the absence of a better implementation.
You mean set_memory_rw() here, right? Although, eliding the TLB invalidation
would open up a window where the vmap mapping is executable and the linear
mapping is writable, which is a bit rubbish.

Will

[1]
https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/commit/?h=for-next/core&id=c55191e96caa9d787e8f682c5e525b7f8172a3b4
Andy Lutomirski
2018-12-05 23:16:33 UTC
Permalink
Post by Will Deacon
Of course, I forgot about the linear mapping. On arm64, we've just queued
support for reflecting changes to read-only permissions in the linear map
[1]. So, whilst the linear map is always non-executable, we will need to
make parts of it writable again when freeing the module.
Post by Andy Lutomirski
After slightly more thought, I suggest renaming VM_IMMEDIATE_UNMAP to
VM_MAY_ADJUST_PERMS or similar. It would have the semantics you want,
but it would also call some arch hooks to put back the direct map
permissions before the flush. Does that seem reasonable? It would
need to be hooked up that implement set_memory_ro(), but that should
be quite easy. If nothing else, it could fall back to set_memory_ro()
in the absence of a better implementation.
You mean set_memory_rw() here, right? Although, eliding the TLB invalidation
would open up a window where the vmap mapping is executable and the linear
mapping is writable, which is a bit rubbish.
Right, and Rick pointed out the same issue. Instead, we should set
the direct map not-present or its ARM equivalent, then do the flush,
then make it RW. I assume this also works on arm and arm64, although
I don't know for sure. On x86, the CPU won't cache not-present PTEs.
Ard Biesheuvel
2018-12-06 07:29:03 UTC
Permalink
Post by Andy Lutomirski
Right, and Rick pointed out the same issue. Instead, we should set
the direct map not-present or its ARM equivalent, then do the flush,
then make it RW. I assume this also works on arm and arm64, although
I don't know for sure. On x86, the CPU won't cache not-present PTEs.
If we are going to unmap the linear alias, why not do it at vmalloc()
time rather than vfree() time?
Will Deacon
2018-12-06 11:10:58 UTC
Permalink
Post by Ard Biesheuvel
If we are going to unmap the linear alias, why not do it at vmalloc()
time rather than vfree() time?
Right, that should be pretty straightforward. We're basically saying that
RO in the vmalloc area implies PROT_NONE in the linear map, so we could
just do this in our set_memory_ro() function.

Will
Andy Lutomirski
2018-12-06 18:53:50 UTC
Permalink
Post by Ard Biesheuvel
If we are going to unmap the linear alias, why not do it at vmalloc()
time rather than vfree() time?
That’s not totally nuts. Do we ever have code that expects __va() to
work on module data? Perhaps crypto code trying to encrypt static
data because our APIs don’t understand virtual addresses. I guess if
highmem is ever used for modules, then we should be fine.

RO instead of not present might be safer. But I do like the idea of
renaming Rick's flag to something like VM_XPFO or VM_NO_DIRECT_MAP and
making it do all of this.

(It seems like some people call it the linear map and some people call
it the direct map. Is there any preference?)
Tycho Andersen
2018-12-06 19:01:15 UTC
Permalink
Post by Andy Lutomirski
RO instead of not present might be safer. But I do like the idea of
renaming Rick's flag to something like VM_XPFO or VM_NO_DIRECT_MAP and
making it do all of this.
Yeah, doing it for everything automatically seemed like it was/is
going to be a lot of work to debug all the corner cases where things
expect memory to be mapped but don't explicitly say it. And in
particular, the XPFO series only does it for user memory, whereas an
additional flag like this would work for extra paranoid allocations
of kernel memory too.

Seems like maybe we should do this for rodata today?
Post by Andy Lutomirski
(It seems like some people call it the linear map and some people call
it the direct map. Is there any preference?)
...and some people call it the physmap :)

Tycho
Andy Lutomirski
2018-12-06 19:19:36 UTC
Permalink
Post by Tycho Andersen
Post by Andy Lutomirski
Post by Ard Biesheuvel
If we are going to unmap the linear alias, why not do it at vmalloc()
time rather than vfree() time?
That’s not totally nuts. Do we ever have code that expects __va() to
work on module data? Perhaps crypto code trying to encrypt static
data because our APIs don’t understand virtual addresses. I guess if
highmem is ever used for modules, then we should be fine.
RO instead of not present might be safer. But I do like the idea of
renaming Rick's flag to something like VM_XPFO or VM_NO_DIRECT_MAP and
making it do all of this.
Yeah, doing it for everything automatically seemed like it was/is
going to be a lot of work to debug all the corner cases where things
expect memory to be mapped but don't explicitly say it. And in
particular, the XPFO series only does it for user memory, whereas an
additional flag like this would work for extra paranoid allocations
of kernel memory too.
I just read the code, and it looks like vmalloc() is already using
highmem (__GFP_HIGHMEM) if available, so, on big x86_32 systems, for
example, we already don't have modules in the direct map.

So I say we go for it. This should be quite simple to implement --
the pageattr code already has almost all the needed logic on x86. The
only arch support we should need is a pair of functions to remove a
vmalloc address range from the address map (if it was present in the
first place) and a function to put it back. On x86, this should only
be a few lines of code.

What do you all think? This should solve most of the problems we have.

If we really wanted to optimize this, we'd make it so that
module_alloc() allocates memory the normal way, then, later on, we
call some function that, all at once, removes the memory from the
direct map and applies the right permissions to the vmalloc alias (or
just makes the vmalloc alias not-present so we can add permissions
later without flushing), and flushes the TLB. And we arrange for
vunmap to zap the vmalloc range, then put the memory back into the
direct map, then free the pages back to the page allocator, with the
flush in the appropriate place.

I don't see why the page allocator needs to know about any of this.
It's already okay with the permissions being changed out from under it
on x86, and it seems fine. Rick, do you want to give some variant of
this a try?
Nadav Amit
2018-12-06 19:39:32 UTC
Permalink
Post by Andy Lutomirski
I don't see why the page allocator needs to know about any of this.
It's already okay with the permissions being changed out from under it
on x86, and it seems fine. Rick, do you want to give some variant of
this a try?
Setting it as read-only may work (and already happens for the read-only
module data). I am not sure about setting it as non-present.

At some point, a discussion about a threat-model, as Rick indicated, would
be required. I presume ROP attacks can easily call set_all_modules_text_rw()
and override all the protections.
Andy Lutomirski
2018-12-06 20:17:13 UTC
Permalink
Post by Nadav Amit
Setting it as read-only may work (and already happens for the read-only
module data). I am not sure about setting it as non-present.
At some point, a discussion about a threat-model, as Rick indicated, would
be required. I presume ROP attacks can easily call set_all_modules_text_rw()
and override all the protections.
I am far from an expert on exploit techniques, but here's a
potentially useful model: let's assume there's an attacker who can
write controlled data to a controlled kernel address but cannot
directly modify control flow. It would be nice for such an attacker
to have a very difficult time of modifying kernel text or of
compromising control flow. So we're assuming a feature like kernel
CET or that the attacker finds it very difficult to do something like
modifying some thread's IRET frame.

Admittedly, for the kernel, this is an odd threat model, since an
attacker can presumably quite easily learn the kernel stack address of
one of their tasks, do some syscall, and then modify their kernel
thread's stack such that it will IRET right back to a fully controlled
register state with RSP pointing at an attacker-supplied kernel stack.
So this threat model gives very strong ROP powers unless we have
either CET or some software technique to harden all the RET
instructions in the kernel.

I wonder if there's a better model to use. Maybe with stack-protector
we get some degree of protection? Or is all of this rather weak
until we have CET or a RAP-like feature?
Nadav Amit
2018-12-06 23:08:02 UTC
Permalink
Post by Andy Lutomirski
I wonder if there's a better model to use. Maybe with stack-protector
we get some degree of protection? Or is all of this rather weak
until we have CET or a RAP-like feature?
I believe that seeing the end-goal would make reasoning about patches
easier, otherwise the complaint “but anyhow it’s all insecure” keeps popping
up.

I’m not sure CET or other CFI would be enough even with this threat-model.
The page-tables (the very least) need to be write-protected, as otherwise
controlled data writes may just modify them. There are various possible
solutions I presume: write_rare for page-tables, hypervisor-assisted
security to obtain physical level NX/RO (a-la Microsoft VBS) or some sort of
hardware enclave.

What do you think?
Edgecombe, Rick P
2018-12-07 03:06:22 UTC
Permalink
Post by Nadav Amit
Setting it as read-only may work (and already happens for the read-only
module data). I am not sure about setting it as non-present.
At some point, a discussion about a threat-model, as Rick indicated, would
be required. I presume ROP attacks can easily call
set_all_modules_text_rw()
and override all the protections.
I am far from an expert on exploit techniques, but here's a
potentially useful model: let's assume there's an attacker who can
write controlled data to a controlled kernel address but cannot
directly modify control flow. It would be nice for such an attacker
to have a very difficult time of modifying kernel text or of
compromising control flow. So we're assuming a feature like kernel
CET or that the attacker finds it very difficult to do something like
modifying some thread's IRET frame.
Admittedly, for the kernel, this is an odd threat model, since an
attacker can presumably quite easily learn the kernel stack address of
one of their tasks, do some syscall, and then modify their kernel
thread's stack such that it will IRET right back to a fully controlled
register state with RSP pointing at an attacker-supplied kernel stack.
So this threat model gives very strong ROP powers, unless we have
either CET or some software technique to harden all the RET
instructions in the kernel.
I wonder if there's a better model to use. Maybe with stack-protector
we get some degree of protection? Or is all of this rather weak
until we have CET or a RAP-like feature?
I believe that seeing the end-goal would make reasoning about patches
easier, otherwise the complaint “but anyhow it’s all insecure” keeps popping
up.
I’m not sure CET or other CFI would be enough even with this threat-model.
The page-tables (the very least) need to be write-protected, as otherwise
controlled data writes may just modify them. There are various possible
solutions I presume: write_rare for page-tables, hypervisor-assisted
security to obtain physical level NX/RO (a-la Microsoft VBS) or some sort of
hardware enclave.
What do you think?
I am not sure which issue you are talking about. I think there are actually two
separate issues whose discussions have merged because their fixes overlap
around the teardown W^X window.

For the W^X stuff I had originally imagined the protection was for when an
attacker has a limited bug that could write to a location in the module space,
but not other locations due to only having the ability to overwrite part of a
pointer or something like that. Then the module could execute the new code
as it ran normally after finishing loading. So that is why I was wondering about
the RW window during load. Still seems generally sensible to enforce W^X though.

I like your idea about something like text_poke to load modules. I think maybe
my modules KASLR patchset could help the above somewhat too since it loads at a
freshly randomized address.

Since the issue with the freed pages before flush (the original source of this
thread) doesn't require a write bug to insert the code, but does require a way
to jump to it, it's kind of the opposite model of the above. So that's why I
think they are different.

I am still learning lots on kernel exploits though, maybe Kees can provide some
better insight here?

Edgecombe, Rick P
2018-12-06 20:19:35 UTC
Permalink
Post by Andy Lutomirski
Post by Tycho Andersen
Post by Andy Lutomirski
Post by Ard Biesheuvel
If we are going to unmap the linear alias, why not do it at vmalloc()
time rather than vfree() time?
That’s not totally nuts. Do we ever have code that expects __va() to
work on module data? Perhaps crypto code trying to encrypt static
data because our APIs don’t understand virtual addresses. I guess if
highmem is ever used for modules, then we should be fine.
RO instead of not present might be safer. But I do like the idea of
renaming Rick's flag to something like VM_XPFO or VM_NO_DIRECT_MAP and
making it do all of this.
Yeah, doing it for everything automatically seemed like it was/is
going to be a lot of work to debug all the corner cases where things
expect memory to be mapped but don't explicitly say it. And in
particular, the XPFO series only does it for user memory, whereas an
additional flag like this would work for extra paranoid allocations
of kernel memory too.
I just read the code, and it looks like vmalloc() is already using
highmem (__GFP_HIGHMEM) if available, so, on big x86_32 systems, for
example, we already don't have modules in the direct map.
So I say we go for it. This should be quite simple to implement --
the pageattr code already has almost all the needed logic on x86. The
only arch support we should need is a pair of functions to remove a
vmalloc address range from the address map (if it was present in the
first place) and a function to put it back. On x86, this should only
be a few lines of code.
What do you all think? This should solve most of the problems we have.
If we really wanted to optimize this, we'd make it so that
module_alloc() allocates memory the normal way, then, later on, we
call some function that, all at once, removes the memory from the
direct map and applies the right permissions to the vmalloc alias (or
just makes the vmalloc alias not-present so we can add permissions
later without flushing), and flushes the TLB. And we arrange for
vunmap to zap the vmalloc range, then put the memory back into the
direct map, then free the pages back to the page allocator, with the
flush in the appropriate place.
I don't see why the page allocator needs to know about any of this.
It's already okay with the permissions being changed out from under it
on x86, and it seems fine. Rick, do you want to give some variant of
this a try?
Hi,

Sorry, I've been having email troubles today.

I found some cases where vmap with PAGE_KERNEL_RO happens, which would not set
NP/RO in the directmap, so it would be sort of inconsistent whether the
directmap of vmalloc range allocations was readable or not. I couldn't see any
places where it would cause problems today though.

I was ready to assume that no TLBs cache NP (not-present) entries, because I don't know how
usages where a page fault is used to load something could work without lots of
flushes. If that's the case, then all archs with directmap permissions could
share a single vmalloc special permission flush implementation that works like
Andy described originally. It could be controlled with an
ARCH_HAS_DIRECT_MAP_PERMS. We would just need something like set_pages_np and
set_pages_rw on any archs with directmap permissions. So seems simpler to me
(and what I have been doing) unless I'm missing the problem.

If you all think so I can indeed take a shot at it, I just don't see what the
problem was with the original solution, which seems less likely to break.
Andy Lutomirski
2018-12-06 20:26:18 UTC
Permalink
On Thu, Dec 6, 2018 at 12:20 PM Edgecombe, Rick P
Post by Edgecombe, Rick P
Post by Andy Lutomirski
Post by Tycho Andersen
Post by Andy Lutomirski
Post by Ard Biesheuvel
If we are going to unmap the linear alias, why not do it at vmalloc()
time rather than vfree() time?
That’s not totally nuts. Do we ever have code that expects __va() to
work on module data? Perhaps crypto code trying to encrypt static
data because our APIs don’t understand virtual addresses. I guess if
highmem is ever used for modules, then we should be fine.
RO instead of not present might be safer. But I do like the idea of
renaming Rick's flag to something like VM_XPFO or VM_NO_DIRECT_MAP and
making it do all of this.
Yeah, doing it for everything automatically seemed like it was/is
going to be a lot of work to debug all the corner cases where things
expect memory to be mapped but don't explicitly say it. And in
particular, the XPFO series only does it for user memory, whereas an
additional flag like this would work for extra paranoid allocations
of kernel memory too.
I just read the code, and it looks like vmalloc() is already using
highmem (__GFP_HIGHMEM) if available, so, on big x86_32 systems, for
example, we already don't have modules in the direct map.
So I say we go for it. This should be quite simple to implement --
the pageattr code already has almost all the needed logic on x86. The
only arch support we should need is a pair of functions to remove a
vmalloc address range from the address map (if it was present in the
first place) and a function to put it back. On x86, this should only
be a few lines of code.
What do you all think? This should solve most of the problems we have.
If we really wanted to optimize this, we'd make it so that
module_alloc() allocates memory the normal way, then, later on, we
call some function that, all at once, removes the memory from the
direct map and applies the right permissions to the vmalloc alias (or
just makes the vmalloc alias not-present so we can add permissions
later without flushing), and flushes the TLB. And we arrange for
vunmap to zap the vmalloc range, then put the memory back into the
direct map, then free the pages back to the page allocator, with the
flush in the appropriate place.
I don't see why the page allocator needs to know about any of this.
It's already okay with the permissions being changed out from under it
on x86, and it seems fine. Rick, do you want to give some variant of
this a try?
Hi,
Sorry, I've been having email troubles today.
I found some cases where vmap with PAGE_KERNEL_RO happens, which would not set
NP/RO in the directmap, so it would be sort of inconsistent whether the
directmap of vmalloc range allocations were readable or not. I couldn't see any
places where it would cause problems today though.
I was ready to assume that all TLBs don't cache NP, because I don't know how
usages where a page fault is used to load something could work without lots of
flushes.
Or the architecture just fixes up the spurious faults, I suppose. I'm
only well-educated on the x86 mmu.
Post by Edgecombe, Rick P
If that's the case, then all archs with directmap permissions could
share a single vmalloc special permission flush implementation that works like
Andy described originally. It could be controlled with an
ARCH_HAS_DIRECT_MAP_PERMS. We would just need something like set_pages_np and
set_pages_rw on any archs with directmap permissions. So seems simpler to me
(and what I have been doing) unless I'm missing the problem.
Hmm. The only reason I've proposed anything fancier was because I was
thinking of minimizing flushes, but I think I'm being silly. This
sequence ought to work optimally:

- vmalloc(..., VM_HAS_DIRECT_MAP_PERMS); /* no flushes */

- Write some data, via vmalloc's return address.

- Use some set_memory_whatever() functions to update permissions,
which will flush, hopefully just once.

- Run the module code!

- vunmap -- this will do a single flush that will fix everything.

This does require that set_pages_np() or set_memory_np() or whatever
exists and that it's safe to do that, then flush, and then
set_pages_rw(). So maybe you want set_pages_np_noflush() and
set_pages_rw_noflush() to make it totally clear what's supposed to
happen.

--Andy
Ard Biesheuvel
2018-12-06 19:04:09 UTC
Permalink
Post by Andy Lutomirski
Post by Nadav Amit
Post by Andy Lutomirski
Post by Will Deacon
Post by Andy Lutomirski
On Tue, Dec 4, 2018 at 12:02 PM Edgecombe, Rick P
Post by Edgecombe, Rick P
Post by Will Deacon
Post by Nadav Amit
Post by Rick Edgecombe
Since vfree will lazily flush the TLB, but not lazily free the underlying
pages, it often leaves stale TLB entries to freed pages that could get re-used.
This is undesirable for cases where the memory being freed has special
permissions such as executable.
So I am trying to finish my patch-set for preventing transient W+X mappings
from taking space, by handling kprobes & ftrace that I missed (thanks again for
pointing it out).
But all of the sudden, I don’t understand why we have the problem that this
(your) patch-set deals with at all. We already change the mappings to make
the memory writable before freeing the memory, so why can’t we make it
non-executable at the same time? Actually, why do we make the module memory,
including its data executable before freeing it???
Yeah, this is really confusing, but I have a suspicion it's a combination
of the various different configurations and hysterical raisins. We can't
rely on module_alloc() allocating from the vmalloc area (see nios2) nor
can we rely on disable_ro_nx() being available at build time.
If we *could* rely on module allocations always using vmalloc(), then
we could pass in Rick's new flag and drop disable_ro_nx() altogether
afaict -- who cares about the memory attributes of a mapping that's about
to disappear anyway?
Is it just nios2 that does something different?
Yea it is really intertwined. I think for x86, set_memory_nx everywhere would
solve it as well, in fact that was what I first thought the solution should be
until this was suggested. It's interesting that from the other thread Masami
Hiramatsu referenced, set_memory_nx was suggested last year and would have
inadvertently blocked this on x86. But, on the other architectures I have since
learned it is a bit different.
It looks like actually most archs don't re-define set_memory_*, and so all of
the frob_* functions are actually just noops. In which case allocating RWX is
needed to make it work at all, because that is what the allocation is going to
stay at. So in these archs, set_memory_nx won't solve it because it will do
nothing.
On x86 I think you cannot get rid of disable_ro_nx fully because there is the
changing of the permissions on the directmap as well. You don't want some other
caller getting a page that was left RO when freed and then trying to write to
it, if I understand this.
Exactly.
Of course, I forgot about the linear mapping. On arm64, we've just queued
support for reflecting changes to read-only permissions in the linear map
[1]. So, whilst the linear map is always non-executable, we will need to
make parts of it writable again when freeing the module.
Post by Andy Lutomirski
After slightly more thought, I suggest renaming VM_IMMEDIATE_UNMAP to
VM_MAY_ADJUST_PERMS or similar. It would have the semantics you want,
but it would also call some arch hooks to put back the direct map
permissions before the flush. Does that seem reasonable? It would
need to be hooked up for the architectures that implement set_memory_ro(), but that should
be quite easy. If nothing else, it could fall back to set_memory_ro()
in the absence of a better implementation.
You mean set_memory_rw() here, right? Although, eliding the TLB invalidation
would open up a window where the vmap mapping is executable and the linear
mapping is writable, which is a bit rubbish.
Right, and Rick pointed out the same issue. Instead, we should set
the direct map not-present or its ARM equivalent, then do the flush,
then make it RW. I assume this also works on arm and arm64, although
I don't know for sure. On x86, the CPU won't cache not-present PTEs.
If we are going to unmap the linear alias, why not do it at vmalloc()
time rather than vfree() time?
That’s not totally nuts. Do we ever have code that expects __va() to
work on module data? Perhaps crypto code trying to encrypt static
data because our APIs don’t understand virtual addresses. I guess if
highmem is ever used for modules, then we should be fine.
The crypto code shouldn't care, but I think it will probably break hibernate :-(
Post by Andy Lutomirski
RO instead of not present might be safer. But I do like the idea of
renaming Rick's flag to something like VM_XPFO or VM_NO_DIRECT_MAP and
making it do all of this.
(It seems like some people call it the linear map and some people call
it the direct map. Is there any preference?)
Either is fine with me.
Andy Lutomirski
2018-12-06 19:20:51 UTC
Permalink
On Thu, Dec 6, 2018 at 11:04 AM Ard Biesheuvel
Post by Ard Biesheuvel
Post by Andy Lutomirski
That’s not totally nuts. Do we ever have code that expects __va() to
work on module data? Perhaps crypto code trying to encrypt static
data because our APIs don’t understand virtual addresses. I guess if
highmem is ever used for modules, then we should be fine.
The crypto code shouldn't care, but I think it will probably break hibernate :-(
How so? Hibernate works (or at least should work) on x86 PAE, where
__va doesn't work on module data, and, on x86, the direct map has some
RO parts where the module is, so hibernate can't be writing to
the memory through the direct map with its final permissions.
Ard Biesheuvel
2018-12-06 19:23:20 UTC
Permalink
Post by Andy Lutomirski
On Thu, Dec 6, 2018 at 11:04 AM Ard Biesheuvel
Post by Ard Biesheuvel
Post by Andy Lutomirski
That’s not totally nuts. Do we ever have code that expects __va() to
work on module data? Perhaps crypto code trying to encrypt static
data because our APIs don’t understand virtual addresses. I guess if
highmem is ever used for modules, then we should be fine.
The crypto code shouldn't care, but I think it will probably break hibernate :-(
How so? Hibernate works (or at least should work) on x86 PAE, where
__va doesn't work on module data, and, on x86, the direct map has some
RO parts where the module is, so hibernate can't be writing to
the memory through the direct map with its final permissions.
On arm64 at least, hibernate reads the contents of memory via the
linear mapping. Not sure about other arches.
Will Deacon
2018-12-06 19:31:08 UTC
Permalink
Post by Ard Biesheuvel
Post by Andy Lutomirski
On Thu, Dec 6, 2018 at 11:04 AM Ard Biesheuvel
Post by Ard Biesheuvel
Post by Andy Lutomirski
That’s not totally nuts. Do we ever have code that expects __va() to
work on module data? Perhaps crypto code trying to encrypt static
data because our APIs don’t understand virtual addresses. I guess if
highmem is ever used for modules, then we should be fine.
The crypto code shouldn't care, but I think it will probably break hibernate :-(
How so? Hibernate works (or at least should work) on x86 PAE, where
__va doesn't work on module data, and, on x86, the direct map has some
RO parts where the module is, so hibernate can't be writing to
the memory through the direct map with its final permissions.
On arm64 at least, hibernate reads the contents of memory via the
linear mapping. Not sure about other arches.
Can we handle this like the DEBUG_PAGEALLOC case, and extract the pfn from
the pte when we see that it's PROT_NONE?

Will
Ard Biesheuvel
2018-12-06 19:36:28 UTC
Permalink
Post by Will Deacon
Post by Ard Biesheuvel
Post by Andy Lutomirski
On Thu, Dec 6, 2018 at 11:04 AM Ard Biesheuvel
Post by Ard Biesheuvel
Post by Andy Lutomirski
That’s not totally nuts. Do we ever have code that expects __va() to
work on module data? Perhaps crypto code trying to encrypt static
data because our APIs don’t understand virtual addresses. I guess if
highmem is ever used for modules, then we should be fine.
The crypto code shouldn't care, but I think it will probably break hibernate :-(
How so? Hibernate works (or at least should work) on x86 PAE, where
__va doesn't work on module data, and, on x86, the direct map has some
RO parts where the module is, so hibernate can't be writing to
the memory through the direct map with its final permissions.
On arm64 at least, hibernate reads the contents of memory via the
linear mapping. Not sure about other arches.
Can we handle this like the DEBUG_PAGEALLOC case, and extract the pfn from
the pte when we see that it's PROT_NONE?
As long as we can easily figure out whether a certain linear address
is mapped or not, having a special case like that for these mappings
doesn't sound unreasonable.
Nadav Amit
2018-12-04 20:36:44 UTC
Permalink
Post by Edgecombe, Rick P
Post by Will Deacon
Post by Nadav Amit
Post by Rick Edgecombe
Since vfree will lazily flush the TLB, but not lazily free the underlying pages,
it often leaves stale TLB entries to freed pages that could get re-used. This is
undesirable for cases where the memory being freed has special permissions such
as executable.
So I am trying to finish my patch-set for preventing transient W+X mappings
from taking space, by handling kprobes & ftrace that I missed (thanks again for
pointing it out).
But all of the sudden, I don’t understand why we have the problem that this
(your) patch-set deals with at all. We already change the mappings to make
the memory writable before freeing the memory, so why can’t we make it
non-executable at the same time? Actually, why do we make the module memory,
including its data executable before freeing it???
Yeah, this is really confusing, but I have a suspicion it's a combination
of the various different configurations and hysterical raisins. We can't
rely on module_alloc() allocating from the vmalloc area (see nios2) nor
can we rely on disable_ro_nx() being available at build time.
If we *could* rely on module allocations always using vmalloc(), then
we could pass in Rick's new flag and drop disable_ro_nx() altogether
afaict -- who cares about the memory attributes of a mapping that's about
to disappear anyway?
Is it just nios2 that does something different?
Will
Yea it is really intertwined. I think for x86, set_memory_nx everywhere would
solve it as well, in fact that was what I first thought the solution should be
until this was suggested. It's interesting that from the other thread Masami
Hiramatsu referenced, set_memory_nx was suggested last year and would have
inadvertently blocked this on x86. But, on the other architectures I have since
learned it is a bit different.
It looks like actually most archs don't re-define set_memory_*, and so all of
the frob_* functions are actually just noops. In which case allocating RWX is
needed to make it work at all, because that is what the allocation is going to
stay at. So in these archs, set_memory_nx won't solve it because it will do
nothing.
On x86 I think you cannot get rid of disable_ro_nx fully because there is the
changing of the permissions on the directmap as well. You don't want some other
caller getting a page that was left RO when freed and then trying to write to
it, if I understand this.
The other reasoning was that calling set_memory_nx isn't doing what we are
actually trying to do, which is prevent the pages from getting released too
early.
A more clear solution for all of this might involve refactoring some of the
set_memory_ de-allocation logic out into __weak functions in either modules or
vmalloc. As Jessica points out in the other thread though, modules does a lot
more stuff there than the other module_alloc callers. I think it may take some
thought to centralize AND make it optimal for every module_alloc/vmalloc_exec
user and arch.
But for now with the change in vmalloc, we can block the executable mapping
freed page re-use issue in a cross platform way.
Please understand me correctly - I didn’t mean that your patches are not
needed.

All I did was ask - how come the PTEs are executable when they are cleared,
when in fact we manipulate them anyway when the module is removed?

I think I try to deal with a similar problem to the one you encounter -
broken W^X. The only thing that bothered me in regard to your patches (and
only after I played with the code) is that there is still a time-window in
which W^X is broken due to disable_ro_nx().
Edgecombe, Rick P
2018-12-04 23:51:38 UTC
Permalink
Post by Nadav Amit
Post by Edgecombe, Rick P
Post by Will Deacon
Post by Nadav Amit
On Nov 27, 2018, at 4:07 PM, Rick Edgecombe <
Since vfree will lazily flush the TLB, but not lazily free the underlying
pages, it often leaves stale TLB entries to freed pages that could get re-used.
This is undesirable for cases where the memory being freed has special
permissions such as executable.
So I am trying to finish my patch-set for preventing transient W+X mappings
from taking space, by handling kprobes & ftrace that I missed (thanks again
for pointing it out).
But all of the sudden, I don’t understand why we have the problem that this
(your) patch-set deals with at all. We already change the mappings to make
the memory writable before freeing the memory, so why can’t we make it
non-executable at the same time? Actually, why do we make the module memory,
including its data executable before freeing it???
Yeah, this is really confusing, but I have a suspicion it's a combination
of the various different configurations and hysterical raisins. We can't
rely on module_alloc() allocating from the vmalloc area (see nios2) nor
can we rely on disable_ro_nx() being available at build time.
If we *could* rely on module allocations always using vmalloc(), then
we could pass in Rick's new flag and drop disable_ro_nx() altogether
afaict -- who cares about the memory attributes of a mapping that's about
to disappear anyway?
Is it just nios2 that does something different?
Will
Yea it is really intertwined. I think for x86, set_memory_nx everywhere would
solve it as well, in fact that was what I first thought the solution should be
until this was suggested. It's interesting that from the other thread Masami
Hiramatsu referenced, set_memory_nx was suggested last year and would have
inadvertently blocked this on x86. But, on the other architectures I have since
learned it is a bit different.
It looks like actually most archs don't re-define set_memory_*, and so all of
the frob_* functions are actually just noops. In which case allocating RWX is
needed to make it work at all, because that is what the allocation is going to
stay at. So in these archs, set_memory_nx won't solve it because it will do
nothing.
On x86 I think you cannot get rid of disable_ro_nx fully because there is the
changing of the permissions on the directmap as well. You don't want some other
caller getting a page that was left RO when freed and then trying to write to
it, if I understand this.
The other reasoning was that calling set_memory_nx isn't doing what we are
actually trying to do, which is prevent the pages from getting released too
early.
A more clear solution for all of this might involve refactoring some of the
set_memory_ de-allocation logic out into __weak functions in either modules or
vmalloc. As Jessica points out in the other thread though, modules does a lot
more stuff there than the other module_alloc callers. I think it may take some
thought to centralize AND make it optimal for every
module_alloc/vmalloc_exec
user and arch.
But for now with the change in vmalloc, we can block the executable mapping
freed page re-use issue in a cross platform way.
Please understand me correctly - I didn’t mean that your patches are not
needed.
Ok, I think I understand. I have been pondering these same things after Masami
Hiramatsu's comments on this thread the other day.
Post by Nadav Amit
All I did was ask - how come the PTEs are executable when they are cleared,
when in fact we manipulate them anyway when the module is removed?
I think the directmap used to be RWX, so maybe historically it's trying to
return it to its default state? Not sure.
Post by Nadav Amit
I think I try to deal with a similar problem to the one you encounter -
broken W^X. The only thing that bothered me in regard to your patches (and
only after I played with the code) is that there is still a time-window in
which W^X is broken due to disable_ro_nx().
Totally agree there is overlap in the fixes and we should sync.

What do you think about Andy's suggestion for doing the vfree cleanup in vmalloc
with arch hooks? So the allocation goes into vfree fully set up, and vmalloc
frees it and on x86 resets the direct map.
Nadav Amit
2018-12-05 00:01:38 UTC
Permalink
Post by Edgecombe, Rick P
Post by Nadav Amit
Post by Edgecombe, Rick P
Post by Will Deacon
Post by Nadav Amit
On Nov 27, 2018, at 4:07 PM, Rick Edgecombe <
Since vfree will lazily flush the TLB, but not lazily free the underlying
pages, it often leaves stale TLB entries to freed pages that could get re-used.
This is undesirable for cases where the memory being freed has special
permissions such as executable.
So I am trying to finish my patch-set for preventing transient W+X mappings
from taking space, by handling kprobes & ftrace that I missed (thanks again
for pointing it out).
But all of the sudden, I don’t understand why we have the problem that this
(your) patch-set deals with at all. We already change the mappings to make
the memory writable before freeing the memory, so why can’t we make it
non-executable at the same time? Actually, why do we make the module memory,
including its data executable before freeing it???
Yeah, this is really confusing, but I have a suspicion it's a combination
of the various different configurations and hysterical raisins. We can't
rely on module_alloc() allocating from the vmalloc area (see nios2) nor
can we rely on disable_ro_nx() being available at build time.
If we *could* rely on module allocations always using vmalloc(), then
we could pass in Rick's new flag and drop disable_ro_nx() altogether
afaict -- who cares about the memory attributes of a mapping that's about
to disappear anyway?
Is it just nios2 that does something different?
Will
Yea it is really intertwined. I think for x86, set_memory_nx everywhere would
solve it as well, in fact that was what I first thought the solution should be
until this was suggested. It's interesting that from the other thread Masami
Hiramatsu referenced, set_memory_nx was suggested last year and would have
inadvertently blocked this on x86. But, on the other architectures I have since
learned it is a bit different.
It looks like actually most archs don't re-define set_memory_*, and so all of
the frob_* functions are actually just noops. In which case allocating RWX is
needed to make it work at all, because that is what the allocation is going to
stay at. So in these archs, set_memory_nx won't solve it because it will do
nothing.
On x86 I think you cannot get rid of disable_ro_nx fully because there is the
changing of the permissions on the directmap as well. You don't want some other
caller getting a page that was left RO when freed and then trying to write to
it, if I understand this.
The other reasoning was that calling set_memory_nx isn't doing what we are
actually trying to do, which is prevent the pages from getting released too
early.
A more clear solution for all of this might involve refactoring some of the
set_memory_ de-allocation logic out into __weak functions in either modules or
vmalloc. As Jessica points out in the other thread though, modules does a lot
more stuff there than the other module_alloc callers. I think it may take some
thought to centralize AND make it optimal for every
module_alloc/vmalloc_exec
user and arch.
But for now with the change in vmalloc, we can block the executable mapping
freed page re-use issue in a cross platform way.
Please understand me correctly - I didn’t mean that your patches are not
needed.
Ok, I think I understand. I have been pondering these same things after Masami
Hiramatsu's comments on this thread the other day.
Post by Nadav Amit
All I did is asking - how come the PTEs are executable when they are cleared
they are executable, when in fact we manipulate them when the module is
removed.
I think the direct map used to be RWX, so maybe historically it's trying to return
it to its default state? Not sure.
Post by Nadav Amit
I think I try to deal with a similar problem to the one you encounter -
broken W^X. The only thing that bothered me in regard to your patches (and
only after I played with the code) is that there is still a time-window in
which W^X is broken due to disable_ro_nx().
Totally agree there is overlap in the fixes and we should sync.
What do you think about Andy's suggestion for doing the vfree cleanup in vmalloc
with arch hooks? So the allocation goes into vfree fully set up, and vmalloc
frees it and, on x86, resets the direct map.
As long as you do it, I have no problem ;-)

You would need to consider all the callers of module_memfree(), and probably
to untangle at least part of the mess in pageattr.c . If you are up to it,
just say so, and I’ll drop this patch. All I can say is “good luck with all
that”.
Edgecombe, Rick P
2018-12-05 00:29:59 UTC
Permalink
Post by Nadav Amit
As long as you do it, I have no problem ;-)
You would need to consider all the callers of module_memfree(), and probably
to untangle at least part of the mess in pageattr.c . If you are up to it,
just say so, and I’ll drop this patch. All I can say is “good luck with all
that”.
I thought you were trying to prevent having any memory that at any time was W+X;
how does vfree help with the module load time issues, where it starts WRX on
x86?
Nadav Amit
2018-12-05 00:53:03 UTC
Permalink
Post by Edgecombe, Rick P
I thought you were trying to prevent having any memory that at any time was W+X,
how does vfree help with the module load time issues, where it starts WRX on
x86?
I didn’t say it does. The patch I submitted before [1] should deal with the
issue of module loading, and I still think it is required. I also addressed
the kprobe and ftrace issues that you raised.

Perhaps it makes more sense that I will include the patch I proposed for
module cleanup to make the patch-set “complete”. If you finish the changes
you propose before the patch is applied, it could be dropped. I just want to
get rid of this series, as it keeps collecting more and more patches.

I suspect it will not be the last version anyhow.

[1] https://lkml.org/lkml/2018/11/21/305
Edgecombe, Rick P
2018-12-05 01:45:10 UTC
Permalink
Post by Nadav Amit
I didn’t say it does. The patch I submitted before [1] should deal with the
issue of module loading, and I still think it is required. I also addressed
the kprobe and ftrace issues that you raised.
Perhaps it makes more sense that I will include the patch I proposed for
module cleanup to make the patch-set “complete”. If you finish the changes
you propose before the patch is applied, it could be dropped. I just want to
get rid of this series, as it keeps collecting more and more patches.
I suspect it will not be the last version anyhow.
[1] https://lkml.org/lkml/2018/11/21/305
That seems fine.

And not to make it any more complicated, but how different is a W+X mapping
from a RW mapping that is about to turn X? Can't an attacker with the ability to
write to the module space just write and wait a short time? If that is the
threat model, I think there may still be additional work to do here even after
you found all the W+X cases.

I'll take a shot at what Andy suggested in the next few days.

Thanks,
Nadav Amit
2018-12-05 02:09:04 UTC
Permalink
Post by Edgecombe, Rick P
That seems fine.
And not to make it any more complicated, but how much different is a W+X mapping
from a RW mapping that is about to turn X? Can't an attacker with the ability to
write to the module space just write and wait a short time? If that is the
threat model, I think there may still be additional work to do here even after
you found all the W+X cases.
I agree that a complete solution may require blocking any direct write to
a code page. When I say “complete”, I mean for a threat model in which
dangling pointers are used to inject code, but not to run existing ROP/JOP
gadgets. (I didn’t think too deeply on the threat-model, so perhaps it needs
to be further refined).

I think the first stage is to make everybody go through a unified interface
(text_poke() and text_poke_early()). ftrace, for example, uses an
independent mechanism to change the code.

Eventually, after boot text_poke_early() should not be used, and text_poke()
(or something similar) should be used instead. Alternatively, when module
text is loaded, a hash value can be computed over it and checked later.

Since Igor Stoppa wants to use the infrastructure that is included in the
first patches, and since I didn’t intend this patch-set to be a full
solution for W^X (I was pushed there by tglx+Andy [1]), it may be enough
as a first step.

[1] https://lore.kernel.org/patchwork/patch/1006293/#1191341
Andy Lutomirski
2018-12-04 18:56:03 UTC
Permalink
Post by Nadav Amit
So I am trying to finish my patch-set for preventing transient W+X mappings
from taking space, by handling kprobes & ftrace that I missed (thanks again for
pointing it out).
But all of the sudden, I don’t understand why we have the problem that this
(your) patch-set deals with at all. We already change the mappings to make
the memory writable before freeing the memory, so why can’t we make it
non-executable at the same time? Actually, why do we make the module memory,
including its data executable before freeing it???
All the code you're looking at is IMO a very awkward and possibly
incorrect way of doing what's actually necessary: putting the direct map
back the way it wants to be.

Can't we shove this entire mess into vunmap? Have a flag (as part
of vmalloc like in Rick's patch or as a flag passed to a vfree variant
directly) that makes the vunmap code that frees the underlying pages
also reset their permissions?

Right now, we muck with set_memory_rw() and set_memory_nx(), which
both have very awkward (and inconsistent with each other!) semantics
when called on vmalloc memory. And they have their own flushes, which
is inefficient. Maybe the right solution is for vunmap to remove the
vmap area PTEs, call into a function like set_memory_rw() that resets
the direct maps to their default permissions *without* flushing, and
then to do a single flush for everything. Or, even better, to cause
the change_page_attr code to do the flush and also to flush the vmap
area all at once so that very small free operations can flush single
pages instead of flushing globally.
Nadav Amit
2018-12-04 19:44:58 UTC
Permalink
Post by Andy Lutomirski
All the code you're looking at is IMO a very awkward and possibly
incorrect of doing what's actually necessary: putting the direct map
the way it wants to be.
Can't we shove this entirely mess into vunmap? Have a flag (as part
of vmalloc like in Rick's patch or as a flag passed to a vfree variant
directly) that makes the vunmap code that frees the underlying pages
also reset their permissions?
Right now, we muck with set_memory_rw() and set_memory_nx(), which
both have very awkward (and inconsistent with each other!) semantics
when called on vmalloc memory. And they have their own flushes, which
is inefficient. Maybe the right solution is for vunmap to remove the
vmap area PTEs, call into a function like set_memory_rw() that resets
the direct maps to their default permissions *without* flushing, and
then to do a single flush for everything. Or, even better, to cause
the change_page_attr code to do the flush and also to flush the vmap
area all at once so that very small free operations can flush single
pages instead of flushing globally.
Thanks for the explanation. I read it just after I realized that indeed the
whole purpose of this code is to get cpa_process_alias() to
update the corresponding direct mapping.

This thing (pageattr.c) indeed seems over-engineered and very unintuitive.
Right now I have a list of patch-sets that I owe, so I don’t have the time
to deal with it.

But, I still think that disable_ro_nx() should not call set_memory_x().
IIUC, this breaks W^X for the direct mapping which corresponds with the module
memory. Does it ever stop being W+X?? I’ll have another look.
Andy Lutomirski
2018-12-04 19:48:42 UTC
Permalink
Post by Nadav Amit
But, I still think that disable_ro_nx() should not call set_memory_x().
IIUC, this breaks W+X of the direct-mapping which correspond with the module
memory. Does it ever stop being W+X?? I’ll have another look.
Dunno. I did once chase down a bug where some memory got freed while
it was still read-only, and the results were hilarious and hard to
debug, since the explosion happened long after the buggy code
finished.

--Andy
Nadav Amit
2018-12-04 22:48:40 UTC
Permalink
Post by Andy Lutomirski
Dunno. I did once chase down a bug where some memory got freed while
it was still read-only, and the results were hilarious and hard to
debug, since the explosion happened long after the buggy code
finished.
This piece of code causes me pain and misery.

So, it turns out that the direct map is *not* changed if you just change
the NX-bit. See change_page_attr_set_clr():

/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;

How many levels of abstraction are broken along the way? What would happen
if somebody tries to change the NX bit and some other bit in the PTE?
Luckily, I don’t think anyone does… at least for now.

So, again, I think the change I proposed makes sense. nios2 does not have
set_memory_x() and it will not be affected.

[ I can add a comment, although I don’t know if nios2 has an NX bit,
and I can’t find any code that defines its PTEs. Actually, where is pte_present()
for nios2 defined? Whatever. ]
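For what it’s worth, the `checkalias` line quoted above can be modeled in a few lines. This is an illustrative userspace sketch, not kernel code: the bit values below are made up (the real constants live in arch/x86/include/asm/pgtable_types.h), but the logic mirrors the check, and it also answers the "NX plus some other bit" question — such a change would in fact go through alias processing:

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical pgprot bit values, for illustration only -- the real
 * constants live in arch/x86/include/asm/pgtable_types.h. */
#define _PAGE_RW 0x2ULL
#define _PAGE_NX 0x8000000000000000ULL

/* Mirrors the test in change_page_attr_set_clr(): alias (direct-map)
 * processing is skipped only when the *sole* bit being set or cleared
 * is _PAGE_NX. */
static bool cpa_checks_aliases(uint64_t mask_set, uint64_t mask_clr)
{
	return (mask_set | mask_clr) != _PAGE_NX;
}
```

So set_memory_x()/set_memory_nx() alone never touch the direct map, while any change that includes one more bit would.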
Andy Lutomirski
2018-12-04 23:27:35 UTC
Post by Nadav Amit
Post by Andy Lutomirski
Post by Nadav Amit
Post by Andy Lutomirski
Post by Nadav Amit
Post by Rick Edgecombe
Since vfree will lazily flush the TLB, but not lazily free the underlying pages,
it often leaves stale TLB entries to freed pages that could get re-used. This is
undesirable for cases where the memory being freed has special permissions such
as executable.
So I am trying to finish my patch-set for preventing transient W+X mappings
from taking space, by handling kprobes & ftrace that I missed (thanks again for
pointing it out).
But all of the sudden, I don’t understand why we have the problem that this
(your) patch-set deals with at all. We already change the mappings to make
the memory writable before freeing the memory, so why can’t we make it
non-executable at the same time? Actually, why do we make the module memory,
including its data executable before freeing it???
All the code you're looking at is IMO a very awkward and possibly
incorrect of doing what's actually necessary: putting the direct map
the way it wants to be.
Can't we shove this entirely mess into vunmap? Have a flag (as part
of vmalloc like in Rick's patch or as a flag passed to a vfree variant
directly) that makes the vunmap code that frees the underlying pages
also reset their permissions?
Right now, we muck with set_memory_rw() and set_memory_nx(), which
both have very awkward (and inconsistent with each other!) semantics
when called on vmalloc memory. And they have their own flushes, which
is inefficient. Maybe the right solution is for vunmap to remove the
vmap area PTEs, call into a function like set_memory_rw() that resets
the direct maps to their default permissions *without* flushing, and
then to do a single flush for everything. Or, even better, to cause
the change_page_attr code to do the flush and also to flush the vmap
area all at once so that very small free operations can flush single
pages instead of flushing globally.
Thanks for the explanation. I read it just after I realized that indeed the
whole purpose of this code is to get cpa_process_alias()
update the corresponding direct mapping.
This thing (pageattr.c) indeed seems over-engineered and very unintuitive.
Right now I have a list of patch-sets that I owe, so I don’t have the time
to deal with it.
But, I still think that disable_ro_nx() should not call set_memory_x().
IIUC, this breaks W+X of the direct-mapping which correspond with the module
memory. Does it ever stop being W+X?? I’ll have another look.
Dunno. I did once chase down a bug where some memory got freed while
it was still read-only, and the results were hilarious and hard to
debug, since the explosion happened long after the buggy code
finished.
This piece of code causes me pain and misery.
So, it turns out that the direct map is *not* changed if you just change
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
How many levels of abstraction are broken in the way? What would happen
if somebody tries to change the NX-bit and some other bit in the PTE?
Luckily, I don’t think someone does… at least for now.
So, again, I think the change I proposed makes sense. nios2 does not have
set_memory_x() and it will not be affected.
[ I can add a comment, although I don’t have know if nios2 has an NX bit,
and I don’t find any code that defines PTEs. Actually where is pte_present()
of nios2 being defined? Whatever. ]
At least rename the function, then. The last thing we need is for disable_ro_nx to *enable* NX.
Nadav Amit
2018-12-04 23:34:09 UTC
Post by Andy Lutomirski
Post by Nadav Amit
Post by Andy Lutomirski
Post by Nadav Amit
Post by Andy Lutomirski
Post by Nadav Amit
Post by Rick Edgecombe
Since vfree will lazily flush the TLB, but not lazily free the underlying pages,
it often leaves stale TLB entries to freed pages that could get re-used. This is
undesirable for cases where the memory being freed has special permissions such
as executable.
So I am trying to finish my patch-set for preventing transient W+X mappings
from taking space, by handling kprobes & ftrace that I missed (thanks again for
pointing it out).
But all of the sudden, I don’t understand why we have the problem that this
(your) patch-set deals with at all. We already change the mappings to make
the memory writable before freeing the memory, so why can’t we make it
non-executable at the same time? Actually, why do we make the module memory,
including its data executable before freeing it???
All the code you're looking at is IMO a very awkward and possibly
incorrect of doing what's actually necessary: putting the direct map
the way it wants to be.
Can't we shove this entirely mess into vunmap? Have a flag (as part
of vmalloc like in Rick's patch or as a flag passed to a vfree variant
directly) that makes the vunmap code that frees the underlying pages
also reset their permissions?
Right now, we muck with set_memory_rw() and set_memory_nx(), which
both have very awkward (and inconsistent with each other!) semantics
when called on vmalloc memory. And they have their own flushes, which
is inefficient. Maybe the right solution is for vunmap to remove the
vmap area PTEs, call into a function like set_memory_rw() that resets
the direct maps to their default permissions *without* flushing, and
then to do a single flush for everything. Or, even better, to cause
the change_page_attr code to do the flush and also to flush the vmap
area all at once so that very small free operations can flush single
pages instead of flushing globally.
Thanks for the explanation. I read it just after I realized that indeed the
whole purpose of this code is to get cpa_process_alias()
update the corresponding direct mapping.
This thing (pageattr.c) indeed seems over-engineered and very unintuitive.
Right now I have a list of patch-sets that I owe, so I don’t have the time
to deal with it.
But, I still think that disable_ro_nx() should not call set_memory_x().
IIUC, this breaks W+X of the direct-mapping which correspond with the module
memory. Does it ever stop being W+X?? I’ll have another look.
Dunno. I did once chase down a bug where some memory got freed while
it was still read-only, and the results were hilarious and hard to
debug, since the explosion happened long after the buggy code
finished.
This piece of code causes me pain and misery.
So, it turns out that the direct map is *not* changed if you just change
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
How many levels of abstraction are broken in the way? What would happen
if somebody tries to change the NX-bit and some other bit in the PTE?
Luckily, I don’t think someone does… at least for now.
So, again, I think the change I proposed makes sense. nios2 does not have
set_memory_x() and it will not be affected.
[ I can add a comment, although I don’t have know if nios2 has an NX bit,
and I don’t find any code that defines PTEs. Actually where is pte_present()
of nios2 being defined? Whatever. ]
At least rename the function, then. The last thing we need is for
disable_ro_nx to *enable* NX.
The code is so horrible right now (IMHO) that it will not make it much
worse. But, yes, I will of course change the name. I just want to finish
this text_poke() patch-set and W+X mappings keep popping up.

Thanks (as usual) for your help.
Edgecombe, Rick P
2018-12-05 01:09:18 UTC
Post by Nadav Amit
Post by Andy Lutomirski
Post by Nadav Amit
Post by Andy Lutomirski
Post by Nadav Amit
On Nov 27, 2018, at 4:07 PM, Rick Edgecombe <
Since vfree will lazily flush the TLB, but not lazily free the
underlying pages,
it often leaves stale TLB entries to freed pages that could get re-
used. This is
undesirable for cases where the memory being freed has special
permissions such
as executable.
So I am trying to finish my patch-set for preventing transient W+X mappings
from taking space, by handling kprobes & ftrace that I missed (thanks
again for
pointing it out).
But all of the sudden, I don’t understand why we have the problem that
this
(your) patch-set deals with at all. We already change the mappings to make
the memory writable before freeing the memory, so why can’t we make it
non-executable at the same time? Actually, why do we make the module
memory,
including its data executable before freeing it???
All the code you're looking at is IMO a very awkward and possibly
incorrect of doing what's actually necessary: putting the direct map
the way it wants to be.
Can't we shove this entirely mess into vunmap? Have a flag (as part
of vmalloc like in Rick's patch or as a flag passed to a vfree variant
directly) that makes the vunmap code that frees the underlying pages
also reset their permissions?
Right now, we muck with set_memory_rw() and set_memory_nx(), which
both have very awkward (and inconsistent with each other!) semantics
when called on vmalloc memory. And they have their own flushes, which
is inefficient. Maybe the right solution is for vunmap to remove the
vmap area PTEs, call into a function like set_memory_rw() that resets
the direct maps to their default permissions *without* flushing, and
then to do a single flush for everything. Or, even better, to cause
the change_page_attr code to do the flush and also to flush the vmap
area all at once so that very small free operations can flush single
pages instead of flushing globally.
Thanks for the explanation. I read it just after I realized that indeed the
whole purpose of this code is to get cpa_process_alias()
update the corresponding direct mapping.
This thing (pageattr.c) indeed seems over-engineered and very unintuitive.
Right now I have a list of patch-sets that I owe, so I don’t have the time
to deal with it.
But, I still think that disable_ro_nx() should not call set_memory_x().
IIUC, this breaks W+X of the direct-mapping which correspond with the module
memory. Does it ever stop being W+X?? I’ll have another look.
Dunno. I did once chase down a bug where some memory got freed while
it was still read-only, and the results were hilarious and hard to
debug, since the explosion happened long after the buggy code
finished.
This piece of code causes me pain and misery.
So, it turns out that the direct map is *not* changed if you just change
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
How many levels of abstraction are broken in the way? What would happen
if somebody tries to change the NX-bit and some other bit in the PTE?
Luckily, I don’t think someone does… at least for now.
So, again, I think the change I proposed makes sense. nios2 does not have
set_memory_x() and it will not be affected.
Hold on... so on architectures that don't have set_memory_* but do have something
like NX, won't the stale executable TLB entries continue to point to re-used
pages, so it doesn't fix the problem this patch is trying to address generally? I
see at least a couple archs use vmalloc and have an exec bit, but don't define
set_memory_*.
Nadav Amit
2018-12-05 01:45:04 UTC
Post by Edgecombe, Rick P
Post by Nadav Amit
Post by Andy Lutomirski
Post by Nadav Amit
Post by Andy Lutomirski
Post by Nadav Amit
On Nov 27, 2018, at 4:07 PM, Rick Edgecombe <
Since vfree will lazily flush the TLB, but not lazily free the
underlying pages,
it often leaves stale TLB entries to freed pages that could get re-
used. This is
undesirable for cases where the memory being freed has special permissions such
as executable.
So I am trying to finish my patch-set for preventing transient W+X mappings
from taking space, by handling kprobes & ftrace that I missed (thanks again for
pointing it out).
But all of the sudden, I don’t understand why we have the problem that this
(your) patch-set deals with at all. We already change the mappings to make
the memory writable before freeing the memory, so why can’t we make it
non-executable at the same time? Actually, why do we make the module memory,
including its data executable before freeing it???
All the code you're looking at is IMO a very awkward and possibly
incorrect of doing what's actually necessary: putting the direct map
the way it wants to be.
Can't we shove this entirely mess into vunmap? Have a flag (as part
of vmalloc like in Rick's patch or as a flag passed to a vfree variant
directly) that makes the vunmap code that frees the underlying pages
also reset their permissions?
Right now, we muck with set_memory_rw() and set_memory_nx(), which
both have very awkward (and inconsistent with each other!) semantics
when called on vmalloc memory. And they have their own flushes, which
is inefficient. Maybe the right solution is for vunmap to remove the
vmap area PTEs, call into a function like set_memory_rw() that resets
the direct maps to their default permissions *without* flushing, and
then to do a single flush for everything. Or, even better, to cause
the change_page_attr code to do the flush and also to flush the vmap
area all at once so that very small free operations can flush single
pages instead of flushing globally.
Thanks for the explanation. I read it just after I realized that indeed the
whole purpose of this code is to get cpa_process_alias()
update the corresponding direct mapping.
This thing (pageattr.c) indeed seems over-engineered and very unintuitive.
Right now I have a list of patch-sets that I owe, so I don’t have the time
to deal with it.
But, I still think that disable_ro_nx() should not call set_memory_x().
IIUC, this breaks W+X of the direct-mapping which correspond with the module
memory. Does it ever stop being W+X?? I’ll have another look.
Dunno. I did once chase down a bug where some memory got freed while
it was still read-only, and the results were hilarious and hard to
debug, since the explosion happened long after the buggy code
finished.
This piece of code causes me pain and misery.
So, it turns out that the direct map is *not* changed if you just change
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
How many levels of abstraction are broken in the way? What would happen
if somebody tries to change the NX-bit and some other bit in the PTE?
Luckily, I don’t think someone does… at least for now.
So, again, I think the change I proposed makes sense. nios2 does not have
set_memory_x() and it will not be affected.
Hold on...so on architectures that don't have set_memory_ but do have something
like NX, wont the executable stale TLB continue to live to re-used pages, and so
it doesn't fix the problem this patch is trying to address generally? I see at
least a couple archs use vmalloc and have an exec bit, but don't define
set_memory_*.
Again, this is not a replacement for your patch (the one in this thread).
And if you follow Andy’s suggestion, the patch I propose will not be needed.
However, in the meantime I see no reason to mark data as executable, even
for a brief period of time.
Nadav Amit
2018-11-28 01:06:05 UTC
Sometimes when memory is freed via the module subsystem, an executable
permissioned TLB entry can remain to a freed page. If the page is re-used to
back an address that will receive data from userspace, it can result in user
data being mapped as executable in the kernel. The root of this behavior is
vfree lazily flushing the TLB, but not lazily freeing the underlying pages.
There are sort of three categories of this which show up across modules, bpf,
1. When executable memory is touched and then immediately freed
This shows up in a couple error conditions in the module loader and BPF JIT
compiler.
Interesting!

Note that this may cause conflict with "x86: avoid W^X being broken during
modules loading”, which I recently submitted.
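The ordering difference the cover letter describes (lazy vfree vs. the proposed unmap, flush, then free) can be sketched as a toy model. This is illustrative userspace C, not kernel code; the `toy_*` names and the `toy_page` struct are made up for the sketch:

```c
#include <assert.h>
#include <stdbool.h>

/* A "page" here just records whether a stale executable TLB entry may
 * still reference it at the moment it is handed back to the allocator. */
struct toy_page {
	bool mapped;
	bool tlb_may_hold_entry;
	bool freed;
};

/* Lazy path: unmap and free the pages now, flush the TLB some time later. */
static void toy_vfree_lazy(struct toy_page *p)
{
	p->mapped = false;
	p->freed = true;		/* page is reusable... */
	p->tlb_may_hold_entry = true;	/* ...while the flush is deferred */
}

/* VM_IMMEDIATE_UNMAP path: unmap, flush, then free. */
static void toy_vfree_immediate(struct toy_page *p)
{
	p->mapped = false;
	p->tlb_may_hold_entry = false;	/* flush before the pages go back */
	p->freed = true;
}
```

In the lazy path a freed page can be re-used while an executable TLB entry still points at it; the immediate path closes that window by ordering the flush before the free.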
Nadav Amit
2018-11-28 01:21:08 UTC
Post by Nadav Amit
Sometimes when memory is freed via the module subsystem, an executable
permissioned TLB entry can remain to a freed page. If the page is re-used to
back an address that will receive data from userspace, it can result in user
data being mapped as executable in the kernel. The root of this behavior is
vfree lazily flushing the TLB, but not lazily freeing the underlying pages.
There are sort of three categories of this which show up across modules, bpf,
1. When executable memory is touched and then immediately freed
This shows up in a couple error conditions in the module loader and BPF JIT
compiler.
Interesting!
Note that this may cause conflict with "x86: avoid W^X being broken during
modules loading”, which I recently submitted.
I actually have not looked at the vmalloc() code much recently, but it
seems… strange:

void vm_unmap_aliases(void)
{

...
mutex_lock(&vmap_purge_lock);
purge_fragmented_blocks_allcpus();
if (!__purge_vmap_area_lazy(start, end) && flush)
flush_tlb_kernel_range(start, end);
mutex_unlock(&vmap_purge_lock);
}

Since __purge_vmap_area_lazy() releases the memory, it seems there is a time
window between the release of the region and the TLB flush, in which the
area can be allocated for another purpose. This can result in a
(theoretical) correctness issue. No?
Will Deacon
2018-11-28 09:57:35 UTC
Post by Nadav Amit
Post by Nadav Amit
Sometimes when memory is freed via the module subsystem, an executable
permissioned TLB entry can remain to a freed page. If the page is re-used to
back an address that will receive data from userspace, it can result in user
data being mapped as executable in the kernel. The root of this behavior is
vfree lazily flushing the TLB, but not lazily freeing the underlying pages.
There are sort of three categories of this which show up across modules, bpf,
1. When executable memory is touched and then immediately freed
This shows up in a couple error conditions in the module loader and BPF JIT
compiler.
Interesting!
Note that this may cause conflict with "x86: avoid W^X being broken during
modules loading”, which I recently submitted.
I actually have not looked on the vmalloc() code too much recent, but it
void vm_unmap_aliases(void)
{
...
mutex_lock(&vmap_purge_lock);
purge_fragmented_blocks_allcpus();
if (!__purge_vmap_area_lazy(start, end) && flush)
flush_tlb_kernel_range(start, end);
mutex_unlock(&vmap_purge_lock);
}
Since __purge_vmap_area_lazy() releases the memory, it seems there is a time
window between the release of the region and the TLB flush, in which the
area can be allocated for another purpose. This can result in a
(theoretical) correctness issue. No?
If __purge_vmap_area_lazy() returns false, then it hasn't freed the memory,
so we only invalidate the TLB if 'flush' is true in that case. If
__purge_vmap_area_lazy() returns true instead, then it takes care of the TLB
invalidation before the freeing.

Will
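The contract Will describes can be restated as a toy model (illustrative userspace C, not kernel code; the `toy_*` names and `outcome` struct are made up): either the purge does the flush itself before freeing, or it frees nothing and leaves the flush to the caller, so there is never a freed-but-unflushed window.

```c
#include <assert.h>
#include <stdbool.h>

struct outcome {
	bool freed;
	bool flushed;
};

/* Toy model of the contract: __purge_vmap_area_lazy() either does
 * nothing and returns false, or it flushes the TLB *before* freeing
 * the lazy areas and returns true.  The caller-side flush in
 * vm_unmap_aliases() only covers the "purged nothing, but a flush was
 * requested" case. */
static struct outcome toy_vm_unmap_aliases(bool have_lazy_areas, bool flush)
{
	struct outcome o = { false, false };
	bool purged = have_lazy_areas;	/* stands in for the return value */

	if (purged) {
		o.flushed = true;	/* flush happens inside the purge... */
		o.freed = true;		/* ...before the areas are freed */
	} else if (flush) {
		o.flushed = true;	/* nothing freed; flush on request */
	}
	return o;
}
```

In every case where pages are freed, a flush has already happened.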
Nadav Amit
2018-11-28 18:29:52 UTC
Post by Will Deacon
Post by Nadav Amit
Post by Nadav Amit
Sometimes when memory is freed via the module subsystem, an executable
permissioned TLB entry can remain to a freed page. If the page is re-used to
back an address that will receive data from userspace, it can result in user
data being mapped as executable in the kernel. The root of this behavior is
vfree lazily flushing the TLB, but not lazily freeing the underlying pages.
There are sort of three categories of this which show up across modules, bpf,
1. When executable memory is touched and then immediately freed
This shows up in a couple error conditions in the module loader and BPF JIT
compiler.
Interesting!
Note that this may cause conflict with "x86: avoid W^X being broken during
modules loading”, which I recently submitted.
I actually have not looked on the vmalloc() code too much recent, but it
void vm_unmap_aliases(void)
{
...
mutex_lock(&vmap_purge_lock);
purge_fragmented_blocks_allcpus();
if (!__purge_vmap_area_lazy(start, end) && flush)
flush_tlb_kernel_range(start, end);
mutex_unlock(&vmap_purge_lock);
}
Since __purge_vmap_area_lazy() releases the memory, it seems there is a time
window between the release of the region and the TLB flush, in which the
area can be allocated for another purpose. This can result in a
(theoretical) correctness issue. No?
If __purge_vmap_area_lazy() returns false, then it hasn't freed the memory,
so we only invalidate the TLB if 'flush' is true in that case. If
__purge_vmap_area_lazy() returns true instead, then it takes care of the TLB
invalidation before the freeing.
Right. Sorry for my misunderstanding.

Thanks,
Nadav
Andrew Morton
2018-11-28 23:11:45 UTC
Post by Rick Edgecombe
Change the module allocations to flush before freeing the pages.
...
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -87,8 +87,8 @@ void *module_alloc(unsigned long size)
p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
MODULES_END, GFP_KERNEL,
- PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
+ PAGE_KERNEL_EXEC, VM_IMMEDIATE_UNMAP,
+ NUMA_NO_NODE, __builtin_return_address(0));
if (p && (kasan_module_alloc(p, size) < 0)) {
vfree(p);
return NULL;
Should any other architectures do this?
Edgecombe, Rick P
2018-11-29 00:02:15 UTC
Post by Andrew Morton
Post by Rick Edgecombe
Change the module allocations to flush before freeing the pages.
...
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -87,8 +87,8 @@ void *module_alloc(unsigned long size)
p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
MODULES_END, GFP_KERNEL,
- PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
+ PAGE_KERNEL_EXEC, VM_IMMEDIATE_UNMAP,
+ NUMA_NO_NODE, __builtin_return_address(0));
if (p && (kasan_module_alloc(p, size) < 0)) {
vfree(p);
return NULL;
Should any other architectures do this?
I would think everything that has something like an NX bit and doesn't use the
default module_alloc implementation.

I could add the flag for every arch that defines PAGE_KERNEL_EXEC, but I don't
have a good way to test on all of those architectures.

Thanks,

Rick
Andy Lutomirski
2018-11-29 01:40:53 UTC
Post by Rick Edgecombe
Change the module allocations to flush before freeing the pages.
---
arch/x86/kernel/module.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index b052e883dd8c..1694daf256b3 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -87,8 +87,8 @@ void *module_alloc(unsigned long size)
p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
MODULES_END, GFP_KERNEL,
- PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
+ PAGE_KERNEL_EXEC, VM_IMMEDIATE_UNMAP,
+ NUMA_NO_NODE, __builtin_return_address(0));
Hmm. How awful is the resulting performance for heavy eBPF users? I’m
wondering if the JIT will need some kind of cache to reuse
allocations.
Post by Rick Edgecombe
if (p && (kasan_module_alloc(p, size) < 0)) {
vfree(p);
return NULL;
--
2.17.1
Edgecombe, Rick P
2018-11-29 06:14:20 UTC
Post by Andy Lutomirski
On Nov 27, 2018, at 4:07 PM, Rick Edgecombe <
Change the module allocations to flush before freeing the pages.
---
arch/x86/kernel/module.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index b052e883dd8c..1694daf256b3 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -87,8 +87,8 @@ void *module_alloc(unsigned long size)
p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
MODULES_END, GFP_KERNEL,
- PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
+ PAGE_KERNEL_EXEC, VM_IMMEDIATE_UNMAP,
+ NUMA_NO_NODE, __builtin_return_address(0));
Hmm. How awful is the resulting performance for heavy eBPF
users? I’m
wondering if the JIT will need some kind of cache to reuse
allocations.
I think it should have no effect for x86 at least. On allocation there
is only the setting of the flag. For freeing there is of course a new
TLB flush, but it happens in a way that should remove one elsewhere for
BPF.

On x86 today there are actually already 3 flushes for the operation
around a module_alloc JIT free. What's happening is there are two
allocations that are RO: the JIT and some data. When freeing, first the
JIT is set RW, then vfreed. So this causes 1 TLB flush from the
set_memory_rw, and there is now a lazy vmap area from the vfree. When
the data allocation is set to RW, vm_unmap_aliases() is called in
pageattr.c:change_page_attr_set_clr, so it will cause a flush from
clearing the lazy area, then there is the third flush as part of the
permissions change like usual.

Since now the JIT vfree will call vm_unmap_aliases(), it should not
trigger a TLB flush in the second permission change, so remain at 3.
Post by Andy Lutomirski
if (p && (kasan_module_alloc(p, size) < 0)) {
vfree(p);
return NULL;
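Rick's flush accounting above can be written down as a back-of-the-envelope model. This is not kernel code, just a counting sketch of the sequence he describes (JIT buffer plus a second RO data allocation being freed on x86), with the function names made up:

```c
#include <assert.h>

/* Flush accounting before the patch: the JIT vfree is deferred, so the
 * later permission change on the data allocation ends up purging the
 * JIT's lazy vmap area and flushing for it. */
static int flushes_before_patch(void)
{
	int flushes = 0;

	flushes++;	/* set_memory_rw() on the JIT buffer */
	/* vfree(jit): deferred, leaves a lazy vmap area, no flush yet */
	flushes++;	/* set_memory_rw() on the data: vm_unmap_aliases()
			 * purges the JIT's lazy area and flushes */
	flushes++;	/* the data permission change flushes as usual */
	return flushes;
}

/* With VM_IMMEDIATE_UNMAP the flush moves into the JIT vfree itself,
 * so the later vm_unmap_aliases() finds nothing left to flush. */
static int flushes_after_patch(void)
{
	int flushes = 0;

	flushes++;	/* set_memory_rw() on the JIT buffer */
	flushes++;	/* vfree(jit) flushes before freeing the pages */
	flushes++;	/* the data permission change flushes as usual */
	return flushes;
}
```

The flush is moved earlier rather than added, which is why the total stays at three either way.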
Masami Hiramatsu
2018-11-29 14:06:16 UTC
On Tue, 27 Nov 2018 16:07:52 -0800
Sometimes when memory is freed via the module subsystem, an executable
permissioned TLB entry can remain to a freed page. If the page is re-used to
back an address that will receive data from userspace, it can result in user
data being mapped as executable in the kernel. The root of this behavior is
vfree lazily flushing the TLB, but not lazily freeing the underlying pages.
Good catch!
There are sort of three categories of this which show up across modules, bpf,
For x86-64 kprobes, it sets the page NX and after that RW, and then releases it
via module_memfree(). So I'm not sure it really happens on kprobes. (Of course
the default memory allocator is simpler so it may happen on other archs) But
interesting fixes.

Thank you,
1. When executable memory is touched and then immediately freed
This shows up in a couple error conditions in the module loader and BPF JIT
compiler.
2. When executable memory is set to RW right before being freed
In this case (on x86 and probably others) there will be a TLB flush when it's
set to RW, and since the pages are not touched between the flush and the
free, the entry should not be in the TLB in most cases. So this
category is not as big of a concern. However, technically there is still a
race where an attacker could try to keep it alive for a short window with a
well-timed out-of-bounds read or speculative read, so ideally this could be
blocked as well.
3. When executable memory is freed in an interrupt
At least one example of this is the freeing of init sections in the module
loader. Since vmalloc reuses the allocation for the work queue linked list
node for the deferred frees, the memory actually gets touched as part of the
vfree operation and so returns to the TLB even after the flush from resetting
the permissions.
I have only actually tested category 1, and identified 2 and 3 just from reading
the code.
To catch all of these, module_alloc for x86 is changed to use a new flag that
instructs the unmap operation to flush the TLB before freeing the pages.
If this solution seems good I can plug the flag in for other architectures that
define PAGE_KERNEL_EXEC.
vmalloc: New flag for flush before releasing pages
x86/modules: Make x86 allocs to flush when free
arch/x86/kernel/module.c | 4 ++--
include/linux/vmalloc.h | 1 +
mm/vmalloc.c | 13 +++++++++++--
3 files changed, 14 insertions(+), 4 deletions(-)
--
2.17.1
--
Masami Hiramatsu <***@kernel.org>
Edgecombe, Rick P
2018-11-29 18:49:26 UTC
Post by Masami Hiramatsu
On Tue, 27 Nov 2018 16:07:52 -0800
Sometimes when memory is freed via the module subsystem, an executable
permissioned TLB entry can remain to a freed page. If the page is re-used to
back an address that will receive data from userspace, it can result in user
data being mapped as executable in the kernel. The root of this behavior is
vfree lazily flushing the TLB, but not lazily freeing the underlying pages.
Good catch!
There are sort of three categories of this which show up across modules, bpf,
For x86-64 kprobe, it sets the page NX and after that RW, and then release
via module_memfree. So I'm not sure it really happens on kprobes. (Of course
the default memory allocator is simpler so it may happen on other archs) But
interesting fixes.
Yes, I think you are right, it should not leave an executable TLB entry in this
case. Ftrace actually does this on x86 as well.

Is there some other reason for calling set_memory_nx that should apply elsewhere
for module users? Or could it be removed in the case of this patch to centralize
the behavior?

Thanks,

Rick
Post by Masami Hiramatsu
Thank you,
1. When executable memory is touched and then immediately freed
This shows up in a couple error conditions in the module loader and BPF JIT
compiler.
2. When executable memory is set to RW right before being freed
In this case (on x86 and probably others) there will be a TLB flush when it's
set to RW, and since the pages are not touched between the flush and the
free, the entry should not be in the TLB in most cases. So this
category is not as big of a concern. However, technically there is still a
race where an attacker could try to keep it alive for a short window with a
well-timed out-of-bounds read or speculative read, so ideally this could be
blocked as well.
3. When executable memory is freed in an interrupt
At least one example of this is the freeing of init sections in the module
loader. Since vmalloc reuses the allocation for the work queue linked list node
for the deferred frees, the memory actually gets touched as part of the vfree
operation, and so it re-enters the TLB even after the flush from resetting the
permissions.
I have only actually tested category 1, and identified 2 and 3 just from reading
the code.
To catch all of these, module_alloc for x86 is changed to use a new flag that
instructs the unmap operation to flush the TLB before freeing the pages.
If this solution seems good I can plug the flag in for other architectures that
define PAGE_KERNEL_EXEC.
vmalloc: New flag for flush before releasing pages
x86/modules: Make x86 allocs to flush when free
arch/x86/kernel/module.c | 4 ++--
include/linux/vmalloc.h | 1 +
mm/vmalloc.c | 13 +++++++++++--
3 files changed, 14 insertions(+), 4 deletions(-)
--
2.17.1
Masami Hiramatsu
2018-11-29 23:19:13 UTC
Permalink
On Thu, 29 Nov 2018 18:49:26 +0000
Post by Edgecombe, Rick P
Post by Masami Hiramatsu
On Tue, 27 Nov 2018 16:07:52 -0800
Sometimes when memory is freed via the module subsystem, a TLB entry with
executable permissions can remain pointing to a freed page. If the page is
re-used to back an address that will receive data from userspace, it can result
in user data being mapped as executable in the kernel. The root of this
behavior is vfree lazily flushing the TLB, but not lazily freeing the
underlying pages.
Good catch!
There are sort of three categories of this which show up across modules, bpf,
For x86-64 kprobes, the page is set NX and after that RW, and then released
via module_memfree. So I'm not sure it really happens on kprobes. (Of course
the default memory allocator is simpler, so it may happen on other archs.) But
these are interesting fixes.
Yes, I think you are right, it should not leave an executable TLB entry in this
case. Ftrace actually does this on x86 as well.
Is there some other reason for calling set_memory_nx that should apply elsewhere
for module users? Or could it be removed in the case of this patch to centralize
the behavior?
According to commit c93f5cf571e7 ("kprobes/x86: Fix to set RWX bits correctly
before releasing trampoline"), if we release a read-only page via
module_memfree(), it causes a kernel crash. And at the moment, x86-64 sets the
trampoline page read-only because it is an executable page. Setting the NX bit
is a security measure, and it should be done before making the page writable.
So I think if you centralize setting the NX bit, it should be done before
setting the writable bit.

Thank you,
--
Masami Hiramatsu <***@kernel.org>