内核版本:5.15.0-205.149.5.1.el8uek.aarch64(5.15系列全部) 操作系统:Oracle 8 ARM64
我正在尝试连接到 sys_call_table,同样我正在修改 sys_call_table 的读写权限。此代码失败(返回 -22):
// SPDX-License-Identifier: GPL-3.0
#include <linux/init.h> // module_{init,exit}()
#include <linux/module.h> // THIS_MODULE, MODULE_VERSION, ...
#include <linux/kernel.h> // printk(), pr_*()
#include <linux/kallsyms.h> // kallsyms_lookup_name()
#include <asm/syscall.h> // syscall_fn_t, __NR_*
#include <asm/ptrace.h> // struct pt_regs
#include <asm/tlbflush.h> // flush_tlb_kernel_range()
#include <asm/pgtable.h> // {clear,set}_pte_bit(), set_pte()
#include <linux/vmalloc.h> // vm_unmap_aliases()
#include <linux/mm.h> // struct mm_struct, apply_to_page_range()
#include <linux/kconfig.h> // IS_ENABLED()
#ifdef pr_fmt
#undef pr_fmt
#endif
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
/* Pointer to the kernel's init_mm, resolved at runtime via kallsyms. */
static struct mm_struct *init_mm_ptr;
/* Pointer to the arm64 sys_call_table, resolved at runtime via kallsyms. */
static syscall_fn_t *syscall_table;
/* Saved original read(2) handler, restored on module exit. */
static syscall_fn_t original_read;
/********** HELPERS **********/
// From arch/arm64/mm/pageattr.c.
/* Protection bits to set and to clear on each PTE visited by
 * change_page_range(). */
struct page_change_data {
	pgprot_t set_mask;
	pgprot_t clear_mask;
};
// Adapted from arch/arm64/mm/pageattr.c.
/*
 * Adjust the protection bits of a single kernel PTE. Invoked once per
 * present PTE by apply_to_page_range(); *data is a struct page_change_data
 * carrying the bits to clear and set (clear is applied first).
 */
static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
{
	struct page_change_data *masks = data;
	pte_t entry;

	entry = READ_ONCE(*ptep);
	entry = set_pte_bit(clear_pte_bit(entry, masks->clear_mask),
			    masks->set_mask);
	set_pte(ptep, entry);

	return 0;
}
// Adapted from arch/arm64/mm/pageattr.c.
/*
 * Apply set_mask/clear_mask to every kernel PTE in [start, start + size),
 * then flush the TLB for that range. The flush happens even on failure,
 * since some entries may already have been modified.
 */
static int __change_memory_common(unsigned long start, unsigned long size,
				  pgprot_t set_mask, pgprot_t clear_mask)
{
	struct page_change_data data = {
		.set_mask = set_mask,
		.clear_mask = clear_mask,
	};
	int ret;

	ret = apply_to_page_range(init_mm_ptr, start, size, change_page_range,
				  &data);
	flush_tlb_kernel_range(start, start + size);

	return ret;
}
// Simplified set_memory_rw() from arch/arm64/mm/pageattr.c.
/* Make one kernel page writable: set PTE_WRITE and clear PTE_RDONLY. */
static int set_page_rw(unsigned long addr)
{
	/* Flush lazily-unmapped vmalloc aliases before changing protections. */
	vm_unmap_aliases();

	return __change_memory_common(addr, PAGE_SIZE,
				      __pgprot(PTE_WRITE),
				      __pgprot(PTE_RDONLY));
}
// Simplified set_memory_ro() from arch/arm64/mm/pageattr.c.
/* Make one kernel page read-only: set PTE_RDONLY and clear PTE_WRITE. */
static int set_page_ro(unsigned long addr)
{
	/* Flush lazily-unmapped vmalloc aliases before changing protections. */
	vm_unmap_aliases();

	return __change_memory_common(addr, PAGE_SIZE,
				      __pgprot(PTE_RDONLY),
				      __pgprot(PTE_WRITE));
}
/********** ACTUAL MODULE **********/
/* Replacement read(2) handler: log the call, then delegate to the saved
 * original handler. */
static long myread(const struct pt_regs *regs)
{
	long ret;

	pr_info("read() called\n");
	ret = original_read(regs);

	return ret;
}
/*
 * Module init: locate init_mm and sys_call_table, temporarily make the page
 * holding the read() slot writable, install the hook, then restore it RO.
 *
 * Fix over the original: the results of kallsyms_lookup_name() are checked
 * before use. The lookups can legitimately return 0 (symbol missing or
 * lookup unavailable), and the original code would then dereference a
 * NULL pointer. This also matches the error handling used elsewhere in
 * this article's updated listing.
 *
 * NOTE(review): kallsyms_lookup_name() is not exported on kernels >= 5.7,
 * so this translation unit may not even link there — confirm against the
 * target kernel (the updated listing resolves it via kprobes instead).
 */
static int __init modinit(void)
{
	int res;

	pr_info("init\n");

	init_mm_ptr = (struct mm_struct *)kallsyms_lookup_name("init_mm");
	if (!init_mm_ptr) {
		pr_err("init_mm not found\n");
		return -ENOSYS;
	}

	syscall_table = (syscall_fn_t *)kallsyms_lookup_name("sys_call_table");
	if (!syscall_table) {
		pr_err("sys_call_table not found\n");
		return -ENOSYS;
	}

	/* Remember the real handler so it can be restored on exit. */
	original_read = syscall_table[__NR_read];

	res = set_page_rw((unsigned long)(syscall_table + __NR_read) & PAGE_MASK);
	if (res != 0) {
		pr_err("set_page_rw() failed: %d\n", res);
		return res;
	}

	syscall_table[__NR_read] = myread;

	res = set_page_ro((unsigned long)(syscall_table + __NR_read) & PAGE_MASK);
	if (res != 0) {
		pr_err("set_page_ro() failed: %d\n", res);
		return res;
	}

	pr_info("init done\n");
	return 0;
}
/*
 * Module exit: make the syscall-table page writable, restore the original
 * read() handler, and set the page back to read-only.
 */
static void __exit modexit(void)
{
	unsigned long page_addr =
		(unsigned long)(syscall_table + __NR_read) & PAGE_MASK;
	int res;

	pr_info("exit\n");

	res = set_page_rw(page_addr);
	if (res != 0) {
		pr_err("set_page_rw() failed: %d\n", res);
		return;
	}

	syscall_table[__NR_read] = original_read;

	res = set_page_ro(page_addr);
	if (res != 0)
		pr_err("set_page_ro() failed: %d\n", res);

	pr_info("goodbye\n");
}
module_init(modinit); /* entry point run at insmod */
module_exit(modexit); /* exit point run at rmmod */
MODULE_VERSION("0.1");
MODULE_LICENSE("GPL"); /* required: the module uses GPL-only symbols */
(参考自https://stackoverflow.com/a/61465861/17435873)
但是相同的代码对于内核版本 5.4.17-2136.331.7.el8uek.aarch64 可以正常工作(所有 5.4 系列及以下版本)
我用 vmalloc 分配了一个内核内存页,并且可以用同样的函数把它在读写和只读之间来回切换,说明这些辅助函数本身是能正常工作的。
代码:
/* Scratch page used to sanity-check set_page_ro()/set_page_rw(). */
char *foo;
/*
 * Test init: allocate one vmalloc page and verify the permission helpers
 * work on it (a write faults after set_page_ro() and succeeds again after
 * set_page_rw()), isolating the failure to the sys_call_table page.
 */
static int __init modinit(void)
{
	int res;

	pr_info("init\n");

	// Shouldn't fail.
	init_mm_ptr = (struct mm_struct *)kallsyms_lookup_name("init_mm");
	syscall_table = (syscall_fn_t *)kallsyms_lookup_name("sys_call_table");

	/* NOTE(review): vmalloc() result is not NULL-checked — acceptable
	 * for a throwaway experiment, not for production code. */
	foo = vmalloc(PAGE_SIZE);
	res = set_page_ro((unsigned long)foo); // Returns 0 (success)
	foo[420] = '1'; // Crashed here
	res = set_page_rw((unsigned long)foo);
	foo[420] = '1'; // No crashes, set_page_rw returns 0 (success)

	pr_info("init done\n");
	return 0;
}
更新:
所以我深入研究了这些函数:apply_to_page_range、apply_to_p4d_range、apply_to_pud_range、apply_to_pmd_range、apply_to_pte_range。 (全部复制源码制作自定义功能)。
调用链为:apply_to_page_range → apply_to_p4d_range → apply_to_pud_range → apply_to_pmd_range → apply_to_pte_range。
在这条路径上,!pmd_table 对于 sys_call_table 返回 1,对于 vmalloc 页返回 0(这是唯一的区别)
自定义功能:
int my_pmd_leaf(pmd_t pmd){
printk("inside %s\n", __FUNCTION__);
int a1 = pmd_present(pmd), a2 = !pmd_table(pmd), a3 = a1 && a2;
printk("a1=%d, a2=%d, a3=%d\n", a1, a2, a3);
return a3;
}
输出:
vmalloc page
[71627.367740] inside my_pmd_leaf
[71627.369595] a1=1, a2=0, a3=0
sys_call_table
[71756.857568] inside my_pmd_leaf
[71756.859347] a1=1, a2=1, a3=1
我在 QEMU 下运行了完全相同的内核并测试了该模块。如果您查看内核日志,您会很容易注意到问题:
[ 31.115415] test: init
[ 31.159343] ------------[ cut here ]------------
[ 31.159451] WARNING: CPU: 0 PID: 131 at mm/memory.c:2743 apply_to_pmd_range+0xf8/0x1d0
[ 31.159654] Modules linked in: test(OE+)
[ 31.159974] CPU: 0 PID: 131 Comm: insmod Tainted: G OE 5.15.0-205.149.5.1.el8uek.aarch64 #2
[ 31.160186] Hardware name: linux,dummy-virt (DT)
[ 31.160383] pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 31.160512] pc : apply_to_pmd_range+0xf8/0x1d0
[ 31.160617] lr : apply_to_pmd_range+0x50/0x1d0
[ 31.160754] sp : ffff80000bedba00
[ 31.160867] x29: ffff80000bedba00 x28: ffff0000ffffe250 x27: ffff800009520000
[ 31.161086] x26: 0000000000000000 x25: ffff80000951ffff x24: ffff80000bedbb88
[ 31.161256] x23: ffff8000010f0000 x22: ffff80000ac244a8 x21: ffff80000bedbae4
[ 31.161369] x20: 0000000000000001 x19: ffff800009520000 x18: 0000000000000000
[ 31.161482] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
[ 31.161592] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
[ 31.161701] x11: 0000000000000000 x10: 0000000000000000 x9 : ffff80000837d010
[ 31.161829] x8 : 0000000000000000 x7 : ffff80000bedbae4 x6 : 0000000000000001
[ 31.161942] x5 : ffff80000bedbb88 x4 : ffff8000010f0000 x3 : 0c00000000000001
[ 31.162055] x2 : ffff80000951f000 x1 : 0000000000000001 x0 : 0060000041600781
[ 31.162280] Call trace:
[ 31.162468] apply_to_pmd_range+0xf8/0x1d0
[ 31.162567] apply_to_pud_range+0x9c/0x1f4
[ 31.162637] __apply_to_page_range+0xb8/0x190
[ 31.162707] apply_to_page_range+0x1c/0x44
[ 31.162772] __change_memory_common.constprop.0+0x58/0xe0 [test]
[ 31.163198] set_page_rw+0x30/0x50 [test]
[ 31.163343] modinit+0x78/0x1000 [test]
[ 31.163486] do_one_initcall+0x64/0x200
[ 31.163556] do_init_module+0x50/0x27c
[ 31.163618] load_module+0xa20/0xb70
[ 31.163678] __do_sys_init_module+0xe8/0x180
[ 31.163748] __arm64_sys_init_module+0x24/0x40
[ 31.163814] invoke_syscall+0x50/0x15c
[ 31.163871] el0_svc_common+0x48/0x144
[ 31.163925] do_el0_svc+0x30/0xe0
[ 31.163976] el0_svc+0x30/0xf0
[ 31.164029] el0t_64_sync_handler+0xc4/0x148
[ 31.164091] el0t_64_sync+0x1a4/0x1a8
[ 31.164238] ---[ end trace 202353dcbe129dc1 ]---
[ 31.164562] test: set_page_rw() failed: -22
您很可能触发了 apply_to_pmd_range() 中的
WARN_ON_ONCE(pmd_leaf(*pmd))
(参见内核源码 mm/memory.c)。这是因为内核页表把系统调用表放在了一个大页(huge page)映射之内(这是合理的,可以节省时间和内存),而 __change_memory_common()
根本不处理大页:每当遇到巨大的映射(pud_leaf()
或 pmd_leaf()
)时,它就会以 -EINVAL
退出。
我重新实现了这些函数:额外传入一个回调函数,在发现巨大 PMD 映射(
pmd_leaf()
)时直接修改叶 PMD。我没有为 PUD 实现同样的处理,因为缺少方便的辅助函数,但这在技术上也是可行的。
另外,请注意,在此内核上,
kallsyms_lookup_name()
函数未导出,因此我使用此处描述的 kprobes 技巧在运行时找到它。
这是更新后的代码,在带有内核的 QEMU 上进行了测试5.15.0-205.149.5.1.el8uek.aarch64
,它似乎工作正常。
// SPDX-License-Identifier: GPL-3.0
#include <linux/init.h> // module_{init,exit}()
#include <linux/module.h> // THIS_MODULE, MODULE_VERSION, ...
#include <linux/kernel.h> // printk(), pr_*()
#include <linux/kallsyms.h> // kallsyms_lookup_name()
#include <asm/syscall.h> // syscall_fn_t, __NR_*
#include <asm/ptrace.h> // struct pt_regs
#include <asm/tlbflush.h> // flush_tlb_kernel_range()
#include <asm/pgtable.h> // {clear,set}_pte_bit(), set_pte()
#include <linux/vmalloc.h> // vm_unmap_aliases()
#include <linux/mm.h> // struct mm_struct, apply_to_page_range()
#include <linux/kprobes.h> // register_kprobe(), unregister_kprobe()
#ifdef pr_fmt
#undef pr_fmt
#endif
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
/* Pointer to the kernel's init_mm, resolved at runtime (kprobes trick). */
static struct mm_struct *init_mm_ptr;
/* Pointer to the arm64 sys_call_table, resolved at runtime. */
static syscall_fn_t *syscall_table;
/* Saved original read(2) handler, restored on module exit. */
static syscall_fn_t original_read;
/***** HELPERS ****************************************************************/
/**
* This is an enhanced implementation of __apply_to_page_range() that is also
* capable of handling huge PMD mappings (pmd_leaf()). The original
* implementation of __apply_to_page_range() only handles last-level PTEs, and
* fails with -EINVAL for PMD mappings. This implementation takes 2 function
* pointers instead of a single one:
*
* - pte_fn_t fn_pte: function to apply changes to a leaf PTE
* - pmd_fn_t fn_pmd: function to apply changes to a leaf PMD
*/
// pte_fn_t already present in <linux/mm.h>
/* Callback invoked on a leaf (block/huge) PMD entry. */
typedef int (*pmd_fn_t)(pmd_t *pmd, unsigned long addr, void *data);
/* Callback for a leaf PUD entry — declared for symmetry, currently unused
 * (huge PUD mappings are not handled by this walker). */
typedef int (*pud_fn_t)(pud_t *pud, unsigned long addr, void *data);
// From arch/arm64/mm/hugetlbpage.c
/* True when the PMD is a valid block (huge) mapping rather than a table. */
int pmd_huge(pmd_t pmd)
{
	pmdval_t val = pmd_val(pmd);

	return val && !(val & PMD_TABLE_BIT);
}
// Adapted from arch/arm64/mm/hugetlbpage.c
/* True when the PUD is a valid block (huge) mapping rather than a table. */
int pud_huge(pud_t pud)
{
#if CONFIG_PGTABLE_LEVELS == 2
	/* With 2-level page tables there are no PUD block mappings. */
	return 0;
#else
	pudval_t val = pud_val(pud);

	return val && !(val & PUD_TABLE_BIT);
#endif
}
// From arch/arm64/mm/pageattr.c.
/* Protection bits to set and to clear on each PTE/PMD visited by the
 * change_*_range() callbacks below. */
struct page_change_data {
	pgprot_t set_mask;
	pgprot_t clear_mask;
};
// Adapted from arch/arm64/mm/pageattr.c.
/*
 * Adjust the protection bits of a single kernel PTE. *data is a
 * struct page_change_data carrying the bits to clear and set
 * (clear is applied first).
 */
static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
{
	struct page_change_data *masks = data;
	pte_t entry;

	entry = READ_ONCE(*ptep);
	entry = set_pte_bit(clear_pte_bit(entry, masks->clear_mask),
			    masks->set_mask);
	set_pte(ptep, entry);

	return 0;
}
/*
 * PMD-level counterpart of change_page_range(): adjust the protection bits
 * of a leaf (block) PMD in place. *data is a struct page_change_data;
 * the clear mask is applied before the set mask.
 */
static int change_pmd_range(pmd_t *pmdp, unsigned long addr, void *data)
{
	struct page_change_data *masks = data;
	pmd_t entry;

	entry = READ_ONCE(*pmdp);
	entry = set_pmd_bit(clear_pmd_bit(entry, masks->clear_mask),
			    masks->set_mask);
	set_pmd(pmdp, entry);

	return 0;
}
// Adapted from mm/memory.c
/*
 * Walk every PTE in [addr, end) under the given (table) PMD and invoke fn
 * on each non-none entry. For the kernel mm (init_mm_ptr) the PTE table is
 * accessed without the per-mm page-table lock, mirroring the upstream
 * helper; for any other mm the table is mapped and locked.
 *
 * Fix over the original: pte++ moved from the fn() call into the loop
 * increment. In the original, a none PTE advanced addr but NOT pte, so
 * after skipping a hole every later fn() call would operate on the wrong
 * PTE for its address. The same defect existed in upstream
 * apply_to_pte_range() and was fixed the same way.
 */
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
			      unsigned long addr, unsigned long end,
			      pte_fn_t fn, void *data,
			      pgtbl_mod_mask *mask)
{
	pte_t *pte, *mapped_pte;
	int err = 0;
	spinlock_t *ptl;

	/* Kernel page tables are not protected by the per-mm PTL. */
	mapped_pte = pte = (mm == init_mm_ptr) ?
				pte_offset_kernel(pmd, addr) :
				pte_offset_map_lock(mm, pmd, addr, &ptl);

	/* Callers must never pass a huge (leaf) PMD down here. */
	BUG_ON(pmd_huge(*pmd));

	arch_enter_lazy_mmu_mode();

	if (fn) {
		do {
			if (!pte_none(*pte)) {
				err = fn(pte, addr, data);
				if (err)
					break;
			}
		} while (pte++, addr += PAGE_SIZE, addr != end);
	}
	*mask |= PGTBL_PTE_MODIFIED;

	arch_leave_lazy_mmu_mode();

	if (mm != init_mm_ptr)
		pte_unmap_unlock(mapped_pte, ptl);
	return err;
}
// Adapted from mm/memory.c
/*
 * Walk all PMDs in [addr, end) under the given PUD. Leaf (block) PMDs are
 * handed to fn_pmd — the key difference from the upstream walker, which
 * WARNs and fails with -EINVAL on pmd_leaf(). Table PMDs recurse into
 * apply_to_pte_range() with fn_pte.
 */
static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
			      unsigned long addr, unsigned long end,
			      pte_fn_t fn_pte, pmd_fn_t fn_pmd,
			      void *data, pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int err = 0;

	/* Huge PUD mappings are not supported by this walker. */
	BUG_ON(pud_huge(*pud));

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		/* Nothing mapped here: skip (no "create" mode here). */
		if (pmd_none(*pmd))
			continue;
		if (pmd_leaf(*pmd)) {
			/* Block mapping: apply the change to the PMD itself.
			 * NOTE(review): pmd_none() was already checked above,
			 * so the re-check below is redundant but harmless. */
			if (!fn_pmd || pmd_none(*pmd))
				continue;
			err = fn_pmd(pmd, addr, data);
			if (err)
				break;
		} else {
			/* Corrupt table entry: warn once and skip it. */
			if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd)))
				continue;
			err = apply_to_pte_range(mm, pmd, addr, next, fn_pte,
						 data, mask);
			if (err)
				break;
		}
	} while (pmd++, addr = next, addr != end);

	return err;
}
// Adapted from mm/memory.c
/*
 * Walk all PUDs in [addr, end) under the given P4D and recurse into
 * apply_to_pmd_range(). Huge PUD mappings are still NOT handled: like the
 * upstream walker, this WARNs and bails out with -EINVAL on pud_leaf()
 * (the article notes PUD support was deliberately left out).
 */
static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
			      unsigned long addr, unsigned long end,
			      pte_fn_t fn_pte, pmd_fn_t fn_pmd,
			      void *data, pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int err = 0;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none(*pud))
			continue;
		/* Huge PUD: unsupported, fail loudly like upstream. */
		if (WARN_ON_ONCE(pud_leaf(*pud)))
			return -EINVAL;
		/* Corrupt table entry: warn once and skip it. */
		if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud)))
			continue;
		err = apply_to_pmd_range(mm, pud, addr, next, fn_pte,
					 fn_pmd, data, mask);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}
// Adapted from mm/memory.c
/*
 * Walk all P4Ds in [addr, end) under the given PGD and recurse into
 * apply_to_pud_range(). Leaf P4Ds are unsupported (upstream behavior).
 */
static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
			      unsigned long addr, unsigned long end,
			      pte_fn_t fn_pte, pmd_fn_t fn_pmd,
			      void *data, pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;
	int err = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none(*p4d))
			continue;
		/* Leaf P4D: unsupported, fail loudly like upstream. */
		if (WARN_ON_ONCE(p4d_leaf(*p4d)))
			return -EINVAL;
		/* Corrupt table entry: warn once and skip it. */
		if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d)))
			continue;
		err = apply_to_pud_range(mm, p4d, addr, next, fn_pte, fn_pmd,
					 data, mask);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}
// Adapted from mm/memory.c
/*
 * Top-level page-table walk over [addr, addr + size) in mm, applying fn_pte
 * to leaf PTEs and fn_pmd to leaf (block) PMDs. This is the enhanced
 * replacement for the kernel's __apply_to_page_range(), which only handles
 * last-level PTEs and fails with -EINVAL on huge mappings.
 */
static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
				 unsigned long size,
				 pte_fn_t fn_pte, pmd_fn_t fn_pmd,
				 void *data)
{
	pgd_t *pgd;
	unsigned long start = addr, next;
	unsigned long end = addr + size;
	pgtbl_mod_mask mask = 0;
	int err = 0;

	/* Reject empty or wrapped ranges. */
	if (WARN_ON(addr >= end))
		return -EINVAL;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none(*pgd))
			continue;
		/* Leaf PGD: unsupported, fail loudly like upstream. */
		if (WARN_ON_ONCE(pgd_leaf(*pgd)))
			return -EINVAL;
		/* Corrupt table entry: warn once and skip it. */
		if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd)))
			continue;
		err = apply_to_p4d_range(mm, pgd, addr, next, fn_pte, fn_pmd,
					 data, &mask);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	/* Propagate table-level changes where the architecture requires it. */
	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, start + size);

	return err;
}
// Adapted from arch/arm64/mm/pageattr.c.
/*
 * Apply set_mask/clear_mask to every mapping (PTE or block PMD) covering
 * [start, start + size) in the kernel page tables, then flush the TLB for
 * that range. The flush happens even on failure, since some entries may
 * already have been modified.
 */
static int __change_memory_common(unsigned long start, unsigned long size,
				  pgprot_t set_mask, pgprot_t clear_mask)
{
	struct page_change_data data = {
		.set_mask = set_mask,
		.clear_mask = clear_mask,
	};
	int ret;

	ret = __apply_to_page_range(init_mm_ptr, start, size,
				    &change_page_range, &change_pmd_range,
				    &data);
	if (ret)
		pr_info("__apply_to_page_range() failed: %d\n", ret);

	flush_tlb_kernel_range(start, start + size);

	return ret;
}
// Simplified version of set_memory_rw() from arch/arm64/mm/pageattr.c.
/* Make one kernel page writable: set PTE_WRITE and clear PTE_RDONLY. */
static int set_page_rw(unsigned long addr)
{
	/* Flush lazily-unmapped vmalloc aliases before changing protections. */
	vm_unmap_aliases();

	return __change_memory_common(addr, PAGE_SIZE,
				      __pgprot(PTE_WRITE),
				      __pgprot(PTE_RDONLY));
}
// Simplified version of set_memory_ro() from arch/arm64/mm/pageattr.c.
/* Make one kernel page read-only: set PTE_RDONLY and clear PTE_WRITE. */
static int set_page_ro(unsigned long addr)
{
	/* Flush lazily-unmapped vmalloc aliases before changing protections. */
	vm_unmap_aliases();

	return __change_memory_common(addr, PAGE_SIZE,
				      __pgprot(PTE_RDONLY),
				      __pgprot(PTE_WRITE));
}
/***** ACTUAL MODULE **********************************************************/
/* Replacement read(2) handler: log the call, then delegate to the saved
 * original handler. */
static long myread(const struct pt_regs *regs)
{
	long ret;

	pr_info("read() called\n");
	ret = original_read(regs);

	return ret;
}
typedef unsigned long (*kallsyms_lookup_name_t)(const char *);
/*
 * Module init: resolve kallsyms_lookup_name() via a kprobe (it is not
 * exported on kernels >= 5.7), look up init_mm and sys_call_table, make
 * the page holding the read() slot writable, install the hook, and
 * restore the page to read-only. The RW -> write -> RO ordering is
 * essential and must not be reordered.
 */
static int __init modinit(void)
{
	/* Registering a kprobe on a named symbol fills kp.addr with the
	 * symbol's address; the probe is removed immediately after. */
	struct kprobe kp = { .symbol_name = "kallsyms_lookup_name" };
	kallsyms_lookup_name_t kallsyms_lookup_name;
	int res;

	pr_info("init\n");

	// Workaround for kallsyms_lookup_name() not being exported: find it
	// using kprobes.
	res = register_kprobe(&kp);
	if (res != 0) {
		pr_err("register_kprobe() failed: %d\n", res);
		return res;
	}
	kallsyms_lookup_name = (kallsyms_lookup_name_t)kp.addr;
	unregister_kprobe(&kp);

	init_mm_ptr = (struct mm_struct *)kallsyms_lookup_name("init_mm");
	if (!init_mm_ptr) {
		pr_err("init_mm not found\n");
		return -ENOSYS;
	}

	syscall_table = (syscall_fn_t *)kallsyms_lookup_name("sys_call_table");
	if (!syscall_table) {
		pr_err("sys_call_table not found\n");
		return -ENOSYS;
	}

	/* Remember the real handler so it can be restored on exit. */
	original_read = syscall_table[__NR_read];

	res = set_page_rw((unsigned long)(syscall_table + __NR_read) & PAGE_MASK);
	if (res != 0) {
		pr_err("set_page_rw() failed: %d\n", res);
		return res;
	}

	syscall_table[__NR_read] = myread;

	res = set_page_ro((unsigned long)(syscall_table + __NR_read) & PAGE_MASK);
	if (res != 0) {
		pr_err("set_page_ro() failed: %d\n", res);
		return res;
	}

	pr_info("init done\n");
	return 0;
}
/*
 * Module exit: make the syscall-table page writable, restore the original
 * read() handler, and set the page back to read-only.
 */
static void __exit modexit(void)
{
	unsigned long page_addr =
		(unsigned long)(syscall_table + __NR_read) & PAGE_MASK;
	int res;

	pr_info("exit\n");

	res = set_page_rw(page_addr);
	if (res != 0) {
		pr_err("set_page_rw() failed: %d\n", res);
		return;
	}

	syscall_table[__NR_read] = original_read;

	res = set_page_ro(page_addr);
	if (res != 0)
		pr_err("set_page_ro() failed: %d\n", res);

	pr_info("goodbye\n");
}
}
module_init(modinit);
module_exit(modexit);
MODULE_VERSION("0.1");
MODULE_AUTHOR("Marco Bonelli");
MODULE_DESCRIPTION("Syscall hijack on arm64.");
MODULE_LICENSE("GPL");