如何将 Linux 内核缓冲区映射到用户空间？-IT科技

摘要：问题描述：假设缓冲区是使用基于页面的方案分配的。实现 mmap 的一种方法是使用 remap_pfn_range，但 LDD3 表示这不适用于常规内存。看来我们可以通过使用 SetPageReserved 将页面标记为保留来解决这个问题，这样它就会被锁定在内存中。但所有内核内存不是都已经不可交换即已经保留了吗...

问题描述：

假设缓冲区是使用基于页面的方案分配的。实现 mmap 的一种方法是使用 remap_pfn_range，但 LDD3 表示这不适用于常规内存。看来我们可以通过使用 SetPageReserved 将页面标记为保留来解决这个问题，这样它就会被锁定在内存中。但所有内核内存不是都已经不可交换即已经保留了吗？为什么需要明确设置保留位？

这与从 HIGH_MEM 分配的页面有关吗？

解决方案 1：

在 mmap 方法中从内核映射一组页面的最简单方法是使用故障处理程序来映射页面。基本上，您最终会得到类似以下内容的结果：

/*
 * mmap file operation: attach the VMA operations table to the new mapping.
 *
 * No page is mapped here; pages are provided one at a time by the .fault
 * callback in my_vm_ops. Any validation of the requested range belongs in
 * this function as well.
 *
 * Returns 0 unconditionally.
 */
static int my_mmap(struct file *filp, struct vm_area_struct *vma)
{
    vma->vm_ops = &my_vm_ops;
    return 0;
}

/*
 * File operations table for the example device.
 * .mmap points at my_mmap(), which installs my_vm_ops on the mapping;
 * the remaining entries are whatever the module otherwise needs.
 */
static const struct file_operations my_fops = {
    .owner  = THIS_MODULE,
    .open   = nonseekable_open,
    .mmap   = my_mmap,
    .llseek = no_llseek,
};

（其中其他文件操作按模块的实际需要填写）。此外，您还应在 my_mmap 中进行范围检查等，以验证 mmap 的参数。

然后vm_ops看起来像：

static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
    vmf->page = my_page_at_index(vmf->pgoff);
    get_page(vmf->page);

    return 0;
} 

/* VMA operations installed by my_mmap(); only the fault callback is needed. */
static const struct vm_operations_struct my_vm_ops = {
    .fault      = my_fault,
};

你只需要确定传递给你的故障函数的给定 vma/vmf 中哪个页面映射到用户空间。这取决于你的模块的具体工作方式。例如，如果你这样做

my_buf = vmalloc_user(MY_BUF_SIZE);

那么你使用的页面将是这样的

/* Page backing the faulting offset inside the vmalloc_user() buffer. */
vmalloc_to_page(my_buf + (vmf->pgoff << PAGE_SHIFT));

不过，您也可以很容易地创建一个数组，为每个条目单独分配一个页面，或者使用 kmalloc 等其他方式。

[刚刚注意到这个my_fault函数的名字有点搞笑]

解决方案 2：

最小可运行示例和用户空间测试

内核模块：

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h> /* min */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h> /* copy_from_user, copy_to_user */
#include <linux/slab.h>

/* Name of the entry registered with proc_create() and removed on exit. */
static const char *filename = "lkmc_mmap";

/* Number of bytes at the start of the data page used by open(), read() and write(). */
enum { BUFFER_SIZE = 4 };

/* Per-open state; open() stores it in filp->private_data. */
struct mmap_info {
    /* Kernel address of the page obtained from get_zeroed_page() in open(). */
    char *data;
};

/* After unmap: .close callback of vm_ops. It only logs that it was called. */
static void vm_close(struct vm_area_struct *vma)
{
    pr_info("vm_close\n");
}

/* First page access. */
static vm_fault_t vm_fault(struct vm_fault *vmf)
{
    struct page *page;
    struct mmap_info *info;

    pr_info(&quot;vm_fault
&quot;);
    info = (struct mmap_info *)vmf->vma->vm_private_data;
    if (info->data) {
        page = virt_to_page(info->data);
        get_page(page);
        vmf->page = page;
    }
    return 0;
}

/*
 * After mmap: .open callback of vm_ops. It only logs that it was called.
 * mmap() in this file also calls it directly.
 * TODO vs mmap, when can this happen at a different time than mmap?
 * NOTE(review): presumably the kernel calls .open when an existing VMA is
 * duplicated (e.g. on fork) rather than on the initial mmap - confirm.
 */
static void vm_open(struct vm_area_struct *vma)
{
    pr_info("vm_open\n");
}

static struct vm_operations_struct vm_ops =
{
    .close = vm_close,
    .fault = vm_fault,
    .open = vm_open,
};

/*
 * mmap file operation.
 *
 * Installs vm_ops on the new VMA, marks it VM_DONTEXPAND | VM_DONTDUMP and
 * passes the per-open mmap_info to the VMA through vm_private_data so that
 * vm_fault() can find the page. vm_open() is called by hand for this first
 * mapping.
 *
 * NOTE(review): newer kernels may require a helper instead of writing
 * vma->vm_flags directly - confirm against the target kernel version.
 *
 * Returns 0 unconditionally.
 */
static int mmap(struct file *filp, struct vm_area_struct *vma)
{
    pr_info("mmap\n");
    vma->vm_ops = &vm_ops;
    vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
    vma->vm_private_data = filp->private_data;
    vm_open(vma);
    return 0;
}

static int open(struct inode *inode, struct file *filp)
{
    struct mmap_info *info;

    pr_info(&quot;open
&quot;);
    info = kmalloc(sizeof(struct mmap_info), GFP_KERNEL);
    pr_info(&quot;virt_to_phys = 0x%llx
&quot;, (unsigned long long)virt_to_phys((void *)info));
    info->data = (char *)get_zeroed_page(GFP_KERNEL);
    memcpy(info->data, &quot;asdf&quot;, BUFFER_SIZE);
    filp->private_data = info;
    return 0;
}

/*
 * read file operation: copy bytes of the data page to userspace.
 *
 * Only the first BUFFER_SIZE bytes are readable. Reading at or past
 * BUFFER_SIZE returns 0; otherwise up to the remaining bytes are copied
 * and *off is advanced by the amount copied.
 *
 * Returns the number of bytes copied, 0 at end of buffer, or -EFAULT if
 * the copy to userspace fails.
 */
static ssize_t read(struct file *filp, char __user *buf, size_t len, loff_t *off)
{
    struct mmap_info *info = filp->private_data;
    ssize_t ret;

    pr_info("read\n");

    if ((size_t)BUFFER_SIZE <= *off)
        return 0;

    ret = min(len, (size_t)BUFFER_SIZE - (size_t)*off);
    if (copy_to_user(buf, info->data + *off, ret))
        return -EFAULT;

    *off += ret;
    return ret;
}

/*
 * write file operation: overwrite the start of the data page.
 *
 * At most BUFFER_SIZE bytes are copied, always to offset 0 of the page;
 * *off is neither read nor updated. The full len is reported as written
 * even when the input was truncated to BUFFER_SIZE.
 *
 * Returns len on success, -EFAULT if the copy from userspace fails.
 */
static ssize_t write(struct file *filp, const char __user *buf, size_t len, loff_t *off)
{
    struct mmap_info *info = filp->private_data;

    pr_info("write\n");

    if (copy_from_user(info->data, buf, min(len, (size_t)BUFFER_SIZE)))
        return -EFAULT;

    return len;
}

/*
 * release file operation: free everything open() allocated.
 *
 * Frees the data page and the mmap_info, then clears filp->private_data.
 *
 * Returns 0 unconditionally.
 */
static int release(struct inode *inode, struct file *filp)
{
    struct mmap_info *info = filp->private_data;

    pr_info("release\n");

    free_page((unsigned long)info->data);
    kfree(info);
    filp->private_data = NULL;
    return 0;
}

/*
 * File operations for the /proc entry registered in myinit().
 * NOTE(review): newer kernels may expect a different operations struct for
 * proc_create() - confirm against the target kernel version.
 */
static const struct file_operations fops = {
    .mmap = mmap,
    .open = open,
    .release = release,
    .read = read,
    .write = write,
};

static int myinit(void)
{
    proc_create(filename, 0, NULL, &amp;fops);
    return 0;
}

static void myexit(void)
{
    remove_proc_entry(filename, NULL);
}

module_init(myinit)
module_exit(myexit)
MODULE_LICENSE(&quot;GPL&quot;);

GitHub 上游。

用户空间测试：

#define _XOPEN_SOURCE 700
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h> /* uintmax_t */
#include <string.h>
#include <sys/mman.h>
#include <unistd.h> /* sysconf */

/* Decoded /proc/PID/pagemap entry.
 *
 * Format documented at:
 * https://github.com/torvalds/linux/blob/v4.9/Documentation/vm/pagemap.txt
 *
 * Per that document the page frame number occupies bits 0-54 of the raw
 * 64-bit entry, i.e. 55 bits, so the field must be 55 bits wide to hold it.
 */
typedef struct {
    uint64_t pfn : 55;           /* page frame number (raw bits 0-54) */
    unsigned int soft_dirty : 1; /* pte is soft-dirty */
    unsigned int file_page : 1;  /* page is file-page or shared-anon */
    unsigned int swapped : 1;    /* page is swapped */
    unsigned int present : 1;    /* page is present */
} PagemapEntry;

/* Parse the pagemap entry for the given virtual address.
 *
 * The entry for a page lives at byte offset (vaddr / page_size) * 8 in the
 * pagemap file. Short reads are retried until all 8 bytes have arrived;
 * each retry asks only for the bytes still missing, so the local buffer is
 * never overrun.
 *
 * Bit layout used for decoding: PFN in bits 0-54, soft-dirty in bit 55,
 * file-page/shared-anon in bit 61, swapped in bit 62, present in bit 63.
 *
 * @param[out] entry      the parsed entry
 * @param[in]  pagemap_fd file descriptor to an open /proc/pid/pagemap file
 * @param[in]  vaddr      virtual address to get entry for
 * @return                0 for success, 1 for failure
 */
int pagemap_get_entry(PagemapEntry *entry, int pagemap_fd, uintptr_t vaddr)
{
    uint64_t data;
    size_t nread = 0;
    long page_size = sysconf(_SC_PAGE_SIZE);
    off_t offset;

    if (page_size <= 0) {
        return 1;
    }
    offset = (off_t)(vaddr / (uintptr_t)page_size) * (off_t)sizeof(data);

    while (nread < sizeof(data)) {
        ssize_t ret = pread(pagemap_fd, ((uint8_t *)&data) + nread,
                            sizeof(data) - nread, offset + (off_t)nread);

        /* Check before accumulating: ret may be -1 (error) or 0 (EOF). */
        if (ret <= 0) {
            return 1;
        }
        nread += (size_t)ret;
    }

    entry->pfn = data & (((uint64_t)1 << 55) - 1);
    entry->soft_dirty = (data >> 55) & 1;
    entry->file_page = (data >> 61) & 1;
    entry->swapped = (data >> 62) & 1;
    entry->present = (data >> 63) & 1;
    return 0;
}

/* Convert the given virtual address to physical using /proc/PID/pagemap.
 *
 * Opens the pagemap file of the process, looks up the entry for vaddr and
 * combines its page frame number with the offset of vaddr inside the page.
 * The pagemap descriptor is closed on every path, including when the
 * lookup fails.
 *
 * @param[out] paddr physical address
 * @param[in]  pid   process to convert for
 * @param[in] vaddr  virtual address to get entry for
 * @return           0 for success, 1 for failure
 */
int virt_to_phys_user(uintptr_t *paddr, pid_t pid, uintptr_t vaddr)
{
    char pagemap_file[BUFSIZ];
    PagemapEntry entry;
    int pagemap_fd;
    int failed;

    snprintf(pagemap_file, sizeof(pagemap_file), "/proc/%ju/pagemap", (uintmax_t)pid);
    pagemap_fd = open(pagemap_file, O_RDONLY);
    if (pagemap_fd < 0) {
        return 1;
    }

    failed = pagemap_get_entry(&entry, pagemap_fd, vaddr);
    close(pagemap_fd);
    if (failed) {
        return 1;
    }

    *paddr = (entry.pfn * sysconf(_SC_PAGE_SIZE)) + (vaddr % sysconf(_SC_PAGE_SIZE));
    return 0;
}

/* Number of bytes main() reads back from the file descriptor and compares. */
enum { BUFFER_SIZE = 4 };

/* Userspace test for the mmap file given as argv[1].
 *
 * Maps the file twice, checks that both mappings see the same page, prints
 * the physical address behind each mapping, and checks that changes made
 * through the mapping and through read()/write() are visible to each other.
 *
 * Calls with side effects are kept outside assert() so the program still
 * does its work when built with NDEBUG; assert() is used only for the
 * content checks themselves.
 *
 * @return EXIT_SUCCESS if every step succeeded, EXIT_FAILURE otherwise
 */
int main(int argc, char **argv)
{
    int fd;
    long page_size;
    char *address1, *address2;
    char buf[BUFFER_SIZE];
    uintptr_t paddr1, paddr2;

    if (argc < 2) {
        printf("Usage: %s <mmap_file>\n", argv[0]);
        return EXIT_FAILURE;
    }
    page_size = sysconf(_SC_PAGE_SIZE);
    printf("open pathname = %s\n", argv[1]);
    fd = open(argv[1], O_RDWR | O_SYNC);
    if (fd < 0) {
        perror("open");
        return EXIT_FAILURE;
    }
    printf("fd = %d\n", fd);

    /* mmap twice for double fun. */
    puts("mmap 1");
    address1 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (address1 == MAP_FAILED) {
        perror("mmap");
        return EXIT_FAILURE;
    }
    puts("mmap 2");
    address2 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (address2 == MAP_FAILED) {
        perror("mmap");
        return EXIT_FAILURE;
    }
    assert(address1 != address2);

    /* Read and modify memory. */
    puts("access 1");
    assert(!strcmp(address1, "asdf"));
    /* vm_fault */
    puts("access 2");
    assert(!strcmp(address2, "asdf"));
    /* vm_fault */
    strcpy(address1, "qwer");
    /* Also modified. So both virtual addresses point to the same physical address. */
    assert(!strcmp(address2, "qwer"));

    /* Check that the physical addresses are the same.
     * TODO compare against the virt_to_phys value logged by the kernel module. */
    if (virt_to_phys_user(&paddr1, getpid(), (uintptr_t)address1)) {
        fputs("virt_to_phys_user failed for mapping 1\n", stderr);
        return EXIT_FAILURE;
    }
    printf("paddr1 = 0x%jx\n", (uintmax_t)paddr1);
    if (virt_to_phys_user(&paddr2, getpid(), (uintptr_t)address2)) {
        fputs("virt_to_phys_user failed for mapping 2\n", stderr);
        return EXIT_FAILURE;
    }
    printf("paddr2 = 0x%jx\n", (uintmax_t)paddr2);
    assert(paddr1 == paddr2);

    /* Check that modifications made from userland are also visible from the kernel. */
    if (read(fd, buf, BUFFER_SIZE) != BUFFER_SIZE) {
        perror("read");
        return EXIT_FAILURE;
    }
    assert(!memcmp(buf, "qwer", BUFFER_SIZE));

    /* Modify the data from the kernel, and check that the change is visible from userland. */
    if (write(fd, "zxcv", 4) != 4) {
        perror("write");
        return EXIT_FAILURE;
    }
    assert(!strcmp(address1, "zxcv"));
    assert(!strcmp(address2, "zxcv"));

    /* Cleanup. */
    puts("munmap 1");
    if (munmap(address1, page_size)) {
        perror("munmap");
        return EXIT_FAILURE;
    }
    puts("munmap 2");
    if (munmap(address2, page_size)) {
        perror("munmap");
        return EXIT_FAILURE;
    }
    puts("close");
    close(fd);
    return EXIT_SUCCESS;
}

GitHub 上游。

在内核 5.4.3 上测试。