// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
* Provide the pin memory mechanism for checkpoint and restore tasks.
*/
#ifdef CONFIG_PIN_MEMORY
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/sched/cputime.h>
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/pin_mem.h>
#include <linux/idr.h>
#include <linux/page-isolation.h>
#include <linux/sched/mm.h>
#include <linux/ctype.h>
#include <linux/highmem.h>
#include <crypto/sha2.h>
#include <linux/memblock.h>
#define MAX_PIN_PID_NUM 128
#define DEFAULT_REDIRECT_SPACE_SIZE 0x100000
static DEFINE_SPINLOCK(page_map_entry_lock);
static DEFINE_MUTEX(pin_mem_mutex);
static struct pin_mem_dump_info *pin_mem_dump_start;
static unsigned int pin_pid_num;
static unsigned int *pin_pid_num_addr;
static struct page_map_entry *__page_map_entry_start;
static unsigned long page_map_entry_end;
static struct page_map_info *user_space_reserve_start;
static struct page_map_entry *page_map_entry_start;
unsigned int max_pin_pid_num __read_mostly;
unsigned long redirect_space_size __read_mostly;
static unsigned long redirect_space_start;
static void *pin_mem_pagewalk;
static unsigned long *pagemap_buffer;
static int reserve_user_map_pages_fail;
static int __init setup_max_pin_pid_num(char *str)
{
int ret;
if (!str)
return 0;
ret = kstrtouint(str, 10, &max_pin_pid_num);
if (ret) {
pr_warn("Unable to parse max pin pid num.\n");
} else {
if (max_pin_pid_num > MAX_PIN_PID_NUM) {
max_pin_pid_num = 0;
pr_warn("Input max_pin_pid_num is too large.\n");
}
}
return ret;
}
early_param("max_pin_pid_num", setup_max_pin_pid_num);
static int __init setup_redirect_space_size(char *str)
{
if (!str)
return 0;
redirect_space_size = memparse(str, NULL);
if (!redirect_space_size) {
pr_warn("Unable to parse redirect space size, use the default value.\n");
redirect_space_size = DEFAULT_REDIRECT_SPACE_SIZE;
}
return 0;
}
early_param("redirect_space_size", setup_redirect_space_size);
static struct page_map_info *create_page_map_info(int pid)
{
struct page_map_info *new;
if (!user_space_reserve_start)
return NULL;
if (pin_pid_num >= max_pin_pid_num) {
pr_warn("Pin pid num too large than max_pin_pid_num, fail create: %d!", pid);
return NULL;
}
new = (struct page_map_info *)(user_space_reserve_start + pin_pid_num);
new->pid = pid;
new->pme = NULL;
new->entry_num = 0;
new->pid_reserved = false;
new->disable_free_page = false;
(*pin_pid_num_addr)++;
pin_pid_num++;
return new;
}
struct page_map_info *create_page_map_info_by_pid(int pid)
{
unsigned long flags;
struct page_map_info *ret;
spin_lock_irqsave(&page_map_entry_lock, flags);
ret = create_page_map_info(pid);
spin_unlock_irqrestore(&page_map_entry_lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(create_page_map_info_by_pid);
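/* Look up the page_map_info recorded for @pid; callers hold page_map_entry_lock. */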
static struct page_map_info *get_page_map_info(int pid)
{
int i;
if (!user_space_reserve_start)
return NULL;
for (i = 0; i < pin_pid_num; i++) {
if (user_space_reserve_start[i].pid == pid)
return &(user_space_reserve_start[i]);
}
return NULL;
}
struct page_map_info *get_page_map_info_by_pid(int pid)
{
unsigned long flags;
struct page_map_info *ret;
spin_lock_irqsave(&page_map_entry_lock, flags);
ret = get_page_map_info(pid);
spin_unlock_irqrestore(&page_map_entry_lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(get_page_map_info_by_pid);
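/*
* Walk backwards from @page to the head of its buddy block; return NULL
* if an LRU page is hit first (the page is not in a free buddy block).
*/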
static struct page *find_head_page(struct page *page)
{
struct page *p = page;
while (!PageBuddy(p)) {
if (PageLRU(p))
return NULL;
p--;
}
return p;
}
static void split_page_area_left(struct zone *zone, struct free_area *area, struct page *page,
unsigned long size, int order)
{
unsigned long cur_size = 1 << order;
unsigned long total_size = 0;
while (size && cur_size > size) {
cur_size >>= 1;
order--;
area--;
if (cur_size <= size) {
list_add(&page[total_size].lru, &area->free_list[MIGRATE_MOVABLE]);
atomic_set(&(page[total_size]._mapcount), PAGE_BUDDY_MAPCOUNT_VALUE);
set_page_private(&page[total_size], order);
set_pageblock_migratetype(&page[total_size], MIGRATE_MOVABLE);
area->nr_free++;
total_size += cur_size;
size -= cur_size;
}
}
}
static void split_page_area_right(struct zone *zone, struct free_area *area, struct page *page,
unsigned long size, int order)
{
unsigned long cur_size = 1 << order;
struct page *right_page, *head_page;
right_page = page + size;
while (size && cur_size > size) {
cur_size >>= 1;
order--;
area--;
if (cur_size <= size) {
head_page = right_page - cur_size;
list_add(&head_page->lru, &area->free_list[MIGRATE_MOVABLE]);
atomic_set(&(head_page->_mapcount), PAGE_BUDDY_MAPCOUNT_VALUE);
set_page_private(head_page, order);
set_pageblock_migratetype(head_page, MIGRATE_MOVABLE);
area->nr_free++;
size -= cur_size;
right_page = head_page;
}
}
}
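/*
* Take @nr_pages pages starting at @page out of the buddy allocator and
* give the unreserved leading/trailing parts of the containing buddy
* block back to the free lists.
*/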
void reserve_page_from_buddy(unsigned long nr_pages, struct page *page)
{
unsigned int current_order;
struct page *page_end;
struct free_area *area;
struct zone *zone;
struct page *head_page;
head_page = find_head_page(page);
if (!head_page) {
pr_warn("Find page head fail.");
return;
}
current_order = head_page->private;
page_end = head_page + (1 << current_order);
zone = page_zone(head_page);
area = &(zone->free_area[current_order]);
list_del(&head_page->lru);
atomic_set(&head_page->_mapcount, -1);
set_page_private(head_page, 0);
area->nr_free--;
if (head_page != page)
split_page_area_left(zone, area, head_page,
(unsigned long)(page - head_page), current_order);
page = page + nr_pages;
if (page < page_end) {
split_page_area_right(zone, area, page,
(unsigned long)(page_end - page), current_order);
} else if (page > page_end) {
pr_warn("Find page end smaller than page.");
}
}
static inline void reserve_user_normal_pages(struct page *page)
{
atomic_inc(&page->_refcount);
reserve_page_from_buddy(1, page);
}
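/*
* Hand-initialize the compound page metadata (head flag, compound_head,
* destructor, order and deferred list) for a PMD-sized huge page reserved
* directly from the buddy allocator.
*/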
static void init_huge_pmd_pages(struct page *head_page)
{
int i = 0;
struct page *page = head_page;
unsigned long compound_pad = COMPOUND_PAD_START;
__set_bit(PG_head, &page->flags);
__set_bit(PG_active, &page->flags);
atomic_set(&page->_refcount, 1);
page++;
i++;
page->compound_head = (unsigned long)head_page + 1;
page->compound_dtor = HUGETLB_PAGE_DTOR + 1;
page->compound_order = HPAGE_PMD_ORDER;
page++;
i++;
page->compound_head = (unsigned long)head_page + 1;
i++;
INIT_LIST_HEAD(&(page->deferred_list));
for (; i < HPAGE_PMD_NR; i++) {
page = head_page + i;
page->compound_head = (unsigned long)head_page + 1;
compound_pad += COMPOUND_PAD_DELTA;
}
}
static inline void reserve_user_huge_pmd_pages(struct page *page)
{
atomic_inc(&page->_refcount);
reserve_page_from_buddy((1 << HPAGE_PMD_ORDER), page);
init_huge_pmd_pages(page);
}
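/*
* Undo a partially completed reservation: free every page reserved before
* the failure point identified by (@pid_index, @entry_index, @page_index).
*/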
void free_user_map_pages(unsigned int pid_index, unsigned int entry_index, unsigned int page_index)
{
unsigned int i, j, index, order;
struct page_map_info *pmi;
struct page_map_entry *pme;
struct page *page;
unsigned long phy_addr;
for (index = 0; index < pid_index; index++) {
pmi = &(user_space_reserve_start[index]);
pme = pmi->pme;
for (i = 0; i < pmi->entry_num; i++) {
for (j = 0; j < pme->nr_pages; j++) {
order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
phy_addr = pme->phy_addr_array[j];
if (phy_addr) {
page = phys_to_page(phy_addr);
if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
__free_pages(page, order);
pme->phy_addr_array[j] = 0;
}
}
}
pme = (struct page_map_entry *)next_pme(pme);
}
}
pmi = &(user_space_reserve_start[index]);
pme = pmi->pme;
for (i = 0; i < entry_index; i++) {
for (j = 0; j < pme->nr_pages; j++) {
order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
phy_addr = pme->phy_addr_array[j];
if (phy_addr) {
page = phys_to_page(phy_addr);
if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
__free_pages(page, order);
pme->phy_addr_array[j] = 0;
}
}
}
pme = (struct page_map_entry *)next_pme(pme);
}
for (j = 0; j < page_index; j++) {
order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
phy_addr = pme->phy_addr_array[j];
if (phy_addr) {
page = phys_to_page(phy_addr);
if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
__free_pages(page, order);
pme->phy_addr_array[j] = 0;
}
}
}
}
bool check_redirect_end_valid(struct redirect_info *redirect_start,
unsigned long max_redirect_page_num)
{
unsigned long redirect_end;
redirect_end = ((unsigned long)(redirect_start + 1) +
max_redirect_page_num * sizeof(unsigned int));
if (redirect_end > redirect_space_start + redirect_space_size)
return false;
return true;
}
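/*
* Re-reserve from the buddy allocator every physical page recorded in the
* dump info. Pages found already in use are either noted in the redirect
* space, so the remap path can allocate replacements, or cause the whole
* reservation to be abandoned.
*/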
static void reserve_user_space_map_pages(void)
{
struct page_map_info *pmi;
struct page_map_entry *pme;
unsigned int i, j, index;
struct page *page;
unsigned long flags;
unsigned long phy_addr;
unsigned long redirect_pages = 0;
struct redirect_info *redirect_start = (struct redirect_info *)redirect_space_start;
if (!user_space_reserve_start || !redirect_start)
return;
spin_lock_irqsave(&page_map_entry_lock, flags);
for (index = 0; index < pin_pid_num; index++) {
pmi = &(user_space_reserve_start[index]);
pme = pmi->pme;
for (i = 0; i < pmi->entry_num; i++) {
redirect_pages = 0;
if (!check_redirect_end_valid(redirect_start, pme->nr_pages))
redirect_start = NULL;
for (j = 0; j < pme->nr_pages; j++) {
phy_addr = pme->phy_addr_array[j];
if (!phy_addr)
continue;
page = phys_to_page(phy_addr);
if (atomic_read(&page->_refcount)) {
if ((page->flags & PAGE_FLAGS_CHECK_RESERVED)
&& !pme->redirect_start)
pme->redirect_start =
(unsigned long)redirect_start;
if (redirect_start &&
(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
redirect_start->redirect_index[redirect_pages] = j;
redirect_pages++;
continue;
} else {
reserve_user_map_pages_fail = 1;
pr_warn("Page %pK refcount %d large than zero, no need reserve.\n",
page, atomic_read(&page->_refcount));
goto free_pages;
}
}
if (!pme->is_huge_page)
reserve_user_normal_pages(page);
else
reserve_user_huge_pmd_pages(page);
}
pme = (struct page_map_entry *)next_pme(pme);
if (redirect_pages && redirect_start) {
redirect_start->redirect_pages = redirect_pages;
redirect_start = (struct redirect_info *)(
(unsigned long)(redirect_start + 1) +
redirect_start->redirect_pages * sizeof(unsigned int));
}
}
}
spin_unlock_irqrestore(&page_map_entry_lock, flags);
return;
free_pages:
free_user_map_pages(index, i, j);
spin_unlock_irqrestore(&page_map_entry_lock, flags);
}
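/*
* Compute the SHA-256 digest over the dump info header and the recorded
* page_map_info array; check_sha_digest() uses it to validate the
* reserved region during init.
*/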
int calculate_pin_mem_digest(struct pin_mem_dump_info *pmdi, char *digest)
{
int i;
struct sha256_state sctx;
if (!digest)
digest = pmdi->sha_digest;
sha256_init(&sctx);
sha256_update(&sctx, (unsigned char *)(&(pmdi->magic)),
sizeof(struct pin_mem_dump_info) - SHA256_DIGEST_SIZE);
for (i = 0; i < pmdi->pin_pid_num; i++) {
sha256_update(&sctx, (unsigned char *)(&(pmdi->pmi_array[i])),
sizeof(struct page_map_info));
}
sha256_final(&sctx, digest);
return 0;
}
static int check_sha_digest(struct pin_mem_dump_info *pmdi)
{
int ret = 0;
char digest[SHA256_DIGEST_SIZE] = {0};
ret = calculate_pin_mem_digest(pmdi, digest);
if (ret) {
pr_warn("calculate pin mem digest fail:%d\n", ret);
return ret;
}
if (memcmp(pmdi->sha_digest, digest, SHA256_DIGEST_SIZE)) {
pr_warn("pin mem dump info sha256 digest match error!\n");
return -EFAULT;
}
return ret;
}
/*
* The whole page map entry collection process must be sequential.
* user_space_reserve_start points to the first page map info of the
* first dumped task, and page_map_entry_start points to the first
* page map entry of the first dumped vma.
*/
static void init_page_map_info(struct pin_mem_dump_info *pmdi, unsigned long map_len)
{
if (pin_mem_dump_start || !max_pin_pid_num) {
pr_warn("pin page map already init or max_pin_pid_num not set.\n");
return;
}
if (map_len < sizeof(struct pin_mem_dump_info) +
max_pin_pid_num * sizeof(struct page_map_info) + redirect_space_size) {
pr_warn("pin memory reserved memblock too small.\n");
return;
}
if ((pmdi->magic != PIN_MEM_DUMP_MAGIC) || (pmdi->pin_pid_num > max_pin_pid_num) ||
check_sha_digest(pmdi))
memset(pmdi, 0, sizeof(struct pin_mem_dump_info));
pin_mem_dump_start = pmdi;
pin_pid_num = pmdi->pin_pid_num;
pr_info("pin_pid_num: %d\n", pin_pid_num);
pin_pid_num_addr = &(pmdi->pin_pid_num);
user_space_reserve_start =
(struct page_map_info *)pmdi->pmi_array;
page_map_entry_start =
(struct page_map_entry *)(user_space_reserve_start + max_pin_pid_num);
__page_map_entry_start = page_map_entry_start;
page_map_entry_end = (unsigned long)pmdi + map_len - redirect_space_size;
redirect_space_start = page_map_entry_end;
if (pin_pid_num > 0)
reserve_user_space_map_pages();
}
int finish_pin_mem_dump(void)
{
int ret;
if (!pin_mem_dump_start)
return -EFAULT;
pin_mem_dump_start->magic = PIN_MEM_DUMP_MAGIC;
memset(pin_mem_dump_start->sha_digest, 0, SHA256_DIGEST_SIZE);
ret = calculate_pin_mem_digest(pin_mem_dump_start, NULL);
if (ret) {
pr_warn("calculate pin mem digest fail:%d\n", ret);
return ret;
}
return ret;
}
EXPORT_SYMBOL_GPL(finish_pin_mem_dump);
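/*
* Record the physical addresses of the PMD huge pages backing
* [@start_addr, @end_addr) into @pme, swapping pages in and taking an
* extra reference where needed. Returns COLLECT_PAGES_FINISH on success,
* COLLECT_PAGES_NEED_CONTINUE when a non-huge mapping is found (the
* caller falls back to collect_normal_pages()), or COLLECT_PAGES_FAIL.
*/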
int collect_pmd_huge_pages(struct task_struct *task,
unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme)
{
int ret, i, res;
int index = 0;
unsigned long start = start_addr;
struct page *temp_page;
unsigned long *pte_entry = pagemap_buffer;
unsigned int count;
struct mm_struct *mm = task->mm;
while (start < end_addr) {
temp_page = NULL;
count = 0;
ret = pagemap_get(mm, pin_mem_pagewalk,
start, start + HPAGE_PMD_SIZE, pte_entry, &count);
if (ret || !count) {
pr_warn("Get huge page fail: %d.", ret);
return COLLECT_PAGES_FAIL;
}
/* For huge pages, fetch one map entry at a time. */
if ((pte_entry[0] & PM_SWAP) && (count == 1)) {
res = get_user_pages_remote(task->mm, start, 1,
FOLL_TOUCH | FOLL_GET, &temp_page, NULL, NULL);
if (!res) {
pr_warn("Swap in huge page fail.\n");
return COLLECT_PAGES_FAIL;
}
pme->phy_addr_array[index] = page_to_phys(temp_page);
start += HPAGE_PMD_SIZE;
index++;
continue;
}
if (IS_PTE_PRESENT(pte_entry[0])) {
temp_page = pfn_to_page(pte_entry[0] & PM_PFRAME_MASK);
if (PageHead(temp_page)) {
SetPageHotreplace(temp_page);
atomic_inc(&((temp_page)->_refcount));
start += HPAGE_PMD_SIZE;
pme->phy_addr_array[index] = page_to_phys(temp_page);
index++;
} else {
/* If the page is not a compound head, fall back to collecting normal pages. */
pme->nr_pages = index;
return COLLECT_PAGES_NEED_CONTINUE;
}
} else {
for (i = 1; i < count; i++) {
if (pte_entry[i] & PM_PFRAME_MASK) {
pme->nr_pages = index;
return COLLECT_PAGES_NEED_CONTINUE;
}
}
start += HPAGE_PMD_SIZE;
pme->phy_addr_array[index] = 0;
index++;
}
}
pme->nr_pages = index;
return COLLECT_PAGES_FINISH;
}
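/*
* Record the physical addresses of the 4K pages backing
* [@start_addr, @end_addr) into @pme, one PMD-sized chunk at a time.
* Returns COLLECT_PAGES_NEED_CONTINUE when a compound head page is found
* so the caller can switch to collect_pmd_huge_pages().
*/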
int collect_normal_pages(struct task_struct *task,
unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme)
{
int ret, res;
unsigned long next;
unsigned long i, nr_pages;
struct page *tmp_page;
unsigned long *phy_addr_array = pme->phy_addr_array;
unsigned int count;
unsigned long *pte_entry = pagemap_buffer;
struct mm_struct *mm = task->mm;
next = (start_addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
next = (next > end_addr) ? end_addr : next;
pme->nr_pages = 0;
while (start_addr < next) {
count = 0;
nr_pages = (PAGE_ALIGN(next) - start_addr) / PAGE_SIZE;
ret = pagemap_get(mm, pin_mem_pagewalk,
start_addr, next, pte_entry, &count);
if (ret || !count) {
pr_warn("Get user page fail: %d, count: %u.\n",
ret, count);
return COLLECT_PAGES_FAIL;
}
if (IS_PTE_PRESENT(pte_entry[0])) {
tmp_page = pfn_to_page(pte_entry[0] & PM_PFRAME_MASK);
/* If the page is a compound head, switch to collecting huge pages. */
if (PageHead(tmp_page))
return COLLECT_PAGES_NEED_CONTINUE;
if (PageTail(tmp_page)) {
start_addr = next;
pme->virt_addr = start_addr;
next = NEXT_PIN_ADDR(next, end_addr);
continue;
}
}
for (i = 0; i < count; i++) {
if (pte_entry[i] & PM_SWAP) {
res = get_user_pages_remote(task->mm, start_addr + i * PAGE_SIZE,
1, FOLL_TOUCH | FOLL_GET, &tmp_page, NULL, NULL);
if (!res) {
pr_warn("Swap in page fail.\n");
return COLLECT_PAGES_FAIL;
}
phy_addr_array[i] = page_to_phys(tmp_page);
continue;
}
if (!IS_PTE_PRESENT(pte_entry[i])) {
phy_addr_array[i] = 0;
continue;
}
tmp_page = pfn_to_page(pte_entry[i] & PM_PFRAME_MASK);
SetPageHotreplace(tmp_page);
atomic_inc(&(tmp_page->_refcount));
phy_addr_array[i] = ((pte_entry[i] & PM_PFRAME_MASK) << PAGE_SHIFT);
}
pme->nr_pages += count;
phy_addr_array += count;
start_addr = next;
next = NEXT_PIN_ADDR(next, end_addr);
}
return COLLECT_PAGES_FINISH;
}
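/* Drop the extra reference taken on each collected page and clear the record. */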
void free_pin_pages(struct page_map_entry *pme)
{
unsigned long i;
struct page *tmp_page;
if (!pme)
return;
for (i = 0; i < pme->nr_pages; i++) {
if (pme->phy_addr_array[i]) {
tmp_page = phys_to_page(pme->phy_addr_array[i]);
atomic_dec(&(tmp_page->_refcount));
pme->phy_addr_array[i] = 0;
}
}
}
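/*
* Allocate the shared pagemap walk context and the PTE buffer used by the
* collect functions. Must be called once before pin_mem_area().
*/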
int init_pagemap_read(void)
{
int ret = -ENOMEM;
if (pin_mem_pagewalk)
return 0;
mutex_lock(&pin_mem_mutex);
pin_mem_pagewalk = create_pagemap_walk();
if (!pin_mem_pagewalk)
goto out;
pagemap_buffer = kmalloc(((PMD_SIZE >> PAGE_SHIFT) + 1) *
sizeof(unsigned long), GFP_KERNEL);
if (!pagemap_buffer)
goto free;
ret = 0;
out:
mutex_unlock(&pin_mem_mutex);
return ret;
free:
free_pagemap_walk(pin_mem_pagewalk);
pin_mem_pagewalk = NULL;
goto out;
}
EXPORT_SYMBOL_GPL(init_pagemap_read);
/* Callers must ensure that the pinned memory belongs to an anonymous vma. */
int pin_mem_area(struct task_struct *task, struct mm_struct *mm,
unsigned long start_addr, unsigned long end_addr)
{
int pid, ret;
int is_huge_page = false;
unsigned int page_size;
unsigned long nr_pages, flags;
struct page_map_entry *pme = NULL;
struct page_map_info *pmi;
struct vm_area_struct *vma;
unsigned long i;
struct page *tmp_page;
if (!page_map_entry_start
|| !task || !mm
|| start_addr >= end_addr || !pin_mem_pagewalk)
return -EFAULT;
pid = task->pid;
spin_lock_irqsave(&page_map_entry_lock, flags);
nr_pages = ((end_addr - start_addr) / PAGE_SIZE);
if ((unsigned long)page_map_entry_start +
nr_pages * sizeof(unsigned long) +
sizeof(struct page_map_entry) >= page_map_entry_end) {
pr_warn("Page map entry use up!\n");
ret = -ENOMEM;
goto finish;
}
vma = find_extend_vma(mm, start_addr);
if (!vma) {
pr_warn("Find no match vma!\n");
ret = -EFAULT;
goto finish;
}
if (start_addr == (start_addr & HPAGE_PMD_MASK) &&
transparent_hugepage_active(vma)) {
page_size = HPAGE_PMD_SIZE;
is_huge_page = true;
} else {
page_size = PAGE_SIZE;
}
pme = page_map_entry_start;
pme->virt_addr = start_addr;
pme->redirect_start = 0;
pme->is_huge_page = is_huge_page;
memset(pme->phy_addr_array, 0, nr_pages * sizeof(unsigned long));
down_read(&mm->mmap_lock);
if (!is_huge_page) {
ret = collect_normal_pages(task, start_addr, end_addr, pme);
if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) {
if (ret == COLLECT_PAGES_FINISH) {
ret = 0;
up_read(&mm->mmap_lock);
goto finish;
}
pme->is_huge_page = true;
page_size = HPAGE_PMD_SIZE;
ret = collect_pmd_huge_pages(task, pme->virt_addr, end_addr, pme);
}
} else {
ret = collect_pmd_huge_pages(task, start_addr, end_addr, pme);
if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) {
if (ret == COLLECT_PAGES_FINISH) {
ret = 0;
up_read(&mm->mmap_lock);
goto finish;
}
pme->is_huge_page = false;
page_size = PAGE_SIZE;
ret = collect_normal_pages(task, pme->virt_addr, end_addr, pme);
}
}
up_read(&mm->mmap_lock);
if (ret == COLLECT_PAGES_FAIL) {
ret = -EFAULT;
goto finish;
}
/* check for zero pages */
for (i = 0; i < pme->nr_pages; i++) {
tmp_page = phys_to_page(pme->phy_addr_array[i]);
if (!pme->is_huge_page) {
if (page_to_pfn(tmp_page) == my_zero_pfn(pme->virt_addr + i * PAGE_SIZE))
pme->phy_addr_array[i] = 0;
} else if (is_huge_zero_page(tmp_page))
pme->phy_addr_array[i] = 0;
}
page_map_entry_start = (struct page_map_entry *)(next_pme(pme));
pmi = get_page_map_info(pid);
if (!pmi)
pmi = create_page_map_info(pid);
if (!pmi) {
pr_warn("Create page map info fail for pid: %d!\n", pid);
ret = -EFAULT;
goto finish;
}
if (!pmi->pme)
pmi->pme = pme;
pmi->entry_num++;
spin_unlock_irqrestore(&page_map_entry_lock, flags);
if (ret == COLLECT_PAGES_NEED_CONTINUE)
ret = pin_mem_area(task, mm, pme->virt_addr + pme->nr_pages * page_size, end_addr);
return ret;
finish:
if (ret)
free_pin_pages(pme);
spin_unlock_irqrestore(&page_map_entry_lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(pin_mem_area);
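/*
* Re-establish the anonymous 4K mappings recorded in @pme inside @mm.
* Redirected pages are copied into freshly allocated pages before being
* mapped.
*/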
vm_fault_t remap_normal_pages(struct mm_struct *mm, struct vm_area_struct *vma,
struct page_map_entry *pme)
{
int ret;
unsigned int j, i;
pgd_t *pgd;
p4d_t *p4d;
pmd_t *pmd;
pud_t *pud;
struct page *page, *new;
unsigned long address;
unsigned long phy_addr;
unsigned int redirect_pages = 0;
struct redirect_info *redirect_start;
redirect_start = (struct redirect_info *)pme->redirect_start;
for (j = 0; j < pme->nr_pages; j++) {
address = pme->virt_addr + j * PAGE_SIZE;
phy_addr = pme->phy_addr_array[j];
if (!phy_addr)
continue;
page = phys_to_page(phy_addr);
if (page_to_pfn(page) == my_zero_pfn(address)) {
pme->phy_addr_array[j] = 0;
continue;
}
pme->phy_addr_array[j] = 0;
if (redirect_start && (redirect_pages < redirect_start->redirect_pages) &&
(j == redirect_start->redirect_index[redirect_pages])) {
new = alloc_zeroed_user_highpage_movable(vma, address);
if (!new) {
pr_warn("Redirect alloc page fail\n");
continue;
}
copy_page(page_to_virt(new), phys_to_virt(phy_addr));
page = new;
redirect_pages++;
}
page->mapping = NULL;
pgd = pgd_offset(mm, address);
ret = VM_FAULT_OOM;
p4d = p4d_alloc(mm, pgd, address);
if (!p4d)
goto free;
pud = pud_alloc(mm, p4d, address);
if (!pud)
goto free;
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
goto free;
ret = do_anon_page_remap(vma, address, pmd, page);
if (ret)
goto free;
ClearPageHotreplace(page);
}
return 0;
free:
ClearPageHotreplace(page);
for (i = j; i < pme->nr_pages; i++) {
phy_addr = pme->phy_addr_array[i];
if (phy_addr) {
put_page(phys_to_page(phy_addr));
pme->phy_addr_array[i] = 0;
}
}
return ret;
}
static inline gfp_t get_hugepage_gfpmask(struct vm_area_struct *vma)
{
const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
__GFP_KSWAPD_RECLAIM);
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
0);
return GFP_TRANSHUGE_LIGHT;
}
vm_fault_t remap_huge_pmd_pages(struct mm_struct *mm, struct vm_area_struct *vma,
struct page_map_entry *pme)
{
int ret;
unsigned int j, i;
pgd_t *pgd;
p4d_t *p4d;
pmd_t *pmd;
pud_t *pud;
gfp_t gfp;
struct page *page, *new;
unsigned long address;
unsigned long phy_addr;
unsigned int redirect_pages = 0;
struct redirect_info *redirect_start;
redirect_start = (struct redirect_info *)pme->redirect_start;
for (j = 0; j < pme->nr_pages; j++) {
address = pme->virt_addr + j * HPAGE_PMD_SIZE;
phy_addr = pme->phy_addr_array[j];
if (!phy_addr)
continue;
page = phys_to_page(phy_addr);
if (is_huge_zero_page(page)) {
pme->phy_addr_array[j] = 0;
continue;
}
pme->phy_addr_array[j] = 0;
if (redirect_start && (redirect_pages < redirect_start->redirect_pages) &&
(j == redirect_start->redirect_index[redirect_pages])) {
gfp = get_hugepage_gfpmask(vma);
new = alloc_hugepage_vma(gfp, vma, address, HPAGE_PMD_ORDER);
if (!new) {
pr_warn("Redirect alloc huge page fail\n");
continue;
}
memcpy(page_to_virt(new), phys_to_virt(phy_addr), HPAGE_PMD_SIZE);
page = new;
redirect_pages++;
}
pgd = pgd_offset(mm, address);
ret = VM_FAULT_OOM;
p4d = p4d_alloc(mm, pgd, address);
if (!p4d)
goto free;
pud = pud_alloc(mm, p4d, address);
if (!pud)
goto free;
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
goto free;
ret = do_anon_huge_page_remap(vma, address, pmd, page);
if (ret)
goto free;
ClearPageHotreplace(page);
}
return 0;
free:
ClearPageHotreplace(page);
for (i = j; i < pme->nr_pages; i++) {
phy_addr = pme->phy_addr_array[i];
if (phy_addr) {
page = phys_to_page(phy_addr);
if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
put_page(page);
pme->phy_addr_array[i] = 0;
}
}
}
return ret;
}
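/*
* After a remap failure, put the pages of the remaining (not yet remapped)
* entries; the failing entry itself is cleaned up by the remap function.
*/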
static void free_unmap_pages(struct page_map_info *pmi,
struct page_map_entry *pme,
unsigned int index)
{
unsigned int i, j;
unsigned long phy_addr;
struct page *page;
pme = (struct page_map_entry *)(next_pme(pme));
for (i = index; i < pmi->entry_num; i++) {
for (j = 0; j < pme->nr_pages; j++) {
phy_addr = pme->phy_addr_array[j];
if (phy_addr) {
page = phys_to_page(phy_addr);
if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
put_page(page);
pme->phy_addr_array[j] = 0;
}
}
}
pme = (struct page_map_entry *)(next_pme(pme));
}
}
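/*
* Restore path: walk the vmas of @mm and remap every page map entry
* recorded for @pid. The record is marked so clear_pin_memory_record()
* will not free its pages afterwards.
*/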
vm_fault_t do_mem_remap(int pid, struct mm_struct *mm)
{
unsigned int i = 0;
vm_fault_t ret = 0;
struct vm_area_struct *vma;
struct page_map_info *pmi;
struct page_map_entry *pme;
unsigned long flags;
if (reserve_user_map_pages_fail || !mm)
return -EFAULT;
spin_lock_irqsave(&page_map_entry_lock, flags);
pmi = get_page_map_info(pid);
if (pmi)
pmi->disable_free_page = true;
spin_unlock_irqrestore(&page_map_entry_lock, flags);
if (!pmi)
return -EFAULT;
down_write(&mm->mmap_lock);
pme = pmi->pme;
vma = mm->mmap;
while ((i < pmi->entry_num) && (vma != NULL)) {
if (pme->virt_addr >= vma->vm_start && pme->virt_addr < vma->vm_end) {
i++;
if (!vma_is_anonymous(vma)) {
pme = (struct page_map_entry *)(next_pme(pme));
continue;
}
if (!pme->is_huge_page) {
ret = remap_normal_pages(mm, vma, pme);
if (ret < 0)
goto free;
} else {
ret = remap_huge_pmd_pages(mm, vma, pme);
if (ret < 0)
goto free;
}
pme = (struct page_map_entry *)(next_pme(pme));
} else {
vma = vma->vm_next;
}
}
up_write(&mm->mmap_lock);
return 0;
free:
free_unmap_pages(pmi, pme, i);
up_write(&mm->mmap_lock);
return ret;
}
EXPORT_SYMBOL_GPL(do_mem_remap);
static void free_all_reserved_pages(void)
{
unsigned int i, j, index;
struct page_map_info *pmi;
struct page_map_entry *pme;
struct page *page;
unsigned long phy_addr;
if (!user_space_reserve_start || reserve_user_map_pages_fail)
return;
for (index = 0; index < pin_pid_num; index++) {
pmi = &(user_space_reserve_start[index]);
if (pmi->disable_free_page)
continue;
pme = pmi->pme;
for (i = 0; i < pmi->entry_num; i++) {
for (j = 0; j < pme->nr_pages; j++) {
phy_addr = pme->phy_addr_array[j];
if (phy_addr) {
page = phys_to_page(phy_addr);
ClearPageHotreplace(page);
if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
put_page(page);
pme->phy_addr_array[j] = 0;
}
}
}
pme = (struct page_map_entry *)next_pme(pme);
}
}
}
/* Clear all pin memory records. */
void clear_pin_memory_record(void)
{
unsigned long flags;
spin_lock_irqsave(&page_map_entry_lock, flags);
free_all_reserved_pages();
if (pin_pid_num_addr) {
*pin_pid_num_addr = 0;
pin_pid_num = 0;
page_map_entry_start = __page_map_entry_start;
}
spin_unlock_irqrestore(&page_map_entry_lock, flags);
}
EXPORT_SYMBOL_GPL(clear_pin_memory_record);
static struct resource pin_memory_resource = {
.name = "Pin memory",
.start = 0,
.end = 0,
.flags = IORESOURCE_MEM,
.desc = IORES_DESC_RESERVED
};
static unsigned long long pin_mem_start;
static unsigned long long pin_mem_len;
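/*
* The reserved region is described on the kernel command line as
* pinmemory=size[@start], e.g. (illustrative values only):
* pinmemory=200M@0x100000000
*/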
static int __init parse_pin_memory(char *cmdline)
{
char *cur = cmdline;
if (!cmdline)
return 0;
pin_mem_len = memparse(cmdline, &cur);
if (cmdline == cur) {
pr_warn("pinmem: memory value expected\n");
return -EINVAL;
}
if (*cur == '@')
pin_mem_start = memparse(cur+1, &cur);
else if (*cur != ' ' && *cur != '\0') {
pr_warn("pinmem: unrecognized char: %c\n", *cur);
return -EINVAL;
}
return 0;
}
early_param("pinmemory", parse_pin_memory);
void __init reserve_pin_memory_res(void)
{
unsigned long long mem_start = pin_mem_start;
unsigned long long mem_len = pin_mem_len;
if (!pin_mem_len)
return;
mem_len = PAGE_ALIGN(mem_len);
if (!memblock_is_region_memory(mem_start, mem_len)) {
pr_warn("cannot reserve for pin memory: region is not memory!\n");
return;
}
if (memblock_is_region_reserved(mem_start, mem_len)) {
pr_warn("cannot reserve for pin memory: region overlaps reserved memory!\n");
return;
}
memblock_reserve(mem_start, mem_len);
pr_debug("pin memory resource reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
mem_start, mem_start + mem_len, mem_len >> 20);
pin_memory_resource.start = mem_start;
pin_memory_resource.end = mem_start + mem_len - 1;
}
void request_pin_mem_res(struct resource *res)
{
if (pin_memory_resource.end &&
pin_memory_resource.start >= res->start &&
pin_memory_resource.end <= res->end)
request_resource(res, &pin_memory_resource);
}
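/*
* Map the reserved region and initialize the page map bookkeeping from
* its contents.
*/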
void init_reserve_page_map(void)
{
void *addr;
unsigned long map_addr, map_size;
map_addr = (unsigned long)pin_memory_resource.start;
map_size = (unsigned long)(pin_memory_resource.end - pin_memory_resource.start + 1);
if (!map_addr || !map_size)
return;
addr = phys_to_virt(map_addr);
init_page_map_info((struct pin_mem_dump_info *)addr, map_size);
}
#endif /* CONFIG_PIN_MEMORY */
#ifdef CONFIG_PID_RESERVE
struct idr *reserve_idr;
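/* Release @pid from @idr if it was reserved for a pinned-memory checkpoint task. */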
void free_reserved_pid(struct idr *idr, int pid)
{
unsigned int index;
struct page_map_info *pmi;
if (!max_pin_pid_num || idr != reserve_idr)
return;
for (index = 0; index < pin_pid_num; index++) {
pmi = &(user_space_reserve_start[index]);
if (pmi->pid == pid && pmi->pid_reserved) {
idr_remove(idr, pid);
return;
}
}
}
/* Reserve pids for checkpoint tasks that pinned memory. */
void reserve_pids(struct idr *idr, int pid_max)
{
int alloc_pid;
unsigned int index;
struct page_map_info *pmi;
if (!pin_pid_num || !max_pin_pid_num)
return;
reserve_idr = idr;
for (index = 0; index < pin_pid_num; index++) {
pmi = &(user_space_reserve_start[index]);
pmi->pid_reserved = true;
alloc_pid = idr_alloc(idr, NULL, pmi->pid, pid_max, GFP_ATOMIC);
if (alloc_pid != pmi->pid) {
if (alloc_pid > 0)
idr_remove(idr, alloc_pid);
pr_warn("Reserve pid (%d) fail, real pid is %d.\n", alloc_pid, pmi->pid);
pmi->pid_reserved = false;
continue;
}
}
}
#endif /* CONFIG_PID_RESERVE */