diff --git a/kernel/arch/riscv64/mmu.cpp b/kernel/arch/riscv64/mmu.cpp index ea3ec4e0b..005ff0e21 100644 --- a/kernel/arch/riscv64/mmu.cpp +++ b/kernel/arch/riscv64/mmu.cpp @@ -53,9 +53,6 @@ static unsigned long vm_prots_to_mmu(unsigned int prots) #define RISCV_MMU_FLAGS_TO_SAVE_ON_MPROTECT \ (RISCV_MMU_GLOBAL | RISCV_MMU_USER | RISCV_MMU_ACCESSED | RISCV_MMU_DIRTY | RISCV_MMU_SPECIAL) -void *paging_map_phys_to_virt(struct mm_address_space *as, uint64_t virt, uint64_t phys, - uint64_t prot, struct vm_area_struct *vma); - static inline void __native_tlb_invalidate_page(void *addr) { __asm__ __volatile__("sfence.vma %0, zero" ::"r"(addr)); @@ -71,9 +68,6 @@ static inline bool pte_special(u64 pte) return pte & RISCV_MMU_SPECIAL; } -bool riscv_get_pt_entry(void *addr, uint64_t **entry_ptr, bool may_create_path, - struct mm_address_space *mm); - unsigned long allocated_page_tables = 0; PML *alloc_pt(void) @@ -205,59 +199,6 @@ void paging_init(void) riscv_pt_page_mapping(VERYHUGE512GB_SIZE) | flags; } -void *paging_map_phys_to_virt(struct mm_address_space *as, uint64_t virt, uint64_t phys, - uint64_t prot, struct vm_area_struct *vma) -{ - bool user = prot & VM_USER; - - if (!as) - { - as = user ? get_current_address_space() : &kernel_address_space; - assert(as != nullptr); - } - - uint64_t *ptentry; - - if (!riscv_get_pt_entry((void *) virt, &ptentry, true, as)) - return nullptr; - - uint64_t page_prots = vm_prots_to_mmu(prot); - bool special_mapping = phys == (u64) page_to_phys(vm_get_zero_page()); - - if (special_mapping) - page_prots |= RISCV_MMU_SPECIAL; - - if (prot & VM_DONT_MAP_OVER && *ptentry & RISCV_MMU_VALID) - return (void *) virt; - - uint64_t old = *ptentry; - *ptentry = riscv_pt_page_mapping(phys) | page_prots; - if (pte_empty(old)) - { - increment_vm_stat(as, resident_set_size, PAGE_SIZE); - } - else - { - __native_tlb_invalidate_page((void *) PML_EXTRACT_ADDRESS(*ptentry)); - } - - if (!vma_is_pfnmap(vma)) - { - if (!pte_empty(old) && !pte_special(old)) - { - /* If old was a thing, decrement the mapcount */ - struct page *oldp = phys_to_page(PML_EXTRACT_ADDRESS(old)); - page_sub_mapcount(oldp); - } - - struct page *newp = phys_to_page(phys); - if (!special_mapping) - page_add_mapcount(newp); - } - - return (void *) virt; -} - bool pml_is_empty(const PML *pml) { for (int i = 0; i < 512; i++) @@ -269,52 +210,6 @@ bool pml_is_empty(const PML *pml) return true; } -struct pt_location -{ - PML *table; - unsigned int index; -}; - -bool riscv_get_pt_entry_with_ptables(void *addr, uint64_t **entry_ptr, struct mm_address_space *mm, - struct pt_location location[4]) -{ - unsigned long virt = (unsigned long) addr; - unsigned int indices[riscv_max_paging_levels]; - - for (unsigned int i = 0; i < riscv_paging_levels; i++) - { - indices[i] = (virt >> 12) >> (i * 9) & 0x1ff; - location[4 - 1 - i].index = indices[i]; - } - - PML *pml = (PML *) ((unsigned long) mm->arch_mmu.top_pt + PHYS_BASE); - unsigned int location_index = 0; - - for (unsigned int i = riscv_paging_levels; i != 1; i--) - { - uint64_t entry = pml->entries[indices[i - 1]]; - location[location_index].table = pml; - location[location_index++].index = indices[i - 1]; - - if (entry & RISCV_MMU_VALID) - { - void *page = (void *) PML_EXTRACT_ADDRESS(entry); - pml = (PML *) PHYS_TO_VIRT(page); - } - else - { - return false; - } - } - - location[location_index].table = pml; - location[location_index++].index = indices[0]; - - *entry_ptr = &pml->entries[indices[0]]; - - return true; -} - /** * @brief Clone the architecture specific part 
of an address space * @@ -355,83 +250,6 @@ static void dump_pt(PML *pt) printk("%016lx\n", entry); } -bool riscv_get_pt_entry(void *addr, uint64_t **entry_ptr, bool may_create_path, - struct mm_address_space *mm) -{ - unsigned long virt = (unsigned long) addr; - unsigned int indices[riscv_max_paging_levels]; - - addr_to_indices(virt, indices); - - PML *pml = (PML *) ((unsigned long) mm->arch_mmu.top_pt + PHYS_BASE); - - for (unsigned int i = riscv_paging_levels; i != 1; i--) - { - uint64_t entry = pml->entries[indices[i - 1]]; - if (entry & RISCV_MMU_VALID) - { - void *page = (void *) PML_EXTRACT_ADDRESS(entry); - pml = (PML *) PHYS_TO_VIRT(page); - } - else - { - if (!may_create_path) - return false; - - PML *pt = alloc_pt(); - - if (!pt) - return false; - increment_vm_stat(mm, page_tables_size, PAGE_SIZE); - - pml->entries[indices[i - 1]] = riscv_make_pt_entry_page_table(pt); - __asm__ __volatile__("sfence.vma zero, zero"); - - pml = (PML *) PHYS_TO_VIRT(pt); - } - } - - *entry_ptr = &pml->entries[indices[0]]; - - return true; -} - -bool __paging_change_perms(struct mm_address_space *mm, void *addr, int prot) -{ - MUST_HOLD_MUTEX(&mm->vm_lock); - - uint64_t *entry; - if (!riscv_get_pt_entry(addr, &entry, false, mm)) - { - return false; - } - - uint64_t pt_entry = *entry; - uint64_t perms = pt_entry & RISCV_MMU_FLAGS_TO_SAVE_ON_MPROTECT; - uint64_t page = PML_EXTRACT_ADDRESS(pt_entry); - - if (prot & VM_EXEC) - perms |= RISCV_MMU_EXECUTE; - if (prot & VM_WRITE) - perms |= RISCV_MMU_WRITE; - if (prot & VM_READ) - perms |= RISCV_MMU_VALID | RISCV_MMU_READ; - *entry = perms | page; - - return true; -} - -bool paging_write_protect(void *addr, struct mm_address_space *mm) -{ - uint64_t *ptentry; - if (!riscv_get_pt_entry(addr, &ptentry, false, mm)) - return false; - - *ptentry = *ptentry & ~RISCV_MMU_WRITE; - - return true; -} - int is_invalid_arch_range(void *address, size_t pages) { unsigned long addr = (unsigned long) address; @@ -499,22 +317,6 @@ void paging_invalidate(void *page, size_t pages) } } -/** - * @brief Directly maps a page into the paging tables. - * - * @param as The target address space. - * @param virt The virtual address. - * @param phys The physical address of the page. - * @param prot Desired protection flags. - * @param vma VMA for this mapping (optional) - * @return NULL if out of memory, else virt. 
- */ -void *vm_map_page(struct mm_address_space *as, uint64_t virt, uint64_t phys, uint64_t prot, - struct vm_area_struct *vma) -{ - return paging_map_phys_to_virt(as, virt, phys, prot, vma); -} - void paging_free_pml2(PML *pml) { for (int i = 0; i < 512; i++) @@ -566,42 +368,6 @@ void paging_free_page_tables(struct mm_address_space *mm) free_page(phys_to_page((unsigned long) mm->arch_mmu.top_pt)); } -unsigned long __get_mapping_info(void *addr, struct mm_address_space *as) -{ - unsigned long *ppt_entry; - // TODO: Recognize hugepages here - if (!riscv_get_pt_entry(addr, &ppt_entry, false, as)) - return PAGE_NOT_PRESENT; - - unsigned long pt_entry = *ppt_entry; - - unsigned long ret = 0; - - if (pt_entry & RISCV_MMU_VALID) - ret |= PAGE_PRESENT; - else - { - return PAGE_NOT_PRESENT; - } - - if (pt_entry & RISCV_MMU_USER) - ret |= PAGE_USER; - if (pt_entry & RISCV_MMU_WRITE) - ret |= PAGE_WRITABLE; - if (pt_entry & RISCV_MMU_EXECUTE) - ret |= PAGE_EXECUTABLE; - if (pt_entry & RISCV_MMU_DIRTY) - ret |= PAGE_DIRTY; - if (pt_entry & RISCV_MMU_ACCESSED) - ret |= PAGE_ACCESSED; - if (pt_entry & RISCV_MMU_GLOBAL) - ret |= PAGE_GLOBAL; - - ret |= PML_EXTRACT_ADDRESS(pt_entry); - - return ret; -} - /** * @brief Free the architecture dependent parts of the address space. * Called on address space destruction. @@ -633,158 +399,6 @@ void vm_save_current_mmu(struct mm_address_space *mm) mm->arch_mmu.top_pt = get_current_page_tables(); } -/** - * @brief Directly mprotect a page in the paging tables. - * Called by core MM code and should not be used outside of it. - * This function handles any edge cases like trying to re-apply write perms on - * a write-protected page. - * - * @param as The target address space. - * @param addr The virtual address of the page. - * @param old_prots The old protection flags. - * @param new_prots The new protection flags. - */ -void vm_mmu_mprotect_page(struct mm_address_space *as, void *addr, int old_prots, int new_prots) -{ - uint64_t *ptentry; - if (!riscv_get_pt_entry(addr, &ptentry, false, as)) - return; - - if (!*ptentry) - return; - - /* Make sure we don't accidentally mark a page as writable when - * it's write-protected and we're changing some other bits. - * For example: mprotect(PROT_EXEC) on a COW'd supposedly writable - * page would try to re-apply the writable permission. - */ - - /* In this function, we use the old_prots parameter to know whether it was a write-protected - * page. 
- */ - bool is_wp_page = !(*ptentry & RISCV_MMU_WRITE) && old_prots & VM_WRITE; - - if (is_wp_page) - { - new_prots &= ~VM_WRITE; - // printk("NOT VM_WRITING\n"); - } - - // printk("new prots: %x\n", new_prots); - - unsigned long paddr = PML_EXTRACT_ADDRESS(*ptentry); - - uint64_t page_prots = vm_prots_to_mmu(new_prots); - *ptentry = riscv_pt_page_mapping(paddr) | page_prots; -} - -class page_table_iterator -{ -private: - unsigned long curr_addr_; - size_t length_; - -public: - struct mm_address_space *as_; - -#ifdef CONFIG_PT_ITERATOR_HAVE_DEBUG - bool debug; -#endif - - page_table_iterator(unsigned long virt, size_t len, struct mm_address_space *as) - : curr_addr_{virt}, length_{len}, as_{as} - -#ifdef CONFIG_PT_ITERATOR_HAVE_DEBUG - , - debug{false} -#endif - - { - } - - size_t length() const - { - return length_; - } - - unsigned long curr_addr() const - { - return curr_addr_; - } - - void adjust_length(size_t size) - { - if (size > length_) - { - length_ = 0; - curr_addr_ += length_; - } - else - { - length_ -= size; - curr_addr_ += size; - } - } -}; - -struct tlb_invalidation_tracker -{ - unsigned long virt_start; - unsigned long virt_end; - bool is_started, is_flushed; - - explicit tlb_invalidation_tracker() : virt_start{}, virt_end{}, is_started{}, is_flushed{} - { - } - - void invalidate_tracker() - { - virt_start = 0xDEADDAD; - virt_end = 0xB0; - is_started = false; - is_flushed = false; - } - - void flush() - { - if (!is_started) - return; - - vm_invalidate_range(virt_start, (virt_end - virt_start) >> PAGE_SHIFT); - invalidate_tracker(); - } - - constexpr void init(unsigned long vaddr, size_t size) - { - is_started = true; - virt_start = vaddr; - virt_end = vaddr + size; - is_flushed = false; - } - - void add_page(unsigned long vaddr, size_t size) - { - /* If we've already started on a run of pages and this one is contiguous, just set the tail - */ - if (is_started && virt_end == vaddr) - { - virt_end = vaddr + size; - } - else - { - /* Else, try flushing if is_started == true and restart the page run */ - flush(); - init(vaddr, size); - } - } - - ~tlb_invalidation_tracker() - { - if (is_started && !is_flushed) - flush(); - } -}; - enum page_table_levels : unsigned int { PT_LEVEL, @@ -815,135 +429,6 @@ constexpr unsigned int addr_get_index(unsigned long virt, unsigned int pt_level) return (virt >> 12) >> (pt_level * 9) & 0x1ff; } -#define MMU_UNMAP_CAN_FREE_PML 1 -#define MMU_UNMAP_OK 0 - -static int riscv_mmu_unmap(PML *table, unsigned int pt_level, page_table_iterator &it, - struct vm_area_struct *vma) -{ - unsigned int index = addr_get_index(it.curr_addr(), pt_level); - - /* Get the size that each entry represents here */ - auto entry_size = level_to_entry_size(pt_level); - - tlb_invalidation_tracker invd_tracker; - unsigned int i; - -#ifdef CONFIG_PT_ITERATOR_HAVE_DEBUG - if (it.debug) - { - printk("level %u - index %x\n", pt_level, index); - } -#endif - - for (i = index; i < PAGE_TABLE_ENTRIES && it.length(); i++) - { - auto &pt_entry = table->entries[i]; - bool is_pte_empty = pte_empty(pt_entry); - - if (is_pte_empty) - { - -#ifdef CONFIG_RISCV_MMU_UNMAP_DEBUG - if (it.debug) - printk("not present @ level %u\nentry size %lu\nlength %lu\n", pt_level, entry_size, - it.length()); -#endif - auto to_skip = entry_size - (it.curr_addr() & (entry_size - 1)); - -#ifdef CONFIG_PT_ITERATOR_HAVE_DEBUG - if (it.debug) - { - printk("[level %u]: Skipping from %lx to %lx\n", pt_level, it.curr_addr(), - it.curr_addr() + to_skip); - } -#endif - - it.adjust_length(to_skip); - continue; - } - 
- bool is_huge_page = is_huge_page_level(pt_level) && pt_entry_is_huge(pt_entry); - - if (pt_level == PT_LEVEL || is_huge_page) - { - /* TODO: Handle huge page splitting */ - -#ifdef CONFIG_PT_ITERATOR_HAVE_DEBUG - if (it.debug) - printk("Unmapping %lx\n", it.curr_addr()); -#endif - - unsigned long val = 0; - __atomic_exchange(&pt_entry, &val, &val, __ATOMIC_RELEASE); - - if (val & RISCV_MMU_ACCESSED) - invd_tracker.add_page(it.curr_addr(), entry_size); - - if (!vma_is_pfnmap(vma) && !pte_special(val)) - { - struct page *oldp = phys_to_page(PML_EXTRACT_ADDRESS(val)); - page_sub_mapcount(oldp); - } - - it.adjust_length(entry_size); - decrement_vm_stat(it.as_, resident_set_size, entry_size); - } - else - { - assert((pt_entry & RISCV_MMU_VALID) != 0); - PML *next_table = (PML *) PHYS_TO_VIRT(PML_EXTRACT_ADDRESS(pt_entry)); - int st = riscv_mmu_unmap(next_table, pt_level - 1, it, vma); - - if (st == MMU_UNMAP_CAN_FREE_PML) - { - auto page = phys_to_page(PML_EXTRACT_ADDRESS(pt_entry)); - - pt_entry = 0; - - COMPILER_BARRIER(); - - free_page(page); - __atomic_sub_fetch(&allocated_page_tables, 1, __ATOMIC_RELAXED); - decrement_vm_stat(it.as_, page_tables_size, PAGE_SIZE); - } - } - } - - /* We can know that the table is 100% empty if we ran through the table */ - bool unmapped_whole_table = index == 0 && i == PAGE_TABLE_ENTRIES; - - /* Don't bother to free the PML or even check if it's empty if we're the top paging structure */ - if (pt_level != riscv_paging_levels - 1 && (unmapped_whole_table || pml_is_empty(table))) - { - return MMU_UNMAP_CAN_FREE_PML; - } - -#if 0 - printk("nr entries %lu\n", nr_entries); - - printk("unmapping %lu\n", it.length()); -#endif - - return MMU_UNMAP_OK; -} - -int vm_mmu_unmap(struct mm_address_space *as, void *addr, size_t pages, struct vm_area_struct *vma) -{ - unsigned long virt = (unsigned long) addr; - size_t size = pages << PAGE_SHIFT; - - page_table_iterator it{virt, size, as}; - - PML *first_level = (PML *) PHYS_TO_VIRT(as->arch_mmu.top_pt); - - riscv_mmu_unmap(first_level, riscv_paging_levels - 1, it, vma); - - assert(it.length() == 0); - - return 0; -} - static inline bool is_higher_half(unsigned long address) { return address >= VM_HIGHER_HALF; @@ -1022,14 +507,12 @@ static void mmu_acct_page_table(PML *pt, page_table_levels level, mmu_acct &acct { acct.page_table_size += PAGE_SIZE; - for (const auto pte : pt->entries) + for (int i = 0; i < (level == PML4_LEVEL ? 
256 : 512); i++) { + u64 pte = pt->entries[i]; if (pte_empty(pte)) continue; - if (!(pte & RISCV_MMU_USER)) - continue; - if (level != PT_LEVEL) { mmu_acct_page_table((PML *) PHYS_TO_VIRT(PML_EXTRACT_ADDRESS(pte)), @@ -1056,147 +539,3 @@ void mmu_verify_address_space_accounting(mm_address_space *as) assert(acct.page_table_size == as->page_tables_size); assert(acct.resident_set_size == as->resident_set_size); } - -static int riscv_mmu_fork(PML *parent_table, PML *child_table, unsigned int pt_level, - page_table_iterator &it, struct vm_area_struct *old_region) -{ - unsigned int index = addr_get_index(it.curr_addr(), pt_level); - - /* Get the size that each entry represents here */ - auto entry_size = level_to_entry_size(pt_level); - - unsigned int i; - -#ifdef CONFIG_PT_ITERATOR_HAVE_DEBUG - if (it.debug) - { - printk("level %u - index %x\n", pt_level, index); - } -#endif - tlb_invalidation_tracker invd_tracker; - - for (i = index; i < PAGE_TABLE_ENTRIES && it.length(); i++) - { - const u64 pt_entry = parent_table->entries[i]; - bool pte_empty = pt_entry == 0; - - if (pte_empty) - { - -#ifdef CONFIG_X86_MMU_UNMAP_DEBUG - if (it.debug) - printk("not present @ level %u\nentry size %lu\nlength %lu\n", pt_level, entry_size, - it.length()); -#endif - auto to_skip = entry_size - (it.curr_addr() & (entry_size - 1)); - -#ifdef CONFIG_PT_ITERATOR_HAVE_DEBUG - if (it.debug) - { - printk("[level %u]: Skipping from %lx to %lx\n", pt_level, it.curr_addr(), - it.curr_addr() + to_skip); - } -#endif - - it.adjust_length(to_skip); - continue; - } - - bool is_huge_page = is_huge_page_level(pt_level) && pt_entry_is_huge(pt_entry); - - if (pt_level == PT_LEVEL || is_huge_page) - { - const bool should_cow = old_region->vm_maptype == MAP_PRIVATE; - child_table->entries[i] = pt_entry & (should_cow ? ~RISCV_MMU_WRITE : ~0UL); - if (!vma_is_pfnmap(old_region) && !pte_special(pt_entry)) - page_add_mapcount(phys_to_page(PML_EXTRACT_ADDRESS(pt_entry))); - if (should_cow) - { - /* Write-protect the parent's page too. Make sure to invalidate the TLB if we - * downgraded permissions. 
- */ - __atomic_store_n(&parent_table->entries[i], pt_entry & ~RISCV_MMU_WRITE, - __ATOMIC_RELAXED); - - if (pt_entry & RISCV_MMU_WRITE) - invd_tracker.add_page(it.curr_addr(), entry_size); - } - - increment_vm_stat(it.as_, resident_set_size, entry_size); - it.adjust_length(entry_size); - } - else - { - assert((pt_entry & RISCV_MMU_VALID) != 0); - - PML *old = (PML *) PHYS_TO_VIRT(PML_EXTRACT_ADDRESS(pt_entry)); - PML *child_pt = (PML *) PHYS_TO_VIRT(PML_EXTRACT_ADDRESS(child_table->entries[i])); - - if (child_table->entries[i] != 0) - { - /* Allocate a new page table for the child process */ - PML *copy = (PML *) alloc_pt(); - if (!copy) - return -ENOMEM; - - increment_vm_stat(it.as_, page_tables_size, PAGE_SIZE); - - const unsigned long old_prots = pt_entry & RISCV_PAGING_PROT_BITS; - /* Set the PTE */ - child_table->entries[i] = (unsigned long) copy | old_prots; - child_pt = (PML *) PHYS_TO_VIRT(copy); - } - - int st = riscv_mmu_fork(old, child_pt, pt_level - 1, it, old_region); - - if (st < 0) - { - return st; - } - } - } - - return 0; -} - -/** - * @brief Fork MMU page tables - * - * @param old_region Old vm_area_struct - * @param addr_space Current address space - * @return 0 on success, negative error codes - */ -int mmu_fork_tables(struct vm_area_struct *old_region, struct mm_address_space *addr_space) -{ - page_table_iterator it{old_region->vm_start, vma_pages(old_region) << PAGE_SHIFT, addr_space}; - - return riscv_mmu_fork((PML *) PHYS_TO_VIRT(old_region->vm_mm->arch_mmu.top_pt), - (PML *) PHYS_TO_VIRT(addr_space->arch_mmu.top_pt), - riscv_paging_levels - 1, it, old_region); -} - -unsigned int mmu_get_clear_referenced(struct mm_address_space *mm, void *addr, struct page *page) -{ - scoped_lock g{mm->page_table_lock}; - - u64 *ptep; - if (!riscv_get_pt_entry(addr, &ptep, false, mm)) - return 0; - - u64 pte = READ_ONCE(*ptep); - u64 new_pte; - do - { - if (!(pte & RISCV_MMU_ACCESSED)) - return 0; - if (PML_EXTRACT_ADDRESS(pte) != (unsigned long) page_to_phys(page)) - return 0; - new_pte = pte & ~RISCV_MMU_ACCESSED; - } while (!__atomic_compare_exchange_n(ptep, &pte, new_pte, false, __ATOMIC_RELAXED, - __ATOMIC_RELAXED)); - /* Architectural note: We don't need to flush the TLB. Flushing the TLB is required by riscv if - * we want the A bit to be set again, but we can just wait for an unrelated TLB flush (e.g - * context switch) to do the job for us. A TLB shootdown is too much overhead for this purpose. 
- */ - return 1; -} diff --git a/kernel/include/onyx/mm/pgtable-nop4d.h b/kernel/include/onyx/mm/pgtable-nop4d.h new file mode 100644 index 000000000..da7d822da --- /dev/null +++ b/kernel/include/onyx/mm/pgtable-nop4d.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024 Pedro Falcato + * This file is part of Onyx, and is released under the terms of the MIT License + * check LICENSE at the root directory for more information + * + * SPDX-License-Identifier: MIT + */ +#ifndef _ONYX_PGTABLE_NOP4D_H +#define _ONYX_PGTABLE_NOP4D_H + +#define PTRS_PER_P4D 1 + +static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long addr) +{ + return (p4d_t *) pgd; +} + +static inline bool pgd_none(pgd_t pgd) +{ + return false; +} + +static inline bool pgd_present(pgd_t pgd) +{ + return true; +} + +static inline bool p4d_folded(void) +{ + return true; +} + +#endif diff --git a/kernel/include/onyx/riscv/include/platform/pgtable.h b/kernel/include/onyx/riscv/include/platform/pgtable.h new file mode 100644 index 000000000..d9d94e7d7 --- /dev/null +++ b/kernel/include/onyx/riscv/include/platform/pgtable.h @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2024 Pedro Falcato + * This file is part of Onyx, and is released under the terms of the MIT License + * check LICENSE at the root directory for more information + * + * SPDX-License-Identifier: MIT + */ +#ifndef _ONYX_PGTABLE_ARCH_H +#define _ONYX_PGTABLE_ARCH_H + +#include + +#include +#include +#include +#include + +__BEGIN_CDECLS + +typedef u64 pgdval_t; +typedef u64 p4dval_t; +typedef u64 pudval_t; +typedef u64 pmdval_t; +typedef u64 pteval_t; +typedef u64 pgprotval_t; + +#define PTE_GET_ADDR(n) ((n >> 10) << 12) +#define _PAGE_PRESENT (1 << 0) +#define _PAGE_READ (1 << 1) +#define _PAGE_WRITE (1 << 2) +#define _PAGE_EXEC (1 << 3) +#define _PAGE_USER (1 << 4) +#define _PAGE_GLOBAL (1 << 5) +#define _PAGE_ACCESSED (1 << 6) +#define _PAGE_DIRTY (1 << 7) +/* Use one of the ignored bits as SPECIAL. This will annotate zero page mappings (so we don't + * increment mapcount on zero_page and thus blow it up). add_mapcount and sub_mapcount will not be + * called on these struct pages. 
*/ +#define _PAGE_SPECIAL (1 << 8) + +#define _PAGE_HUGE (_PAGE_WRITE | _PAGE_EXEC | _PAGE_READ) + +typedef struct pgd +{ + pgdval_t pgd; +} pgd_t; + +typedef struct p4d +{ + p4dval_t p4d; +} p4d_t; + +typedef struct pud +{ + pudval_t pud; +} pud_t; + +typedef struct pmd +{ + pmdval_t pmd; +} pmd_t; + +typedef struct pte +{ + pteval_t pte; +} pte_t; + +typedef struct pgprot +{ + pgprotval_t pgprot; +} pgprot_t; + +#define PTRS_PER_PGD 512 +#define PGD_SHIFT 39 + +#define PTRS_PER_P4D 1 +#define P4D_SHIFT 39 + +#define PTRS_PER_PUD 512 +#define PUD_SHIFT 30 + +#define PTRS_PER_PMD 512 +#define PMD_SHIFT 21 + +#define PTRS_PER_PTE 512 +#define PTE_SHIFT 12 + +#define __tovirt(x) (void *) (((uintptr_t) (x)) + PHYS_BASE) + +static inline unsigned long pgd_index(unsigned long addr) +{ + return (addr >> PGD_SHIFT) & (PTRS_PER_PGD - 1); +} + +static inline pgd_t *pgd_offset(struct mm_address_space *mm, unsigned long addr) +{ + return (pgd_t *) __tovirt(mm->arch_mmu.top_pt) + pgd_index(addr); +} + +#define pgd_val(x) ((x).pgd) +#define p4d_val(x) ((x).p4d) +#define pud_val(x) ((x).pud) +#define pmd_val(x) ((x).pmd) +#define pte_val(x) ((x).pte) +#define pgprot_val(x) ((x).pgprot) + +#define __pgd(x) ((pgd_t){(x)}) +#define __p4d(x) ((p4d_t){(x)}) +#define __pud(x) ((pud_t){(x)}) +#define __pmd(x) ((pmd_t){(x)}) +#define __pte(x) ((pte_t){(x)}) +#define __pgprot(x) ((pgprot_t){(x)}) + +static inline unsigned long p4d_index(unsigned long addr) +{ + return (addr >> P4D_SHIFT) & (PTRS_PER_P4D - 1); +} + +static inline unsigned long pgd_addr(pgd_t pgd) +{ + return PTE_GET_ADDR(pgd_val(pgd)); +} + +static inline unsigned long pud_index(unsigned long addr) +{ + return (addr >> PUD_SHIFT) & (PTRS_PER_PUD - 1); +} + +static inline unsigned long p4d_addr(p4d_t pgd) +{ + return PTE_GET_ADDR(p4d_val(pgd)); +} + +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long addr) +{ + return (pud_t *) __tovirt(p4d_addr(*p4d)) + pud_index(addr); +} + +static inline unsigned long pmd_index(unsigned long addr) +{ + return (addr >> PMD_SHIFT) & (PTRS_PER_PMD - 1); +} + +static inline unsigned long pud_addr(pud_t pgd) +{ + return PTE_GET_ADDR(pud_val(pgd)); +} + +static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) +{ + return (pmd_t *) __tovirt(pud_addr(*pud)) + pmd_index(addr); +} + +static inline unsigned long pte_index(unsigned long addr) +{ + return (addr >> PTE_SHIFT) & (PTRS_PER_PTE - 1); +} + +static inline unsigned long pmd_addr(pmd_t pgd) +{ + return PTE_GET_ADDR(pmd_val(pgd)); +} + +static inline pte_t *pte_offset(pmd_t *pmd, unsigned long addr) +{ + return (pte_t *) __tovirt(pmd_addr(*pmd)) + pte_index(addr); +} + +static inline unsigned long pte_addr(pte_t pgd) +{ + return PTE_GET_ADDR(pte_val(pgd)); +} + +static inline bool p4d_none(p4d_t p4d) +{ + return p4d_val(p4d) == 0; +} + +static inline bool pud_none(pud_t pud) +{ + return pud_val(pud) == 0; +} + +static inline bool pmd_none(pmd_t pmd) +{ + return pmd_val(pmd) == 0; +} + +static inline bool pte_none(pte_t pte) +{ + return pte_val(pte) == 0; +} + +static inline bool p4d_present(p4d_t p4d) +{ + return p4d_val(p4d) & _PAGE_PRESENT; +} + +static inline bool pud_present(pud_t pud) +{ + return pud_val(pud) & _PAGE_PRESENT; +} + +static inline bool pmd_present(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_PRESENT; +} + +static inline bool pte_present(pte_t pte) +{ + return pte_val(pte) & _PAGE_PRESENT; +} + +#define KERNEL_PGTBL (_PAGE_PRESENT) +#define USER_PGTBL (KERNEL_PGTBL) + +static inline pte_t pte_mkpte(u64 phys, pgprot_t prot) +{ + 
return __pte(((phys >> PAGE_SHIFT) << 10) | pgprot_val(prot)); +} + +static inline pmd_t pmd_mkpmd(u64 phys, pgprot_t prot) +{ + return __pmd(((phys >> PAGE_SHIFT) << 10) | pgprot_val(prot)); +} + +static inline pud_t pud_mkpud(u64 phys, pgprot_t prot) +{ + return __pud(((phys >> PAGE_SHIFT) << 10) | pgprot_val(prot)); +} + +static inline p4d_t p4d_mkp4d(u64 phys, pgprot_t prot) +{ + return __p4d(((phys >> PAGE_SHIFT) << 10) | pgprot_val(prot)); +} + +static inline pgd_t pgd_mkpgd(u64 phys, pgprot_t prot) +{ + return __pgd(((phys >> PAGE_SHIFT) << 10) | pgprot_val(prot)); +} + +static inline bool pte_special(pte_t pte) +{ + return pte_val(pte) & _PAGE_SPECIAL; +} + +static inline bool pte_accessed(pte_t pte) +{ + return pte_val(pte) & _PAGE_ACCESSED; +} + +static inline bool pte_user(pte_t pte) +{ + return pte_val(pte) & _PAGE_USER; +} + +static inline bool pte_write(pte_t pte) +{ + return pte_val(pte) & _PAGE_WRITE; +} + +static inline bool pte_exec(pte_t pte) +{ + return pte_val(pte) & _PAGE_EXEC; +} + +static inline bool pte_dirty(pte_t pte) +{ + return pte_val(pte) & _PAGE_DIRTY; +} + +static inline bool pte_global(pte_t pte) +{ + return pte_val(pte) & _PAGE_GLOBAL; +} + +static void set_pgd(pgd_t *pgd, pgd_t val) +{ + WRITE_ONCE(pgd_val(*pgd), pgd_val(val)); +} +#define set_pgd set_pgd + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_ACCESSED); +} + +/* PML4-level hugepages not supported on x86, for now... */ +#define ARCH_HUGE_PUD_SUPPORT 1 +#define ARCH_HUGE_PMD_SUPPORT 1 + +static inline bool pud_huge(pud_t pud) +{ + return pud_val(pud) & _PAGE_HUGE; +} + +static inline bool pmd_huge(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_HUGE; +} + +static inline bool pud_user(pud_t pud) +{ + return pud_val(pud) & _PAGE_USER; +} + +static inline bool pud_write(pud_t pud) +{ + return pud_val(pud) & _PAGE_WRITE; +} + +static inline bool pud_exec(pud_t pud) +{ + return pud_val(pud) & _PAGE_EXEC; +} + +static inline bool pud_dirty(pud_t pud) +{ + return pud_val(pud) & _PAGE_DIRTY; +} + +static inline bool pud_accessed(pud_t pud) +{ + return pud_val(pud) & _PAGE_ACCESSED; +} + +static inline bool pud_global(pud_t pud) +{ + return pud_val(pud) & _PAGE_GLOBAL; +} + +static inline bool pmd_user(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_USER; +} + +static inline bool pmd_write(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_WRITE; +} + +static inline bool pmd_exec(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_EXEC; +} + +static inline bool pmd_dirty(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_DIRTY; +} + +static inline bool pmd_accessed(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_ACCESSED; +} + +static inline bool pmd_global(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_GLOBAL; +} + +#define pud_folded() (0) +#define pmd_folded() (0) + +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_WRITE); +} + +static inline pgprot_t calc_pgprot(u64 phys, u64 prots) +{ + bool special_mapping = phys == (u64) page_to_phys(vm_get_zero_page()); + pgprotval_t page_prots = (prots & VM_EXEC ? _PAGE_EXEC : 0) | + (prots & VM_WRITE ? _PAGE_WRITE : 0) | + (prots & (VM_READ | VM_WRITE) ? _PAGE_READ : 0) | + (prots & VM_USER ? _PAGE_USER : _PAGE_GLOBAL) | _PAGE_PRESENT | + (special_mapping ? 
_PAGE_SPECIAL : 0); + + if (!(prots & (VM_READ | VM_WRITE | VM_EXEC))) + page_prots &= ~_PAGE_PRESENT; + + return __pgprot(page_prots); +} + +#include + +__END_CDECLS + +#endif diff --git a/kernel/kernel/mm/pgtable-arch.h b/kernel/include/onyx/x86/include/platform/pgtable.h similarity index 82% rename from kernel/kernel/mm/pgtable-arch.h rename to kernel/include/onyx/x86/include/platform/pgtable.h index 60fe1616b..393026ba8 100644 --- a/kernel/kernel/mm/pgtable-arch.h +++ b/kernel/include/onyx/x86/include/platform/pgtable.h @@ -1,3 +1,10 @@ +/* + * Copyright (c) 2024 Pedro Falcato + * This file is part of Onyx, and is released under the terms of the MIT License + * check LICENSE at the root directory for more information + * + * SPDX-License-Identifier: MIT + */ #ifndef _ONYX_PGTABLE_ARCH_H #define _ONYX_PGTABLE_ARCH_H @@ -6,6 +13,8 @@ #include #include #include +#include +#include __BEGIN_CDECLS @@ -82,11 +91,6 @@ extern int pgd_shift, p4d_ptrs; #define PTRS_PER_PTE 512 #define PTE_SHIFT 12 -extern unsigned long __x86_phys_base; -extern unsigned long __x86_phys_base_limit; -#define PHYS_BASE __x86_phys_base -#define PHYS_BASE_LIMIT __x86_phys_base_limit - #define __tovirt(x) (void *) (((uintptr_t) (x)) + PHYS_BASE) static inline bool pml5_present(void) @@ -247,6 +251,26 @@ static inline pte_t pte_mkpte(u64 phys, pgprot_t prot) return __pte(phys | pgprot_val(prot)); } +static inline pmd_t pmd_mkpmd(u64 phys, pgprot_t prot) +{ + return __pmd(phys | pgprot_val(prot)); +} + +static inline pud_t pud_mkpud(u64 phys, pgprot_t prot) +{ + return __pud(phys | pgprot_val(prot)); +} + +static inline p4d_t p4d_mkp4d(u64 phys, pgprot_t prot) +{ + return __p4d(phys | pgprot_val(prot)); +} + +static inline pgd_t pgd_mkpgd(u64 phys, pgprot_t prot) +{ + return __pgd(phys | pgprot_val(prot)); +} + static inline bool pte_special(pte_t pte) { return pte_val(pte) & _PAGE_SPECIAL; @@ -380,6 +404,26 @@ static inline pte_t pte_wrprotect(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_WRITE); } +#define X86_CACHING_BITS(index) ((((index) & 0x3) << 3) | (((index >> 2) & 1) << 7)) + +static inline pgprot_t calc_pgprot(u64 phys, u64 prot) +{ + bool user = prot & VM_USER; + bool noexec = !(prot & VM_EXEC); + bool global = !user; + bool write = prot & VM_WRITE; + bool readable = prot & (VM_READ | VM_WRITE) || !noexec; + unsigned int cache_type = vm_prot_to_cache_type(prot); + uint8_t caching_bits = cache_to_paging_bits(cache_type); + bool special_mapping = phys == (u64) page_to_phys(vm_get_zero_page()); + + pgprotval_t page_prots = (noexec ? _PAGE_NX : 0) | (global ? _PAGE_GLOBAL : 0) | + (user ? _PAGE_USER : 0) | (write ? _PAGE_WRITE : 0) | + X86_CACHING_BITS(caching_bits) | (readable ? _PAGE_PRESENT : 0) | + (special_mapping ? 
_PAGE_SPECIAL : 0); + return __pgprot(page_prots); +} + __END_CDECLS #endif diff --git a/kernel/kernel/mm/Makefile b/kernel/kernel/mm/Makefile index 65cd609a8..02bad42a3 100644 --- a/kernel/kernel/mm/Makefile +++ b/kernel/kernel/mm/Makefile @@ -1,6 +1,7 @@ mm-y:= bootmem.o page.o pagealloc.o vm_object.o vm.o vmalloc.o reclaim.o amap.o anon.o mincore.o page_lru.o mm-$(CONFIG_KUNIT)+= vm_tests.o mm-$(CONFIG_X86)+= memory.o +mm-$(CONFIG_RISCV)+= memory.o ifeq ($(CONFIG_KASAN), y) obj-y_NOKASAN+= kernel/mm/asan/asan.o kernel/mm/asan/quarantine.o diff --git a/kernel/kernel/mm/memory.c b/kernel/kernel/mm/memory.c index 44ab7472d..d65e6ee4c 100644 --- a/kernel/kernel/mm/memory.c +++ b/kernel/kernel/mm/memory.c @@ -7,7 +7,6 @@ */ #include #include -#include #include "pgtable.h" @@ -28,7 +27,7 @@ p4d_t *p4d_alloc(pgd_t *pgd, unsigned long addr, struct mm_address_space *mm) p4d_t *p4d = __p4d_alloc(mm); if (!p4d) return NULL; - set_pgd(pgd, __pgd(perms | (unsigned long) p4d)); + set_pgd(pgd, pgd_mkpgd((unsigned long) p4d, __pgprot(perms))); return (p4d_t *) __tovirt(p4d) + p4d_index(addr); } @@ -56,7 +55,7 @@ pud_t *pud_alloc(p4d_t *p4d, unsigned long addr, struct mm_address_space *mm) pud_t *pud = __pud_alloc(mm); if (!pud) return NULL; - set_p4d(p4d, __p4d(perms | (unsigned long) pud)); + set_p4d(p4d, p4d_mkp4d((unsigned long) pud, __pgprot(perms))); return (pud_t *) __tovirt(pud) + pud_index(addr); } @@ -84,7 +83,7 @@ pmd_t *pmd_alloc(pud_t *pud, unsigned long addr, struct mm_address_space *mm) pmd_t *pmd = __pmd_alloc(mm); if (!pmd) return NULL; - set_pud(pud, __pud(perms | (unsigned long) pmd)); + set_pud(pud, pud_mkpud((unsigned long) pmd, __pgprot(perms))); return (pmd_t *) __tovirt(pmd) + pmd_index(addr); } @@ -112,7 +111,7 @@ pte_t *pte_alloc(pmd_t *pmd, unsigned long addr, struct mm_address_space *mm) pte_t *pte = __pte_alloc(mm); if (!pte) return NULL; - set_pmd(pmd, __pmd(perms | (unsigned long) pte)); + set_pmd(pmd, pmd_mkpmd((unsigned long) pte, __pgprot(perms))); return (pte_t *) __tovirt(pte) + pte_index(addr); } @@ -123,26 +122,6 @@ static pte_t *pte_get_or_alloc(pmd_t *pmd, unsigned long addr, struct mm_address return pte_alloc(pmd, addr, mm); } -#define X86_CACHING_BITS(index) ((((index) &0x3) << 3) | (((index >> 2) & 1) << 7)) - -static pgprot_t calc_pgprot(u64 phys, u64 prot) -{ - bool user = prot & VM_USER; - bool noexec = !(prot & VM_EXEC); - bool global = !user; - bool write = prot & VM_WRITE; - bool readable = prot & (VM_READ | VM_WRITE) || !noexec; - unsigned int cache_type = vm_prot_to_cache_type(prot); - uint8_t caching_bits = cache_to_paging_bits(cache_type); - bool special_mapping = phys == (u64) page_to_phys(vm_get_zero_page()); - - pgprotval_t page_prots = (noexec ? _PAGE_NX : 0) | (global ? _PAGE_GLOBAL : 0) | - (user ? _PAGE_USER : 0) | (write ? _PAGE_WRITE : 0) | - X86_CACHING_BITS(caching_bits) | (readable ? _PAGE_PRESENT : 0) | - (special_mapping ? _PAGE_SPECIAL : 0); - return __pgprot(page_prots); -} - /** * @brief Directly maps a page into the paging tables. 
* diff --git a/kernel/kernel/mm/pgtable.h b/kernel/kernel/mm/pgtable.h index a44d35410..a06732a32 100644 --- a/kernel/kernel/mm/pgtable.h +++ b/kernel/kernel/mm/pgtable.h @@ -1,7 +1,7 @@ #ifndef _ONYX_PGTABLE_H #define _ONYX_PGTABLE_H -#include "pgtable-arch.h" +#include #ifndef set_pgd static void set_pgd(pgd_t *pgd, pgd_t val) diff --git a/kernel/kernel/mm/vm.cpp b/kernel/kernel/mm/vm.cpp index 3a19347bb..4111f2243 100644 --- a/kernel/kernel/mm/vm.cpp +++ b/kernel/kernel/mm/vm.cpp @@ -1198,7 +1198,7 @@ static void vm_mprotect_handle_prot(struct vm_area_struct *region, int *pprot) } } -#ifndef CONFIG_X86 +#if !defined(CONFIG_X86) && !defined(CONFIG_RISCV) /* TODO: Remove once all architectures have been moved to the new shared page table code */ void vm_do_mmu_mprotect(struct mm_address_space *as, void *address, size_t nr_pgs, int old_prots, int new_prots)
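
A note on how the pieces above fit together: with the p4d level folded (pgtable-nop4d.h) and the accessors from the new riscv <platform/pgtable.h>, address resolution becomes the same Linux-style pgd -> p4d -> pud -> pmd -> pte walk that the shared kernel/mm/memory.c code performs. The sketch below is illustrative only and assumes the accessor signatures added in this patch; lookup_pte() and its is_huge out-parameter are hypothetical and not part of the change, and locking/TLB concerns are ignored.

/* Hypothetical lookup helper (not in this patch): resolve a virtual address
 * to its PTE using only the accessors defined by the new riscv
 * <platform/pgtable.h>. Returns NULL if the address is not mapped at 4K
 * granularity; huge mappings are reported via *is_huge. */
static pte_t *lookup_pte(struct mm_address_space *mm, unsigned long addr, bool *is_huge)
{
    *is_huge = false;

    pgd_t *pgd = pgd_offset(mm, addr);
    if (pgd_none(*pgd) || !pgd_present(*pgd))   /* constant-folded on riscv: p4d is folded */
        return NULL;

    p4d_t *p4d = p4d_offset(pgd, addr);         /* folded level: simply returns pgd */
    if (p4d_none(*p4d) || !p4d_present(*p4d))
        return NULL;

    pud_t *pud = pud_offset(p4d, addr);
    if (pud_none(*pud) || !pud_present(*pud))
        return NULL;
    if (pud_huge(*pud))                         /* any of R/W/X set on a non-leaf level => leaf */
    {
        *is_huge = true;
        return (pte_t *) pud;
    }

    pmd_t *pmd = pmd_offset(pud, addr);
    if (pmd_none(*pmd) || !pmd_present(*pmd))
        return NULL;
    if (pmd_huge(*pmd))
    {
        *is_huge = true;
        return (pte_t *) pmd;
    }

    pte_t *pte = pte_offset(pmd, addr);
    return pte_none(*pte) ? NULL : pte;
}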
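
The riscv calc_pgprot() added here centralizes the VM_* to PTE-bit translation that vm_prots_to_mmu() performed in mmu.cpp, including the zero-page handling described next to _PAGE_SPECIAL and the PROT_NONE case (no R/W/X requested leaves the entry non-present). A small illustration follows; pgprot_examples() is hypothetical, and some_page is assumed to be an ordinary page rather than the shared zero page.

/* Hypothetical illustration of calc_pgprot()/pte_mkpte() from the new riscv
 * <platform/pgtable.h>; not code from the patch, and TLB/accounting concerns
 * are ignored. */
static void pgprot_examples(struct page *some_page)
{
    /* Readable+writable user page: READ, WRITE, USER and PRESENT are set. */
    u64 phys = (u64) page_to_phys(some_page);
    pte_t pte = pte_mkpte(phys, calc_pgprot(phys, VM_READ | VM_WRITE | VM_USER));
    assert(pte_present(pte) && pte_write(pte) && pte_user(pte) && !pte_special(pte));

    /* Zero-page mapping: calc_pgprot() recognizes the physical address and adds
     * _PAGE_SPECIAL, so the shared zero page's mapcount is never touched. */
    u64 zero_phys = (u64) page_to_phys(vm_get_zero_page());
    pte_t zpte = pte_mkpte(zero_phys, calc_pgprot(zero_phys, VM_READ | VM_USER));
    assert(pte_special(zpte) && !pte_write(zpte));

    /* PROT_NONE-style mapping: no R/W/X requested, so PRESENT is stripped. */
    pte_t none = pte_mkpte(phys, calc_pgprot(phys, VM_USER));
    assert(!pte_present(none));
}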
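
For write protection, the removed riscv_mmu_fork() and paging_write_protect() cleared RISCV_MMU_WRITE directly on the raw entry; with the new header the per-PTE equivalent is pte_wrprotect(). A minimal sketch under the same assumptions as above: wrprotect_pte() is hypothetical, and real callers still need the TLB invalidation that the old tlb_invalidation_tracker handled when permissions are downgraded.

/* Hypothetical COW-style write protection of a single PTE, mirroring what the
 * removed riscv_mmu_fork() did with RISCV_MMU_WRITE. */
static bool wrprotect_pte(pte_t *ptep)
{
    pte_t old = *ptep;
    if (!pte_present(old) || !pte_write(old))
        return false;               /* nothing to downgrade */
    *ptep = pte_wrprotect(old);     /* clears _PAGE_WRITE, keeps the address and other bits */
    return true;                    /* caller must flush the TLB for this address */
}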