The kernel is compiled with the following Rust target:
Code: Select all
{
"arch": "x86_64",
"code-model": "kernel",
"cpu": "x86-64",
"data-layout": "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
"disable-redzone": true,
"eh-frame-header": true,
"exe-suffix": ".elf",
"executables": true,
"features": "-mmx,-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-3dnow,-3dnowa,-avx,-avx2,+soft-float",
"linker": "rust-lld",
"linker-flavor": "ld.lld",
"llvm-target": "x86_64-unknown-none-elf",
"max-atomic-width": 64,
"os": "none",
"panic-strategy": "unwind",
"position-independent-executables": true,
"relro-level": "full",
"target-c-int-width": "32",
"target-endian": "little",
"target-pointer-width": "64"
}
I have also tried to implement the way Linux does virtual memory mapping, according to the following table:
Code: Select all
========================================================================================================================
Start addr | Offset | End addr | Size | VM area description
========================================================================================================================
| | | |
0000000000000000 | 0 | 00007fffffffffff | 128 TB | user-space virtual memory, different per mm
__________________|____________|__________________|_________|___________________________________________________________
| | | |
0000800000000000 | +128 TB | ffff7fffffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical
| | | | virtual memory addresses up to the -128 TB
| | | | starting offset of kernel mappings.
__________________|____________|__________________|_________|___________________________________________________________
|
| Kernel-space virtual memory, shared between all processes:
____________________________________________________________|___________________________________________________________
| | | |
ffff800000000000 | -128 TB | ffff87ffffffffff | 8 TB | ... guard hole, also reserved for hypervisor
ffff880000000000 | -120 TB | ffff887fffffffff | 0.5 TB | LDT remap for PTI
ffff888000000000 | -119.5 TB | ffffc87fffffffff | 64 TB | direct mapping of all physical memory (page_offset_base)
ffffc88000000000 | -55.5 TB | ffffc8ffffffffff | 0.5 TB | ... unused hole
ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB | vmalloc/ioremap space (vmalloc_base)
ffffe90000000000 | -23 TB | ffffe9ffffffffff | 1 TB | ... unused hole
ffffea0000000000 | -22 TB | ffffeaffffffffff | 1 TB | virtual memory map (vmemmap_base)
ffffeb0000000000 | -21 TB | ffffebffffffffff | 1 TB | ... unused hole
ffffec0000000000 | -20 TB | fffffbffffffffff | 16 TB | KASAN shadow memory
__________________|____________|__________________|_________|____________________________________________________________
|
| Identical layout to the 56-bit one from here on:
____________________________________________________________|____________________________________________________________
| | | |
fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole
| | | | vaddr_end for KASLR
fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping
fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole
ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks
ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole
ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0
ffffffff80000000 |-2048 MB | | |
ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space
ffffffffff000000 | -16 MB | | |
FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI
ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole
__________________|____________|__________________|_________|___________________________________________________________
Code: Select all
/// Builds the kernel's root page table and maps the first 64 TiB of physical
/// memory at the Linux-style direct-map base (`0xffff_8880_0000_0000`) using
/// 1 GiB huge pages.
///
/// Returns the physical frame that holds the top-level table together with an
/// [`OffsetPageTable`] view of it. The returned table is NOT loaded into CR3
/// by this function.
pub fn create_kernel_page_table(
    allocator: &mut impl FrameAllocator<Size4KiB>,
) -> (PhysFrame, OffsetPageTable<'static>) {
    let frame = allocator
        .allocate_frame()
        .expect("no frame available for the root page table");
    // While UEFI boot services are active, memory is identity-mapped, so a
    // zero physical-memory offset lets us reach any frame by its physical
    // address.
    let phys_offset = VirtAddr::zero();
    let mut page_table = unsafe {
        let table_addr = frame.start_address().as_u64() + phys_offset.as_u64();
        // SAFETY: the frame was just handed to us exclusively. Zero it before
        // use — UEFI does not guarantee that freshly allocated pages are
        // zeroed, and stale bytes would be decoded as (garbage) present
        // page-table entries.
        core::ptr::write_bytes(table_addr as *mut u8, 0, 4096);
        let inner = &mut *(table_addr as *mut _);
        OffsetPageTable::new(inner, phys_offset)
    };
    // Map the first 64 TiB of physical memory into the kernel half at the
    // direct-map base:
    //   VA 0xffff888000000000 + x  ->  PA x
    {
        let offset = VirtAddr::new(0xffff888000000000);
        let start = PhysFrame::containing_address(PhysAddr::zero());
        // Use the LAST byte inside the window. `n_tib_bytes(64)` itself is the
        // first byte past the window; with `range_inclusive` that would map
        // one extra 1 GiB page.
        let end = PhysFrame::containing_address(PhysAddr::zero() + (n_tib_bytes(64) - 1));
        for frame in PhysFrame::<Size1GiB>::range_inclusive(start, end) {
            let page = Page::from_start_address(offset + frame.start_address().as_u64())
                .expect("direct-map base and frames are 1 GiB aligned");
            unsafe {
                // SAFETY: we are populating a table that is not yet active,
                // mapping kernel-half pages nothing else references yet.
                page_table
                    .map_to(
                        page,
                        frame,
                        PageTableFlags::PRESENT
                            | PageTableFlags::WRITABLE
                            | PageTableFlags::HUGE_PAGE,
                        allocator,
                    )
                    .expect("failed to map page")
                    // This table is not loaded into CR3 yet, so there is no
                    // TLB entry to invalidate — flushing would only disturb
                    // the currently active address space.
                    .ignore();
            }
        }
    }
    (frame, page_table)
}
Code: Select all
// UEFI bootloader entry point: builds the kernel page table, loads and
// relocates the kernel ELF, snapshots the memory map, exits boot services,
// switches CR3, and jumps to the kernel with a `BootData` handoff structure.
#[entry]
unsafe fn main(handle: Handle, mut st: SystemTable<Boot>) -> Status {
uefi_services::init(&mut st).expect("unable to initialize UEFI services");
log::set_max_level(LevelFilter::Trace);
st.stdout().clear().unwrap();
let bs = st.boot_services();
// Frame allocator backed by UEFI boot services — only usable until
// `exit_boot_services` below.
let mut alloc = BootFrameAllocator::new(bs);
let (page_table_frame, mut page_table) = frame_allocator::create_kernel_page_table(&mut alloc);
trace!("created kernel page table at {:?}", page_table_frame);
// Leak the loaded ELF so the reference outlives boot services ('static).
let kernel_elf = unsafe { NonNull::from(load_kernel(bs)).as_ref() };
let mut loader = KernelLoader::new(kernel_elf, &mut page_table, &mut alloc);
loader.relocate();
// NOTE(review): `args` is consumed here but never defined in this snippet —
// presumably bound earlier in the real file; verify against the full source.
let args = args
.map(|args| args.into_boxed_str())
.map(|args| Box::leak(args) as &'static str);
let sizes = bs.memory_map_size();
// Over-allocate by two entries: fetching the map can itself allocate and
// grow the map between the size query and the `exit_boot_services` call.
let mut mmap_storage = vec![0; sizes.map_size + 2 * sizes.entry_size].leak();
let mut memory_descriptors: Vec<&'static MemoryDescriptor> =
Vec::with_capacity(sizes.map_size / sizes.entry_size);
// SAFETY-NOTE(review): transmuting the ELF entry address to a sysv64 fn
// pointer assumes the kernel really uses that ABI — confirm in the kernel.
let entry_point: extern "sysv64" fn(&BootData) = unsafe { mem::transmute(loader.entry()) };
trace!("entry_point = {:p}", entry_point as *const u8);
st.stdout().clear().unwrap();
info!("bon voyage!");
// Point of no return: after this, boot services (allocator, stdout, logger)
// are gone, so everything the kernel needs must already be allocated/leaked.
let (system_table, mmap_info) = st.exit_boot_services(handle, mmap_storage).unwrap();
mmap_info.collect_into(&mut memory_descriptors);
// Switch CR3 to the new kernel table. The currently executing bootloader
// code must still be mapped in that table or this faults immediately.
load_page_table_pointer(page_table_frame);
entry_point(&BootData {
magic: yakern_types::boot::MAGIC.into(),
kernel_elf_addr: kernel_elf.into(),
boot_page_table: page_table_frame.start_address().as_u64() as usize,
kind: BootDataKind::Uefi(BootDataUefi {
system_table,
memory_descriptors: (memory_descriptors.leak() as &'static [_]).into(),
}),
});
// The kernel never returns; halt forever if it somehow does.
arch::drop_dead()
}
Code: Select all
// Load the new root table into CR3. NOTE(review): execution continues at the
// bootloader's current (low, UEFI-identity-mapped) addresses, so those pages
// must also be present in the new table — the motivation for the identity map.
load_page_table_pointer(page_table_frame);
// Jump to the kernel, handing over everything it needs as 'static data.
entry_point(&BootData {
magic: yakern_types::boot::MAGIC.into(),
kernel_elf_addr: kernel_elf.into(),
boot_page_table: page_table_frame.start_address().as_u64() as usize,
kind: BootDataKind::Uefi(BootDataUefi {
system_table,
memory_descriptors: (memory_descriptors.leak() as &'static [_]).into(),
}),
});
That's why I added a blanket identity map hack on load_page_table_pointer:
Code: Select all
/// Builds the kernel's root page table with (a) a blanket 0–127 TiB identity
/// map (a stopgap so the bootloader keeps running after the CR3 switch) and
/// (b) the first 64 TiB of physical memory mapped at the Linux-style
/// direct-map base (`0xffff_8880_0000_0000`), both as 1 GiB huge pages.
///
/// Returns the physical frame holding the top-level table together with an
/// [`OffsetPageTable`] view of it. The table is NOT loaded into CR3 here.
fn create_kernel_page_table(
    allocator: &mut impl FrameAllocator<Size4KiB>,
) -> (PhysFrame, OffsetPageTable<'static>) {
    let frame = allocator
        .allocate_frame()
        .expect("no frame available for the root page table");
    // While UEFI boot services are active, memory is identity-mapped, so a
    // zero physical-memory offset lets us reach any frame by physical address.
    let phys_offset = VirtAddr::zero();
    let mut page_table = unsafe {
        let table_addr = frame.start_address().as_u64() + phys_offset.as_u64();
        // SAFETY: the frame is exclusively ours. Zero it before use — UEFI
        // does not guarantee zeroed pages, and stale bytes would be decoded
        // as (garbage) present page-table entries.
        core::ptr::write_bytes(table_addr as *mut u8, 0, 4096);
        let inner = &mut *(table_addr as *mut _);
        OffsetPageTable::new(inner, phys_offset)
    };
    // Identity-map 0–127 TiB so the bootloader (running at UEFI's low,
    // identity-mapped addresses) survives the CR3 switch. This is a stopgap:
    // it aliases the user half and should eventually be replaced by mapping
    // only the regions the UEFI memory map says are in use.
    {
        let start = PhysFrame::containing_address(PhysAddr::zero());
        // Last byte INSIDE the window — `n_tib_bytes(127)` is the first byte
        // past it and, with `range_inclusive`, would map one extra 1 GiB page.
        let end = PhysFrame::containing_address(PhysAddr::zero() + (n_tib_bytes(127) - 1));
        for frame in PhysFrame::<Size1GiB>::range_inclusive(start, end) {
            unsafe {
                // SAFETY: populating an inactive table with fresh mappings.
                page_table
                    .identity_map(
                        frame,
                        PageTableFlags::PRESENT
                            | PageTableFlags::WRITABLE
                            | PageTableFlags::HUGE_PAGE,
                        allocator,
                    )
                    .expect("failed to map page")
                    .ignore();
            }
        }
    }
    // Map the first 64 TiB of physical memory into the kernel half at the
    // direct-map base:
    //   VA 0xffff888000000000 + x  ->  PA x
    {
        let offset = VirtAddr::new(0xffff888000000000);
        let start = PhysFrame::containing_address(PhysAddr::zero());
        // Same inclusive-end correction as the identity map above.
        let end = PhysFrame::containing_address(PhysAddr::zero() + (n_tib_bytes(64) - 1));
        for frame in PhysFrame::<Size1GiB>::range_inclusive(start, end) {
            let page = Page::from_start_address(offset + frame.start_address().as_u64())
                .expect("direct-map base and frames are 1 GiB aligned");
            unsafe {
                // SAFETY: populating an inactive table with fresh mappings.
                page_table
                    .map_to(
                        page,
                        frame,
                        PageTableFlags::PRESENT
                            | PageTableFlags::WRITABLE
                            | PageTableFlags::HUGE_PAGE,
                        allocator,
                    )
                    .expect("failed to map page")
                    // The table is not in CR3 yet — nothing to flush; this
                    // also matches the identity-map loop above.
                    .ignore();
            }
        }
    }
    (frame, page_table)
}
Not only does this cap the kernel at supporting 127 TiB of physical memory, but it also seems like the right approach is to map the original UEFI memory map into my kernel page table instead — that should suffice, without this blanket identity-mapping overkill that could also hurt TLB performance.
What is the standard way of handling these situations in UEFI? More specifically, what is the proper way of performing a kernel page-table transition when the original UEFI memory map is still needed to some extent? Ideally, I would like to keep that original map as immutable as possible.