/* Pagetable-based Virtual Machine hypervisor */

#include <kernel.h>
#include "pvm.h"
| 5 | + |
| 6 | +#define PVM_CPUID_SIGNATURE 0x40000000 |
| 7 | +#define PVM_CPUID_VENDOR_FEATURES 0x40000002 |
| 8 | + |
| 9 | +#define KVM_SIGNATURE "KVMKVMKVM\0\0\0" |
| 10 | +#define PVM_SIGNATURE 0x4d5650 /* "PVM\0" */ |
| 11 | + |
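/* PVM paravirtual MSRs; the 0x4b564dxx index range is the custom MSR space used
 * by KVM ("KVM" is 0x4b564d in ASCII). */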
#define MSR_PVM_LINEAR_ADDRESS_RANGE 0x4b564df0
#define MSR_PVM_VCPU_STRUCT 0x4b564df1
#define MSR_PVM_SUPERVISOR_RSP 0x4b564df2
#define MSR_PVM_EVENT_ENTRY 0x4b564df4
#define MSR_PVM_RETU_RIP 0x4b564df5
#define MSR_PVM_RETS_RIP 0x4b564df6
#define MSR_PVM_SWITCH_CR3 0x4b564df7

boolean pvm_detected;

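/* Detection heuristic: a PVM guest kernel starts at CPL 3 with interrupts
 * enabled, sees KVM's CPUID signature in the hypervisor signature leaf, and
 * reports the PVM signature in the vendor/features leaf. */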
boolean pvm_detect(void)
{
    u64 flags = read_flags();
    if (!(flags & U64_FROM_BIT(EFLAG_INTERRUPT)))
        return false;
    u64 cs;
    asm volatile("mov %%cs,%0" : "=r" (cs) : );
    if ((cs & 0x3) != 3)    /* check if CPL == 3 */
        return false;
    u32 v[4];
    pvm_cpuid(PVM_CPUID_SIGNATURE, 0, v);
    if ((v[0] < PVM_CPUID_VENDOR_FEATURES) || runtime_memcmp(&v[1], KVM_SIGNATURE, 3 * sizeof(u32)))
        return false;
    pvm_cpuid(PVM_CPUID_VENDOR_FEATURES, 0, v);
    return pvm_detected = (v[1] == PVM_SIGNATURE);
}

range pvm_get_addr_range(void)
{
    u64 addr_range = read_msr(MSR_PVM_LINEAR_ADDRESS_RANGE);
    u64 pml4_index_start = addr_range & 0x1ff;
    u64 pml4_index_end = (addr_range >> 16) & 0x1ff;
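    /* Each 9-bit value is a PML4 index in the upper (kernel) half of the address
     * space: OR'ing it into 0x1fffe00 and shifting left by PT_SHIFT_L1 yields the
     * canonical (sign-extended) virtual address covered by that PML4 entry. */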
    return irange((0x1fffe00 | pml4_index_start) << PT_SHIFT_L1,
                  (0x1fffe00 | pml4_index_end) << PT_SHIFT_L1);
}

closure_func_basic(thunk, void, pvm_cpu_init)
{
    /* the PVM vCPU struct must be page-aligned */
    struct pvm_vcpu *pvm = allocate_zero((heap)heap_page_backed(get_kernel_heaps()), PAGESIZE);
    assert(pvm != INVALID_ADDRESS);

    /* PVM requires user-mode segment selectors to have the same values as used on Linux */
    pvm->user_ss = 0x2b;
    pvm->user_cs = 0x33;

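    /* share the location of the vCPU struct with the hypervisor */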
    write_msr(MSR_PVM_VCPU_STRUCT, physical_from_virtual(pvm));
    cpuinfo ci = current_cpu();
    ci->m.pvm = pvm;
    write_msr(KERNEL_GS_MSR, u64_from_pointer(ci));
    u64 cr3;
    mov_from_cr("cr3", cr3);
    /* In order for the direct switching feature to be enabled (i.e. to switch between user mode and
     * supervisor mode without a VM exit), PVM requires CR3 values for the two modes to be different
     * from each other; since Nanos uses a single CR3 value, flip one bit between the user CR3 and
     * the supervisor CR3 to make them appear as different values (a CR3 value must be page-aligned,
     * so the flipped bit will not cause a different page table root to be used). */
    write_msr(MSR_PVM_SWITCH_CR3, cr3 | 1);
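    /* register the event entry point and the user-mode/supervisor-mode event
     * return trampolines (presumably assembly stubs) with the hypervisor */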
    extern void *pvm_event_entry;
    write_msr(MSR_PVM_EVENT_ENTRY, u64_from_pointer(&pvm_event_entry));
    extern void *pvm_retu, *pvm_rets;
    write_msr(MSR_PVM_RETU_RIP, u64_from_pointer(&pvm_retu));
    write_msr(MSR_PVM_RETS_RIP, u64_from_pointer(&pvm_rets));
    /* Configure initial stack for PVM events in user mode: the actual stack (interrupt vs
     * exception) is selected in the event handler. */
    write_msr(MSR_PVM_SUPERVISOR_RSP, u64_from_pointer(ci->m.int_stack));
}

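/* Runs the per-CPU PVM initialization on the boot CPU, then registers it to be
 * run on each secondary CPU as it is brought online. */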
void pvm_setup(kernel_heaps kh)
{
    thunk t = closure_func(heap_general(kh), thunk, pvm_cpu_init);
    assert(t != INVALID_ADDRESS);
    apply(t);
    register_percpu_init(t);
}

void pvm_cpuid(u32 leaf, u32 subleaf, u32 *v)
{
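    /* The byte sequence below decodes to "invlpg 0xffffffffff4d5650; cpuid": an
     * invlpg on a magic address containing the "PVM" bytes, immediately followed
     * by cpuid. Since invlpg is privileged and the kernel runs at CPL 3, this
     * presumably faults into the hypervisor, which emulates the cpuid. */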
    asm volatile(".byte 0x0f,0x01,0x3c,0x25,0x50,0x56,0x4d,0xff,0x0f,0xa2" :
                 "=a" (v[0]), "=b" (v[1]), "=c" (v[2]), "=d" (v[3]) : "0" (leaf), "2" (subleaf));
}

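/* Common entry for PVM events: if save_frame is set, the register state saved by
 * the hypervisor in the shared vCPU struct has not yet been copied into the
 * current context frame, so it is transferred here before dispatching to the
 * common handler on the appropriate stack. */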
void pvm_event(boolean save_frame)
{
    cpuinfo ci = current_cpu();
    struct pvm_vcpu *pvm = ci->m.pvm;
    context_frame f = get_current_context(ci)->frame;
    if (save_frame) {
        f[FRAME_RIP] = pvm->rip;
        f[FRAME_RSP] = pvm->rsp;
        f[FRAME_EFLAGS] = pvm->eflags;
        f[FRAME_RCX] = pvm->rcx;
        f[FRAME_R11] = pvm->r11;
        f[FRAME_VECTOR] = pvm->event_vector;
        f[FRAME_ERROR_CODE] = pvm->event_errcode;
    }
    u32 vector = f[FRAME_VECTOR];
    if (vector == 14)   /* page fault */
        f[FRAME_CR2] = pvm->cr2;
    void *stack = (vector < INTERRUPT_VECTOR_START) ? ci->m.exception_stack : ci->m.int_stack;
    switch_stack(stack, common_handler);
}

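/* Syscall entry under PVM: the hypervisor delivers syscalls as events, leaving
 * the user-mode RIP, RSP, RFLAGS, RCX and R11 in the shared vCPU struct; copy
 * them into the user context frame, invoke the generic syscall handler, then
 * return to user mode via the saved frame. */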
void pvm_syscall(context user_ctx)
{
    extern void (*syscall)(context ctx);
    cpuinfo ci = current_cpu();
    struct pvm_vcpu *pvm = ci->m.pvm;
    context_frame f = user_ctx->frame;
    f[FRAME_RIP] = pvm->rip;
    f[FRAME_RSP] = pvm->rsp;
    f[FRAME_EFLAGS] = pvm->eflags;
    f[FRAME_RCX] = pvm->rcx;
    f[FRAME_R11] = pvm->r11;
    syscall(user_ctx);
    pvm_frame_return(f);
}

void __attribute__((noreturn)) pvm_frame_return(context_frame f)
{
    if (f[FRAME_CS] & 0x3) {    /* (CPL != 0) means return to user mode */
        cpuinfo ci = current_cpu();
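        /* publish the user-visible register state back to the shared vCPU struct
         * so the hypervisor can restore it on the switch to user mode */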
        struct pvm_vcpu *pvm = ci->m.pvm;
        pvm->rip = f[FRAME_RIP];
        pvm->rsp = f[FRAME_RSP];
        pvm->eflags = f[FRAME_EFLAGS];
        pvm->rcx = f[FRAME_RCX];
        pvm->r11 = f[FRAME_R11];
    }
    pvm_event_return(f);
}