Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 478b882

Browse files
author
Francesco Lavra
committed
x86: add PVM (Pagetable-based Virtual Machine) hypervisor support
This hypervisor runs the guest code natively without needing hardware VM acceleration. https://lwn.net/Articles/963718/ The code of the patched Linux kernel that implements PVM is at https://github.com/virt-pvm/linux, and the hypervisor specification is at https://github.com/virt-pvm/linux/blob/pvm/Documentation/virt/kvm/x86/pvm-spec.rst.
1 parent 8909822 commit 478b882

File tree

9 files changed

+375
-15
lines changed

9 files changed

+375
-15
lines changed

platform/pc/Makefile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ SRCS-kernel.elf= \
126126
$(SRCDIR)/x86_64/kernel_machine.c \
127127
$(SRCDIR)/x86_64/mp.c \
128128
$(SRCDIR)/x86_64/page.c \
129+
$(SRCDIR)/x86_64/pvm.c \
130+
$(SRCDIR)/x86_64/pvm_asm.s \
129131
$(SRCDIR)/x86_64/rtc.c \
130132
$(SRCDIR)/x86_64/serial.c \
131133
$(SRCDIR)/x86_64/synth.c \
@@ -231,7 +233,10 @@ endif
231233
$(AWK) 'BEGIN{getline l < "$(PLATFORMDIR)/test-libs"}/TEST-LIBS/{gsub("TEST-LIBS",l)}/ARCH_DIR/{gsub("ARCH_DIR","$(PLATFORMOBJDIR)")}1' $(ROOTDIR)/test/runtime/$(TARGET).manifest | \
232234
$(MKFS) $(TARGET_ROOT_OPT) -b $(BOOTIMG) $(MKFS_UEFI) -k $(KERNEL) -t "(debug_exit:t)" $(TRACELOG_MKFS_OPTS) $(EXTRA_MKFS_OPTS) $(IMAGE)
233235

234-
$(OBJDIR)/src/$(ARCH)/crt0.o: $(OBJDIR)/frame.inc
236+
# Objects that include the generated register-frame offsets (frame.inc)
# must be rebuilt when it changes. ("frtace.o" was a typo for "ftrace.o".)
$(OBJDIR)/src/$(ARCH)/crt0.o \
$(OBJDIR)/src/$(ARCH)/pvm_asm.o \
$(OBJDIR)/src/$(ARCH)/ftrace.o \
: $(OBJDIR)/frame.inc
235240

236241
$(OBJDIR)/frame.inc: $(ARCHDIR)/frame.h
237242
$(call cmd,sed)

platform/pc/service.c

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <xen_platform.h>
2020
#include <virtio/virtio.h>
2121
#include <vmware/vmware.h>
22+
#include "pvm.h"
2223
#include "serial.h"
2324

2425
#define BOOT_PARAM_OFFSET_E820_ENTRIES 0x01E8
@@ -398,7 +399,12 @@ void init_service(u64 rdi, u64 rsi, hvm_start_info start_info)
398399
serial_init();
399400
early_init_debug("init_service");
400401

401-
pv_ops.cpuid = x86_cpuid;
402+
if (pvm_detect()) {
403+
kvmem.r = pvm_get_addr_range();
404+
pv_ops.cpuid = pvm_cpuid;
405+
} else {
406+
pv_ops.cpuid = x86_cpuid;
407+
}
402408
if (do_setup_initmap)
403409
setup_initmap();
404410
find_initial_pages();
@@ -413,7 +419,10 @@ void init_service(u64 rdi, u64 rsi, hvm_start_info start_info)
413419
u64 offset_cpuid = u64_from_pointer(pv_ops.cpuid) + kas_kern_offset - kernel_phys_offset;
414420
pv_ops.cpuid = pointer_from_u64(offset_cpuid);
415421

416-
pv_ops.frame_return = x86_frame_return;
422+
if (pvm_detected)
423+
pv_ops.frame_return = pvm_frame_return;
424+
else
425+
pv_ops.frame_return = x86_frame_return;
417426
init_kernel_heaps();
418427
if (cmdline)
419428
create_region(u64_from_pointer(cmdline), cmdline_size, REGION_CMDLINE);
@@ -454,6 +463,8 @@ extern boolean init_tsc_timer(kernel_heaps kh);
454463

455464
void detect_hypervisor(kernel_heaps kh)
456465
{
466+
if (pvm_detected)
467+
pvm_setup(kh);
457468
if (!kvm_detect(kh)) {
458469
init_debug("probing for Xen hypervisor");
459470
if (!xen_detect(kh)) {

src/x86_64/interrupt.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
#define int_debug(x, ...)
1313
#endif
1414

15-
#define INTERRUPT_VECTOR_START 32 /* end of exceptions; defined by architecture */
1615
#define MAX_INTERRUPT_VECTORS 256 /* as defined by architecture; we may have less */
1716

1817
typedef struct inthandler {

src/x86_64/kernel_machine.h

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -240,25 +240,30 @@ static inline void xgetbv(u32 ecx, u32 *eax, u32 *edx)
240240
}
241241

242242
#ifdef KERNEL
243+
#include "pvm.h"
244+
243245
void cmdline_consume(sstring opt_name, cmdline_handler h);
244246
void boot_params_apply(tuple t);
245247

246248
/* syscall entry */
247249

248-
static inline void set_syscall_handler(void *syscall_entry)
249-
{
250-
write_msr(LSTAR_MSR, u64_from_pointer(syscall_entry));
251-
u32 selectors = ((USER_CODE32_SELECTOR | 0x3) << 16) | KERNEL_CODE_SELECTOR;
252-
write_msr(STAR_MSR, (u64)selectors << 32);
253-
write_msr(SFMASK_MSR, U64_FROM_BIT(EFLAG_INTERRUPT) | U64_FROM_BIT(EFLAG_TRAP));
254-
write_msr(EFER_MSR, read_msr(EFER_MSR) | EFER_SCE);
255-
}
256-
257250
extern void syscall_enter(void);
258251

259252
/* Program the SYSCALL entry point for this CPU.
 * Under PVM, syscalls are delivered through the PVM-specific entry stub and
 * the STAR MSR is left untouched; otherwise the standard SYSCALL/SYSRET MSRs
 * (STAR/LSTAR/SFMASK) are set up and SYSCALL is enabled via EFER.SCE. */
static inline void init_syscall_handler()
{
    void *syscall_entry;
    if (pvm_detected) {
        /* Do not set the STAR MSR, because PVM does not allow setting the user-mode selector to a
         * different value than the value used on Linux. */
        syscall_entry = pvm_syscall_entry;
    } else {
        u32 selectors = ((USER_CODE32_SELECTOR | 0x3) << 16) | KERNEL_CODE_SELECTOR;
        write_msr(STAR_MSR, (u64)selectors << 32);
        syscall_entry = syscall_enter;
    }
    write_msr(LSTAR_MSR, u64_from_pointer(syscall_entry));
    /* mask IF and TF on syscall entry so the handler starts with interrupts
     * and single-stepping disabled */
    write_msr(SFMASK_MSR, U64_FROM_BIT(EFLAG_INTERRUPT) | U64_FROM_BIT(EFLAG_TRAP));
    write_msr(EFER_MSR, read_msr(EFER_MSR) | EFER_SCE);
}
263268

264269
static inline void set_page_write_protect(boolean enable)
@@ -324,6 +329,8 @@ struct cpuinfo_machine {
324329
/* Monotonic clock timestamp when the lapic timer is supposed to fire; used to re-arm the timer
325330
* when it fires too early (based on what the monotonic clock source says). */
326331
timestamp lapic_timer_expiry;
332+
333+
struct pvm_vcpu *pvm;
327334
};
328335

329336
typedef struct cpuinfo *cpuinfo;

src/x86_64/machine.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,13 @@ static inline __attribute__((always_inline)) u8 is_immediate_integer(value v)
2424

2525
#ifdef KERNEL
2626

27+
/* Tagged memory is (VA_TAG_OFFSET + VA_TAG_WIDTH) bits long, and needs to be aligned to its length.
28+
* The PVM hypervisor allocates 44 bits of address space for the guest; to ensure tagged memory can
29+
* be carved out from this space without touching its limits (because memory regions around its
30+
* limits are used for other purposes), tagged memory must be at least 2 bits shorter than the total
31+
* address space. */
2732
#define VA_TAG_OFFSET 38
28-
#define VA_TAG_WIDTH 8
33+
#define VA_TAG_WIDTH 4
2934

3035
static inline __attribute__((always_inline)) value_tag tagof(void* v) {
3136
u64 x = u64_from_pointer(v);

src/x86_64/pvm.c

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
/* Pagetable-based Virtual Machine hypervisor */
2+
3+
#include <kernel.h>
4+
#include "pvm.h"
5+
6+
#define PVM_CPUID_SIGNATURE 0x40000000
7+
#define PVM_CPUID_VENDOR_FEATURES 0x40000002
8+
9+
#define KVM_SIGNATURE "KVMKVMKVM\0\0\0"
10+
#define PVM_SIGNATURE 0x4d5650 /* "PVM\0" */
11+
12+
#define MSR_PVM_LINEAR_ADDRESS_RANGE 0x4b564df0
13+
#define MSR_PVM_VCPU_STRUCT 0x4b564df1
14+
#define MSR_PVM_SUPERVISOR_RSP 0x4b564df2
15+
#define MSR_PVM_EVENT_ENTRY 0x4b564df4
16+
#define MSR_PVM_RETU_RIP 0x4b564df5
17+
#define MSR_PVM_RETS_RIP 0x4b564df6
18+
#define MSR_PVM_SWITCH_CR3 0x4b564df7
19+
20+
boolean pvm_detected;
21+
22+
boolean pvm_detect(void)
23+
{
24+
u64 flags = read_flags();
25+
if (!(flags & U64_FROM_BIT(EFLAG_INTERRUPT)))
26+
return false;
27+
u64 cs;
28+
asm volatile("mov %%cs,%0" : "=r" (cs) : );
29+
if ((cs & 0x3) != 3) /* check if CPL == 3 */
30+
return false;
31+
u32 v[4];
32+
pvm_cpuid(PVM_CPUID_SIGNATURE, 0, v);
33+
if ((v[0] < PVM_CPUID_VENDOR_FEATURES) || runtime_memcmp(&v[1], KVM_SIGNATURE, 3 * sizeof(u32)))
34+
return false;
35+
pvm_cpuid(PVM_CPUID_VENDOR_FEATURES, 0, v);
36+
return pvm_detected = (v[1] == PVM_SIGNATURE);
37+
}
38+
39+
range pvm_get_addr_range(void)
40+
{
41+
u64 addr_range = read_msr(MSR_PVM_LINEAR_ADDRESS_RANGE);
42+
u64 pml4_index_start = addr_range & 0x1ff;
43+
u64 pml4_index_end = (addr_range >> 16) & 0x1ff;
44+
return irange((0x1fffe00 | pml4_index_start) << PT_SHIFT_L1,
45+
(0x1fffe00 | pml4_index_end) << PT_SHIFT_L1);
46+
}
47+
48+
/* Per-CPU PVM initialization: allocate and register the per-vCPU struct
 * shared with the hypervisor, then program the PVM MSRs (event entry points,
 * direct-switch CR3, supervisor stack). Executed on the boot CPU and on each
 * secondary CPU via the per-CPU init hook (see pvm_setup). */
closure_func_basic(thunk, void, pvm_cpu_init)
{
    /* the PVM vCPU struct must be page-aligned */
    struct pvm_vcpu *pvm = allocate_zero((heap)heap_page_backed(get_kernel_heaps()), PAGESIZE);
    assert(pvm != INVALID_ADDRESS);

    /* PVM requires user-mode segment selectors to have the same values as used on Linux */
    pvm->user_ss = 0x2b;
    pvm->user_cs = 0x33;

    /* the hypervisor is given the physical address of the shared struct */
    write_msr(MSR_PVM_VCPU_STRUCT, physical_from_virtual(pvm));
    cpuinfo ci = current_cpu();
    ci->m.pvm = pvm;
    write_msr(KERNEL_GS_MSR, u64_from_pointer(ci));
    u64 cr3;
    mov_from_cr("cr3", cr3);
    /* In order for the direct switching feature to be enabled (i.e. to switch between user mode and
     * supervisor mode without a VM exit), PVM requires CR3 values for the two modes to be different
     * from each other; since Nanos uses a single CR3 value, flip one bit between the user CR3 and
     * the supervisor CR3 to make them appear as different values (a CR3 value must be page-aligned,
     * so the flipped bit will not cause a different page table root to be used). */
    write_msr(MSR_PVM_SWITCH_CR3, cr3 | 1);
    /* event/return entry points, presumably implemented in pvm_asm.s — the
     * address of each symbol is the entry point itself */
    extern void *pvm_event_entry;
    write_msr(MSR_PVM_EVENT_ENTRY, u64_from_pointer(&pvm_event_entry));
    extern void *pvm_retu, *pvm_rets;
    write_msr(MSR_PVM_RETU_RIP, u64_from_pointer(&pvm_retu));
    write_msr(MSR_PVM_RETS_RIP, u64_from_pointer(&pvm_rets));
    /* Configure initial stack for PVM events in user mode: the actual stack (interrupt vs
     * exception) is selected in the event handler. */
    write_msr(MSR_PVM_SUPERVISOR_RSP, u64_from_pointer(ci->m.int_stack));
}
79+
80+
/* Install PVM per-CPU initialization: run it right away on the boot CPU,
 * and register it so every secondary CPU runs it when brought online. */
void pvm_setup(kernel_heaps kh)
{
    thunk cpu_init = closure_func(heap_general(kh), thunk, pvm_cpu_init);
    assert(cpu_init != INVALID_ADDRESS);
    apply(cpu_init);                /* boot CPU */
    register_percpu_init(cpu_init); /* secondary CPUs */
}
87+
88+
/* Paravirtualized CPUID: execute a PVM-reserved instruction encoding
 * followed by CPUID so the hypervisor can intercept it (the guest runs at
 * CPL 3, where hypervisor leaves are not otherwise visible).
 * NOTE(review): the byte sequence is assumed to be the PVM synthetic-CPUID
 * prefix from the PVM spec — confirm against the spec if touched.
 * On return, v[0..3] hold EAX, EBX, ECX, EDX for the given leaf/subleaf. */
void pvm_cpuid(u32 leaf, u32 subleaf, u32 *v)
{
    asm volatile(".byte 0x0f,0x01,0x3c,0x25,0x50,0x56,0x4d,0xff,0x0f,0xa2" :
        "=a" (v[0]), "=b" (v[1]), "=c" (v[2]), "=d" (v[3]) : "0" (leaf), "2" (subleaf));
}
93+
94+
/* Handle a PVM event (exception or interrupt). When the event interrupted
 * user mode (save_frame == true), the user register state published by the
 * hypervisor in the shared pvm_vcpu struct is first copied into the current
 * context frame. The event is then dispatched to common_handler on the
 * exception or interrupt stack, depending on the vector; this function does
 * not return to its caller. */
void pvm_event(boolean save_frame)
{
    cpuinfo ci = current_cpu();
    struct pvm_vcpu *pvm = ci->m.pvm;
    context_frame f = get_current_context(ci)->frame;
    if (save_frame) {
        /* import user-mode state from the hypervisor-shared struct */
        f[FRAME_RIP] = pvm->rip;
        f[FRAME_RSP] = pvm->rsp;
        f[FRAME_EFLAGS] = pvm->eflags;
        f[FRAME_RCX] = pvm->rcx;
        f[FRAME_R11] = pvm->r11;
        f[FRAME_VECTOR] = pvm->event_vector;
        f[FRAME_ERROR_CODE] = pvm->event_errcode;
    }
    u32 vector = f[FRAME_VECTOR];
    if (vector == 14) /* page fault */
        f[FRAME_CR2] = pvm->cr2;
    /* exceptions (vectors below the interrupt range) use the exception stack */
    void *stack = (vector < INTERRUPT_VECTOR_START) ? ci->m.exception_stack : ci->m.int_stack;
    switch_stack(stack, common_handler);
}
114+
115+
/* Handle a syscall from user mode under PVM: copy the user register state
 * published by the hypervisor in the shared pvm_vcpu struct into the user
 * context frame, invoke the generic syscall dispatcher, then return to the
 * frame via pvm_frame_return (which does not return here). */
void pvm_syscall(context user_ctx)
{
    extern void (*syscall)(context ctx);
    cpuinfo ci = current_cpu();
    struct pvm_vcpu *pvm = ci->m.pvm;
    context_frame f = user_ctx->frame;
    /* RCX/R11 hold the user RIP/RFLAGS per the SYSCALL convention */
    f[FRAME_RIP] = pvm->rip;
    f[FRAME_RSP] = pvm->rsp;
    f[FRAME_EFLAGS] = pvm->eflags;
    f[FRAME_RCX] = pvm->rcx;
    f[FRAME_R11] = pvm->r11;
    syscall(user_ctx);
    pvm_frame_return(f);
}
129+
130+
/* Paravirtualized replacement for x86_frame_return: when returning to user
 * mode, publish the user-visible register state into the hypervisor-shared
 * pvm_vcpu struct (PVM restores RIP/RSP/RFLAGS/RCX/R11 from there), then
 * resume via the PVM event-return path. Never returns. */
void __attribute__((noreturn)) pvm_frame_return(context_frame f)
{
    if (f[FRAME_CS] & 0x3) { /* (CPL != 0) means return to user mode */
        cpuinfo ci = current_cpu();
        /* NOTE(review): this declaration shadows the parameter f with the
         * current context's frame, so the registers published below come
         * from the current context while pvm_event_return() receives the
         * original parameter. Presumably both refer to the same frame
         * whenever CS indicates user mode — confirm against callers. */
        context_frame f = get_current_context(ci)->frame;
        struct pvm_vcpu *pvm = ci->m.pvm;
        pvm->rip = f[FRAME_RIP];
        pvm->rsp = f[FRAME_RSP];
        pvm->eflags = f[FRAME_EFLAGS];
        pvm->rcx = f[FRAME_RCX];
        pvm->r11 = f[FRAME_R11];
    }
    pvm_event_return(f);
}

src/x86_64/pvm.h

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/* Pagetable-based Virtual Machine hypervisor */

#ifndef PVM_H_
#define PVM_H_

/* Per-vCPU structure shared with the PVM hypervisor (its physical address is
 * registered via MSR_PVM_VCPU_STRUCT in pvm.c). The layout is part of the
 * PVM ABI — do not reorder or resize fields; the reserved fields are padding
 * dictated by the hypervisor's expected layout (confirm against the PVM
 * specification before changing). */
struct pvm_vcpu {
    u64 event_flags;
    u32 event_errcode;      /* error code of the delivered event */
    u32 event_vector;       /* vector number of the delivered event */
    u64 cr2;                /* faulting address for page-fault events */
    u64 reserved0[5];
    u16 user_cs, user_ss;   /* user-mode selectors; PVM requires Linux values */
    u32 reserved1;
    u64 reserved2;
    u64 user_gsbase;
    /* user-mode register state exchanged with the hypervisor on mode switches */
    u32 eflags;
    u32 pkru;
    u64 rip;
    u64 rsp;
    u64 rcx;
    u64 r11;
};

/* hypervisor detection and per-CPU setup (pvm.c) */
boolean pvm_detect(void);
range pvm_get_addr_range(void);
void pvm_setup(kernel_heaps kh);

/* paravirtualized CPUID; v[0..3] receive EAX, EBX, ECX, EDX */
void pvm_cpuid(u32 leaf, u32 subleaf, u32 *v);
/* syscall entry stub — presumably defined in pvm_asm.s */
void pvm_syscall_entry(void);
/* resume execution in the context described by f; never return */
void pvm_frame_return(context_frame f) __attribute__((noreturn));
void pvm_event_return(context_frame f) __attribute__((noreturn));

/* set by pvm_detect() when running under the PVM hypervisor */
extern boolean pvm_detected;

#endif

0 commit comments

Comments
 (0)