Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ steps:
# All system call tests.
- <<: *common
label: ":toolbox: System call tests (AMD64)"
command: make BAZEL_OPTIONS=--test_tag_filters=-allsave syscall-tests
command: make BAZEL_OPTIONS=--test_tag_filters=-allsave,-runsc_slimvm syscall-tests
parallelism: 20
agents:
<<: *platform_specific_agents
Expand Down
19 changes: 18 additions & 1 deletion pkg/cpuid/cpuid_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,24 @@ func (fs FeatureSet) VirtualAddressBits() uint32 {
//go:nosplit
func (fs FeatureSet) PhysicalAddressBits() uint32 {
	// The low byte of eax for the addressSizes leaf is the physical address
	// width reported by the CPU.
	ax, _, _, _ := fs.query(addressSizes)
	physBits := ax & 0xff
	if !fs.AMD() {
		return physBits
	}

	// The memory encryption leaf may only be consulted when the CPU reports
	// it as an available extended function.
	maxExtended, _, _, _ := fs.query(extendedFunctionInfo)
	if maxExtended < uint32(amdMemoryEncryptionInfo) {
		return physBits
	}

	// Without SME or SEV there is no reduction to apply.
	memEncAX, memEncBX, _, _ := fs.query(amdMemoryEncryptionInfo)
	if memEncAX&amdMemoryEncryptionFeatureMask == 0 {
		return physBits
	}

	// AMD memory encryption reduces usable physical address width by the
	// CPUID-reported amount. Match Linux's
	// arch/x86/kernel/cpu/amd.c:early_detect_mem_encrypt().
	reduction := (memEncBX >> amdPhysAddrReductionShift) & amdPhysAddrReductionMask
	if reduction > physBits {
		// An inconsistent CPUID (e.g. from a hypervisor) must not make the
		// unsigned subtraction wrap around to a huge width.
		return physBits
	}
	return physBits - reduction
}

// CacheType describes the type of a cache, as returned in eax[4:0] for eax=4.
Expand Down
47 changes: 29 additions & 18 deletions pkg/cpuid/native_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,25 @@ const xSaveInfoNumLeaves = 64 // Maximum number of xSaveInfo leaves.

// The "extended" functions.
const (
extendedStart cpuidFunction = 0x80000000
extendedFunctionInfo cpuidFunction = extendedStart + 0 // Returns highest available extended function in eax.
extendedFeatures = extendedStart + 1 // Returns some extended feature bits in edx and ecx.
processorBrandString2 = extendedStart + 2 // Processor Name String Identifier.
processorBrandString3 = extendedStart + 3 // Processor Name String Identifier.
processorBrandString4 = extendedStart + 4 // Processor Name String Identifier.
l1CacheAndTLBInfo = extendedStart + 5 // Returns L2 cache information.
l2CacheInfo = extendedStart + 6 // Returns L2 cache information.
addressSizes = extendedStart + 8 // Physical and virtual address sizes.
extendedStart cpuidFunction = 0x80000000
extendedFunctionInfo cpuidFunction = extendedStart + 0 // Returns highest available extended function in eax.
extendedFeatures = extendedStart + 1 // Returns some extended feature bits in edx and ecx.
processorBrandString2 = extendedStart + 2 // Processor Name String Identifier.
processorBrandString3 = extendedStart + 3 // Processor Name String Identifier.
processorBrandString4 = extendedStart + 4 // Processor Name String Identifier.
l1CacheAndTLBInfo = extendedStart + 5 // Returns L2 cache information.
l2CacheInfo = extendedStart + 6 // Returns L2 cache information.
addressSizes = extendedStart + 8 // Physical and virtual address sizes.
amdMemoryEncryptionInfo = extendedStart + 31 // AMD memory encryption information.
)

// AMD-defined memory encryption feature bits and fields, as reported by the
// amdMemoryEncryptionInfo CPUID leaf.
const (
	// Feature bits tested against eax.
	amdSecureMemoryEncryption        = 1 << iota // Bit 0.
	amdSecureEncryptedVirtualization             // Bit 1.

	// amdMemoryEncryptionFeatureMask selects both feature bits above.
	amdMemoryEncryptionFeatureMask = amdSecureMemoryEncryption | amdSecureEncryptedVirtualization

	// The physical address reduction is read from ebx by shifting right by
	// amdPhysAddrReductionShift and masking with amdPhysAddrReductionMask.
	amdPhysAddrReductionShift = 6
	amdPhysAddrReductionMask  = 0x3f
)

var allowedBasicFunctions = [...]bool{
Expand All @@ -84,14 +94,15 @@ var allowedBasicFunctions = [...]bool{
}

// allowedExtendedFunctions is indexed by an extended function's offset from
// extendedStart; a true entry marks that function as allowed.
//
// NOTE(review): presumably consulted before an extended CPUID leaf is
// queried or emulated; confirm against the users of this table.
var allowedExtendedFunctions = [...]bool{
	extendedFunctionInfo - extendedStart:    true,
	extendedFeatures - extendedStart:        true,
	addressSizes - extendedStart:            true,
	processorBrandString2 - extendedStart:   true,
	processorBrandString3 - extendedStart:   true,
	processorBrandString4 - extendedStart:   true,
	l1CacheAndTLBInfo - extendedStart:       true,
	l2CacheInfo - extendedStart:             true,
	amdMemoryEncryptionInfo - extendedStart: true,
}

// Function executes a CPUID function.
Expand Down Expand Up @@ -119,7 +130,7 @@ func (i *In) normalize() {
switch cpuidFunction(i.Eax) {
case vendorID, featureInfo, intelCacheDescriptors, extendedFunctionInfo, extendedFeatures:
i.Ecx = 0 // Ignore.
case processorBrandString2, processorBrandString3, processorBrandString4, l1CacheAndTLBInfo, l2CacheInfo:
case processorBrandString2, processorBrandString3, processorBrandString4, l1CacheAndTLBInfo, l2CacheInfo, amdMemoryEncryptionInfo:
i.Ecx = 0 // Ignore.
case intelDeterministicCacheParams, extendedFeatureInfo:
// Preserve i.Ecx.
Expand Down
6 changes: 6 additions & 0 deletions pkg/hostarch/hostarch_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,15 @@ const (
// HugePageSize is the system huge page size.
HugePageSize = 1 << HugePageShift

// JumboPageSize is the 1GB jumbo page size.
JumboPageSize = 1 << JumboPageShift

// CacheLineSize is the size of the cache line.
CacheLineSize = 1 << CacheLineShift

// JumboPageShift is the binary log of the 1GB jumbo page size.
JumboPageShift = 30

// CacheLineShift is the binary log of the cache line size.
CacheLineShift = 6
)
Expand Down
6 changes: 6 additions & 0 deletions pkg/hostarch/hostarch_x86.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ const (
// HugePageSize is the system huge page size.
HugePageSize = 1 << HugePageShift

// JumboPageSize is the 1GB jumbo page size.
JumboPageSize = 1 << JumboPageShift

// CacheLineSize is the size of the cache line.
CacheLineSize = 1 << CacheLineShift

Expand All @@ -35,6 +38,9 @@ const (
// HugePageShift is the binary log of the system huge page size.
HugePageShift = 21

// JumboPageShift is the binary log of the 1GB jumbo page size.
JumboPageShift = 30

// CacheLineShift is the binary log of the cache line size.
CacheLineShift = 6
)
Expand Down
1 change: 1 addition & 0 deletions pkg/hostarch/sizes_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ const (
PageMask = PageSize - 1
HugePageMask = HugePageSize - 1
CacheLineMask = CacheLineSize - 1
JumboPageMask = ^uintptr(JumboPageSize - 1)
)

type bytecount interface {
Expand Down
12 changes: 12 additions & 0 deletions pkg/ring0/defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch/fpu"
)

// CPU vendor identifiers, as stored in CPUVendor.
const (
	// CPUIntel identifies an Intel CPU.
	CPUIntel uint64 = 0

	// CPUAMD identifies an AMD (or AMD-compatible) CPU.
	CPUAMD uint64 = 1
)

// CPUVendor holds the vendor of the CPU as one of the CPU* constants. Its zero
// value is CPUIntel.
//
// The VMCALL macro in entry_amd64.s compares it against CPUIntel to choose
// between the vmcall and vmmcall instructions.
var CPUVendor uint64

// Kernel is a global kernel object.
//
// This contains global state, shared by multiple CPUs.
Expand Down
11 changes: 11 additions & 0 deletions pkg/ring0/defs_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ type kernelEntry struct {
// kernelCR3 is the cr3 used for sentry kernel.
kernelCR3 uintptr

// enableVMCALL is 1 when VMCALL is enabled and 0 when it is disabled.
enableVMCALL uint64

// gdt is the CPU's descriptor table.
gdt descriptorTable

Expand Down Expand Up @@ -180,6 +183,14 @@ func (c *CPU) FaultAddr() uintptr {
return c.faultAddr
}

// EnableVMCALL sets this CPU's enableVMCALL flag to 1.
//
// The syscall entry path in entry_amd64.s tests the flag: while it is
// non-zero, syscalls other than the red pill, exit and exit_group are issued
// through the VMCALL macro instead of taking the hr3_do_syscall path.
func (c *CPU) EnableVMCALL() {
c.enableVMCALL = 1
}

// DisableVMCALL clears this CPU's enableVMCALL flag.
//
// The syscall entry path in entry_amd64.s tests the flag: while it is zero,
// every syscall takes the hr3_do_syscall path.
func (c *CPU) DisableVMCALL() {
c.enableVMCALL = 0
}

// SwitchArchOpts are embedded in SwitchOpts.
type SwitchArchOpts struct {
// UserPCID indicates that the application PCID to be used on switch,
Expand Down
55 changes: 55 additions & 0 deletions pkg/ring0/entry_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#define ENTRY_STACK_TOP 264 // +checkoffset . kernelEntry.stackTop
#define ENTRY_CPU_SELF 272 // +checkoffset . kernelEntry.cpuSelf
#define ENTRY_KERNEL_CR3 280 // +checkoffset . kernelEntry.kernelCR3
#define ENTRY_ENABLE_VMCALL 288 // +checkoffset . kernelEntry.enableVMCALL

// Bits.
#define _RFLAGS_IF 512 // +checkconst . _RFLAGS_IF
Expand Down Expand Up @@ -64,6 +65,12 @@
#define SyscallInt80 128 // +checkconst . SyscallInt80
#define Syscall 256 // +checkconst . Syscall

#define SyscallExit 60 // +checkconst . SyscallExit
#define SyscallExitGroup 231 // +checkconst . SyscallExitGroup
#define SyscallRedPill 4294967295 // +checkconst . SyscallRedPill

#define CPUIntel 0 // +checkconst . CPUIntel

#define PTRACE_R15 0 // +checkoffset linux PtraceRegs.R15
#define PTRACE_R14 8 // +checkoffset linux PtraceRegs.R14
#define PTRACE_R13 16 // +checkoffset linux PtraceRegs.R13
Expand Down Expand Up @@ -160,6 +167,15 @@
#define LOAD_KERNEL_STACK(entry) \
MOVQ ENTRY_STACK_TOP(entry), SP;

// VMCALL issues a hypercall with the instruction that matches the CPU vendor:
// vmcall (0F 01 C1) when ·CPUVendor equals CPUIntel, vmmcall (0F 01 D9)
// otherwise. Both opcodes are emitted as raw BYTE directives, so each one
// counts as 3 separate instructions in the PC-relative jump offsets below.
// The leading CMPQ overwrites the arithmetic flags.
#define VMCALL() \
CMPQ ·CPUVendor(SB), $CPUIntel; \
JE 2(PC); \
JMP 5(PC); \ // Not Intel: skip the 3 vmcall BYTEs and the JMP after them.
BYTE $0x0F; BYTE $0x01; BYTE $0xC1; \
JMP 4(PC); \ // vmcall done: skip the 3 vmmcall BYTEs.
BYTE $0x0F; BYTE $0x01; BYTE $0xD9;

// ADDR_OF_FUNC defines a function named 'name' that returns the address of
// 'symbol'.
#define ADDR_OF_FUNC(name, symbol) \
Expand Down Expand Up @@ -488,6 +504,45 @@ sysenter_skip_gs:
RET

kernel:
// Handle syscalls from GR0 in HR3 when enableVMCALL is zero. Even when it
// is set, the red pill, exit and exit_group syscalls still take this path.
// Currently there are 2 use cases:
// 1. Using the KVM platform.
// 2. Upgrading the SlimVM platform. This is one way to return the M to
// user mode (HR3) so that the platform can be upgraded.
CMPQ ENTRY_ENABLE_VMCALL(GS), $0
JE hr3_do_syscall

CMPQ AX, $SyscallRedPill
JE hr3_do_syscall

CMPQ AX, $SyscallExit
JE hr3_do_syscall

CMPQ AX, $SyscallExitGroup
JE hr3_do_syscall

vmcall:
// Handle the syscall from GR0 in the host kernel. Copied from the
// "handle system calls from G0" part of __dune_syscall in libdune/dune.S.
PUSHQ R11
POPFQ

CMPQ AX, $158 // arch_prctl syscall
JNE 3(PC)
CMPQ DI, $0x1002 //ARCH_SET_FS
JE arch_prctl_vmcall

VMCALL()
JMP *CX

arch_prctl_vmcall:
VMCALL()
CMPQ AX, $0
JNE 2(PC)
MOVQ SI, CPU_REGISTERS+PTRACE_FS_BASE(GS)
JMP *CX

hr3_do_syscall:
// We can't restore the original stack, but we can access the registers
// in the CPU state directly. No need for temporary juggling.
MOVQ AX, ENTRY_SCRATCH0(GS)
Expand Down
27 changes: 27 additions & 0 deletions pkg/ring0/kernel_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -324,3 +324,30 @@ func startGo(c *CPU) {
func ReadCR2() uintptr {
return readCR2()
}

// PrefaultIDT reads bits[0] of the first global IDT entry and bits[3] of the
// entry at index _NR_INTERRUPTS-1, and returns their sum.
//
// NOTE(review): presumably the two loads exist to touch both ends of the
// table so that its backing memory is resident, and the sum is returned so
// the loads are not discarded; confirm against the callers.
//
//go:noinline
//go:nosplit
func (c *CPU) PrefaultIDT() uint32 {
	first := c.kernel.globalIDT[0].bits[0]
	last := c.kernel.globalIDT[_NR_INTERRUPTS-1].bits[3]
	return first + last
}

// SetCPUIDFaulting turns CPUID faulting on or off, as requested by on.
//
// It returns true if the setting was written, and false if the CPU does not
// report support for CPUID faulting (in which case nothing is written).
//
//go:nosplit
func SetCPUIDFaulting(on bool) bool {
	// Per the SDM (Vol 3, Table 2-43), PLATFORM_INFO bit 31 denotes support
	// for CPUID faulting.
	if rdmsr(_MSR_PLATFORM_INFO)&_PLATFORM_INFO_CPUID_FAULT == 0 {
		return false
	}

	// Faulting is enabled and disabled via the MISC_FEATURES MSR; only the
	// CPUID trap bit is changed, all other bits are written back as read.
	misc := rdmsr(_MSR_MISC_FEATURES)
	if on {
		misc |= _MISC_FEATURE_CPUID_TRAP
	} else {
		misc &^= _MISC_FEATURE_CPUID_TRAP
	}
	wrmsr(_MSR_MISC_FEATURES, misc)
	return true // Setting successful.
}
18 changes: 18 additions & 0 deletions pkg/ring0/lib_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,21 @@ func InitDefault() {
cpuid.Initialize()
Init(cpuid.HostFeatureSet())
}

// DisableLA57 makes ring0 behave as if the host CPU did not advertise 5-level
// paging. It clears hasLA57 so that CR4.LA57 stays 0 and, if more than 48
// virtual address bits were configured, clamps the address-space sizes to a
// 4-level layout (48-bit VA, 2^47 userspace).
//
// It must be called after Init/InitDefault and before any vCPU loads CR4 or
// any PageTables are created. Use it when the platform's hardware-
// virtualization layer cannot follow a 5-level page table walk (e.g. an EPT
// implementation limited to 4 levels), regardless of host CPUID.
func DisableLA57() {
	hasLA57 = false
	if VirtualAddressBits <= 48 {
		// Already within a 4-level layout; the sizes need no adjustment.
		return
	}

	VirtualAddressBits = 48
	UserspaceSize = uintptr(1) << (VirtualAddressBits - 1)
	pageOffsetMask := uintptr(hostarch.PageSize - 1)
	MaximumUserAddress = (UserspaceSize - 1) &^ pageOffsetMask
	KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
}
1 change: 1 addition & 0 deletions pkg/ring0/pagetables/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ go_library(
visibility = [
"//pkg/ring0:__subpackages__",
"//pkg/sentry/platform/kvm:__subpackages__",
"//pkg/sentry/platform/slimvm:__subpackages__",
],
deps = [
"//pkg/cpuid",
Expand Down
14 changes: 14 additions & 0 deletions pkg/ring0/pagetables/pagetables.go
Original file line number Diff line number Diff line change
Expand Up @@ -331,3 +331,17 @@ func (p *PageTables) Lookup(addr hostarch.Addr, findFirst bool) (virtual hostarc
func (p *PageTables) MarkReadOnlyShared() {
// Only the flag is set here; this method does not touch any entries.
p.readOnlyShared = true
}

// PrefaultRootTable reads and returns the first entry of the root table,
// which touches the root table page so that its physical page is mapped.
//
// The runtime allocator backs PTEs with plain Go heap pages (new(PTEs), with
// no mlock, MAP_POPULATE or memfile pinning), so Linux can reclaim the root
// page under memory pressure. Touching it from sentry context right before
// SwitchToUser guarantees the page is resident when iret/sysret loads CR3,
// avoiding rare host page faults that have been observed to manifest as vCPU
// bounce stalls (state=7, userExits stuck).
//
//go:nosplit
//go:noinline
func (p *PageTables) PrefaultRootTable() PTE {
	first := p.root[0]
	return first
}
16 changes: 15 additions & 1 deletion pkg/ring0/pagetables/pagetables_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,22 @@ var (
pgdShift = 39
pgdMask uintptr = 0x1ff << pgdShift
pgdSize uintptr = 1 << pgdShift

// la57Enabled gates whether InitArch promotes new PageTables to a
// 5-level layout when the host CPU advertises LA57. Defaults to true
// so existing behavior is unchanged; platforms whose hardware-
// virtualization layer cannot walk a 5-level page table call
// DisableLA57 once at startup to force 4-level.
la57Enabled = true
)

// DisableLA57 forces all subsequently-created PageTables to use a 4-level
// layout regardless of host CPUID, by clearing la57Enabled, which InitArch
// checks before enabling large addresses. It must be called before any
// PageTables are created.
func DisableLA57() {
la57Enabled = false
}

const (
pteShift = 12
pmdShift = 21
Expand Down Expand Up @@ -54,7 +68,7 @@ const (
//go:nosplit
func (p *PageTables) InitArch(allocator Allocator) {
featureSet := cpuid.HostFeatureSet()
if featureSet.HasFeature(cpuid.X86FeatureLA57) {
if la57Enabled && featureSet.HasFeature(cpuid.X86FeatureLA57) {
p.largeAddressesEnabled = true
lowerTop = 0x00FFFFFFFFFFFFFF
upperBottom = 0xFF00000000000000
Expand Down
Loading
Loading