Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 38 additions & 8 deletions pkg/sentry/kernel/task_clone.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs"
"gvisor.dev/gvisor/pkg/sentry/inet"
Expand Down Expand Up @@ -47,10 +48,12 @@ func failCloneAfterTaskCreation(nt *Task) {
// for group signal delivery, had children reparented to it, etc.
// Thus we can't just drop it on the floor. Instead, instruct the
// task goroutine to exit immediately, as quietly as possible.
nt.k.tasks.mu.Lock()
nt.exitTracerNotified = true
nt.exitTracerAcked = true
nt.exitParentNotified = true
nt.exitParentAcked = true
nt.k.tasks.mu.Unlock()
nt.runState = (*runExitMain)(nil)
}

Expand Down Expand Up @@ -109,6 +112,13 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
return 0, nil, linuxerr.EINVAL
}

if args.Flags&linux.CLONE_PIDFD != 0 {
// Verify that the pidfd address is writable before dedicating any effort.
if err := t.MemoryManager().CheckWritable(t, hostarch.Addr(args.Pidfd), 4 /* sizeof(int32) */); err != nil {
return 0, nil, err
}
}

// Pull task registers and FPU state, a cloned task will inherit the
// state of the current task.
if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil {
Expand Down Expand Up @@ -325,15 +335,10 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
cfg.InheritParent = t
}
if args.Flags&linux.CLONE_PIDFD != 0 {
if args.Flags&linux.CLONE_THREAD != 0 {
cfg.pidFDMode = makeThreadedPIDFD
} else {
cfg.pidFDMode = makePIDFD
}
cfg.pidFDAddr = hostarch.Addr(args.Pidfd)
cfg.makePIDFD = true
}
nt, pidFD, err := t.tg.pidns.owner.cloneNewTask(t, cfg)
// If cloneNewTask succeeds, we transfer references to nt. If cloneNewTask fails, it does
nt, err := t.tg.pidns.owner.NewTask(t, cfg)
// If NewTask succeeds, we transfer references to nt. If cloneNewTask fails, it does
// the cleanup for us.
cu.Release()
if err != nil {
Expand All @@ -357,6 +362,31 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
tid := nt.k.tasks.Root.IDOfTask(nt)
defer nt.Start(tid)

var pidFD int32 = -1
if args.Flags&linux.CLONE_PIDFD != 0 {
isThread := args.Flags&linux.CLONE_THREAD != 0
vfsfd, err := t.pidFDOpen(nt.pid, isThread, false /*nonblock*/)
if err != nil {
failCloneAfterTaskCreation(nt)
return 0, nil, err
}
defer vfsfd.DecRef(t)

fd, err := t.NewFDFrom(0, vfsfd, FDFlags{CloseOnExec: true})
if err != nil {
failCloneAfterTaskCreation(nt)
return 0, nil, err
}
pidFD = fd
if _, err := primitive.CopyInt32Out(t, hostarch.Addr(args.Pidfd), fd); err != nil {
failCloneAfterTaskCreation(nt)
if oldFile := t.FDTable().Remove(t, fd); oldFile != nil {
oldFile.DecRef(t)
}
return 0, nil, err
}
}

if seccheck.Global.Enabled(seccheck.PointClone) {
mask, info := getCloneSeccheckInfo(t, nt, args.Flags)
if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
Expand Down
66 changes: 13 additions & 53 deletions pkg/sentry/kernel/task_start.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
Expand All @@ -33,13 +32,6 @@ import (
)

// pidFDMode is an enum that clone3 uses to support CLONE_PIDFD.
type pidFDMode int

const (
dontMakePIDFD pidFDMode = iota
makePIDFD
makeThreadedPIDFD
)

// TaskConfig defines the configuration of a new Task (see below).
type TaskConfig struct {
Expand Down Expand Up @@ -126,9 +118,8 @@ type TaskConfig struct {
// Origin indicates the origins of the new task.
Origin TaskOrigin

// Used by clone3 to support CLONE_PIDFD.
pidFDMode pidFDMode
pidFDAddr hostarch.Addr
// used by clone(CLONE_PIDFD)
makePIDFD bool
}

// NewTask creates a new task defined by cfg.
Expand All @@ -138,13 +129,6 @@ type TaskConfig struct {
// If successful, NewTask transfers references held by cfg to the new task.
// Otherwise, NewTask releases them.
func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
t, _, err := ts.cloneNewTask(ctx, cfg)
return t, err
}

// cloneNewTask is a version of NewTask that the clone() syscall uses.
// Unlike NewTask, cloneNewTask will create and return a pidFD if requested.
func (ts *TaskSet) cloneNewTask(ctx context.Context, cfg *TaskConfig) (*Task, int32, error) {
var err error
cleanup := func() {
cfg.TaskImage.release(ctx)
Expand All @@ -160,20 +144,20 @@ func (ts *TaskSet) cloneNewTask(ctx context.Context, cfg *TaskConfig) (*Task, in
}
if err := cfg.UserCounters.incRLimitNProc(ctx); err != nil {
cleanup()
return nil, -1, err
return nil, err
}
t, fd, err := ts.newTask(ctx, cfg)
t, err := ts.newTask(ctx, cfg)
if err != nil {
cfg.UserCounters.decRLimitNProc()
cleanup()
return nil, -1, err
return nil, err
}
return t, fd, nil
return t, nil
}

// newTask is a helper for TaskSet.NewTask that only takes ownership of parts
// of cfg if it succeeds.
func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, int32, error) {
func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
srcT := TaskFromContext(ctx)
tg := cfg.ThreadGroup
image := cfg.TaskImage
Expand Down Expand Up @@ -220,33 +204,9 @@ func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, int32,
var cu cleanup.Cleanup
defer cu.Clean()

pidFD := int32(-1)

if cfg.pidFDMode != dontMakePIDFD {
isThread := cfg.pidFDMode == makeThreadedPIDFD
if cfg.makePIDFD {
t.pid = &pid{}
t.pid.t.Store(t)

vfsfd, err := srcT.pidFDOpen(t.pid, isThread, false /*nonblock*/)
if err != nil {
return nil, -1, err
}
defer vfsfd.DecRef(t)

fd, err := srcT.NewFDFrom(0, vfsfd, FDFlags{CloseOnExec: true})
if err != nil {
return nil, -1, err
}
pidFD = fd
cu.Add(func() {
if oldFile := srcT.FDTable().Remove(ctx, fd); oldFile != nil {
oldFile.DecRef(t)
}
})

if _, err := primitive.CopyUint32Out(srcT, cfg.pidFDAddr, uint32(fd)); err != nil {
return nil, -1, err
}
}

var (
Expand All @@ -264,7 +224,7 @@ func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, int32,
if srcT != nil {
var err error
if charged, cg, err = srcT.ChargeFor(t, CgroupControllerPIDs, CgroupResourcePID, 1); err != nil {
return nil, -1, err
return nil, err
}
if charged {
defer func() {
Expand Down Expand Up @@ -302,16 +262,16 @@ func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, int32,
// doesn't matter too much since the caller will exit before it returns
// to userspace. If the caller isn't in the same thread group, then
// we're in uncharted territory and can return whatever we want.
return nil, -1, linuxerr.EINTR
return nil, linuxerr.EINTR
}
if ts.liveTasks == 0 && ts.noNewTasksIfZeroLive {
// Since liveTasks == 0, our caller cannot be a task goroutine invoking
// a syscall, so it's safe to return a non-errno error that is more
// explanatory.
return nil, -1, fmt.Errorf("task creation disabled after Kernel.WaitExited() may have returned")
return nil, fmt.Errorf("task creation disabled after Kernel.WaitExited() may have returned")
}
if err := ts.assignTIDsLocked(t); err != nil {
return nil, -1, err
return nil, err
}
// Below this point, newTask is expected not to fail (there is no rollback
// of assignTIDsLocked or any of the following).
Expand Down Expand Up @@ -375,7 +335,7 @@ func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, int32,
t.p = cfg.Kernel.Platform.NewContext(t.AsyncContext())

cu.Release()
return t, pidFD, nil
return t, nil
}

// assignTIDsLocked ensures that new task t is visible in all PID namespaces in
Expand Down
19 changes: 19 additions & 0 deletions pkg/sentry/mm/io.go
Original file line number Diff line number Diff line change
Expand Up @@ -975,3 +975,22 @@ func truncatedAddrRangeSeq(ars, arsit hostarch.AddrRangeSeq, end hostarch.Addr)
}
return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes() + int64(end-ar.Start))
}

// CheckWritable verifies that the virtual memory range starting at addr of the given
// length is writable in the task's address space. It returns EFAULT if the range
// is invalid or not writable.
//
// Preconditions: length >= 0.
func (mm *MemoryManager) CheckWritable(ctx context.Context, addr hostarch.Addr, length int64) error {
ar, ok := mm.CheckIORange(addr, length)
if !ok {
return linuxerr.EFAULT
}
if length == 0 {
return nil
}
_, err := mm.withInternalMappings(ctx, ar, hostarch.Write, false, func(ims safemem.BlockSeq) (uint64, error) {
return uint64(ar.Length()), nil
})
return err
}
1 change: 1 addition & 0 deletions test/syscalls/linux/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -5055,6 +5055,7 @@ cc_binary(
"//test/util:memory_util",
"//test/util:multiprocess_util",
"//test/util:posix_error",
"//test/util:save_util",
"//test/util:test_main",
"//test/util:test_util",
"//test/util:thread_util",
Expand Down
91 changes: 82 additions & 9 deletions test/syscalls/linux/pidfd_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

#include <atomic>
#include <cerrno>
#include <csignal>
#include <cstdint>
#include <cstring>
#include <memory>
Expand All @@ -50,6 +51,7 @@
#include "test/util/memory_util.h"
#include "test/util/multiprocess_util.h"
#include "test/util/posix_error.h"
#include "test/util/save_util.h"
#include "test/util/test_util.h"
#include "test/util/thread_util.h"

Expand Down Expand Up @@ -320,17 +322,35 @@ TEST(PidfdTest, SendSignalViaPidfdOpenPidfdToDeadChildFails) {
}

TEST(PidfdTest, Clone3InvalidFdAddrFails) {
auto mapping =
ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE));
// Unmapped or PROT_NONE pointer EFAULTs.
{
auto mapping =
ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE));

clone_args ca = {};
ca.flags = CLONE_PIDFD;
ca.exit_signal = SIGCHLD;
ca.pidfd = static_cast<uint64_t>(mapping.addr());

pid_t child = syscall(SYS_clone3, &ca, sizeof(ca));
ASSERT_LT(child, 0);
EXPECT_EQ(errno, EFAULT);
}

clone_args ca = {};
ca.flags = CLONE_PIDFD;
ca.exit_signal = SIGCHLD;
ca.pidfd = static_cast<uint64_t>(mapping.addr());
// Read-only pointer EFAULTs.
{
auto mapping =
ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_READ, MAP_PRIVATE));

clone_args ca = {};
ca.flags = CLONE_PIDFD;
ca.exit_signal = SIGCHLD;
ca.pidfd = static_cast<uint64_t>(mapping.addr());

pid_t child = syscall(SYS_clone3, &ca, sizeof(ca));
ASSERT_LT(child, 0);
EXPECT_EQ(errno, EFAULT);
pid_t child = syscall(SYS_clone3, &ca, sizeof(ca));
ASSERT_LT(child, 0);
EXPECT_EQ(errno, EFAULT);
}
}

TEST(PidfdTest, SendSignalToDeadChildFails) {
Expand Down Expand Up @@ -1212,6 +1232,59 @@ TEST(PidfdTest, FasyncIsUnsupported) {
SyscallFailsWithErrno(want_errno));
}

// This reproduces the race condition where pidfd_send_signal can target
// a cloned task whose struct task is initialized but has not
// yet completed `updateInfoLocked()` (initializing `logPrefix`), triggering
// a nil pointer dereference on the target task's Debugf/Infof logging paths.
TEST(PidfdTest, SendSignalToStartingTaskRace) {
DisableSave ds; // Violent test that hopes to trigger a crash.
std::atomic<int> shared_pidfd{-1};
std::atomic<bool> stop_signal_thread{false};

// These threads will try to race with clone(CLONE_PIDFD)
// to get at the pidfd FD before the corresponding task is
// fully initialized.
std::unique_ptr<ScopedThread> signallers[4];
for (int i = 0; i < 4; ++i) {
signallers[i] =
std::make_unique<ScopedThread>([&shared_pidfd, &stop_signal_thread]() {
while (!stop_signal_thread.load(std::memory_order_relaxed)) {
int fd = shared_pidfd.load(std::memory_order_relaxed);
if (fd != -1) {
syscall(SYS_pidfd_send_signal, fd, SIGUSR1, nullptr, 0);
}
}
});
}

for (int iter = 0; iter < 20; ++iter) {
shared_pidfd.store(-1, std::memory_order_relaxed);

clone_args ca = {};
ca.flags = CLONE_PIDFD;
ca.exit_signal = SIGCHLD;
ca.pidfd = reinterpret_cast<uint64_t>(&shared_pidfd);

pid_t child = syscall(SYS_clone3, &ca, sizeof(ca));
if (child == 0) {
// Child immediately exits.
_exit(0);
}
ASSERT_GT(child, 0);

int status;
EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0),
SyscallSucceedsWithValue(child));

int fd = shared_pidfd.load(std::memory_order_relaxed);
if (fd != -1) {
close(fd);
}
}

stop_signal_thread.store(true, std::memory_order_relaxed);
}

} // namespace
} // namespace testing
} // namespace gvisor
Expand Down
Loading