diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 7f589ee8dd..f6d254b705 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -47,10 +48,12 @@ func failCloneAfterTaskCreation(nt *Task) { // for group signal delivery, had children reparented to it, etc. // Thus we can't just drop it on the floor. Instead, instruct the // task goroutine to exit immediately, as quietly as possible. + nt.k.tasks.mu.Lock() nt.exitTracerNotified = true nt.exitTracerAcked = true nt.exitParentNotified = true nt.exitParentAcked = true + nt.k.tasks.mu.Unlock() nt.runState = (*runExitMain)(nil) } @@ -325,15 +328,10 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { cfg.InheritParent = t } if args.Flags&linux.CLONE_PIDFD != 0 { - if args.Flags&linux.CLONE_THREAD != 0 { - cfg.pidFDMode = makeThreadedPIDFD - } else { - cfg.pidFDMode = makePIDFD - } - cfg.pidFDAddr = hostarch.Addr(args.Pidfd) + cfg.makePIDFD = true } - nt, pidFD, err := t.tg.pidns.owner.cloneNewTask(t, cfg) - // If cloneNewTask succeeds, we transfer references to nt. If cloneNewTask fails, it does + nt, err := t.tg.pidns.owner.NewTask(t, cfg) + // If NewTask succeeds, we transfer references to nt. If cloneNewTask fails, it does // the cleanup for us. cu.Release() if err != nil { @@ -357,6 +355,31 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { tid := nt.k.tasks.Root.IDOfTask(nt) defer nt.Start(tid) + var pidFD int32 = -1 + if args.Flags&linux.CLONE_PIDFD != 0 { + isThread := args.Flags&linux.CLONE_THREAD != 0 + vfsfd, err := t.pidFDOpen(nt.pid, isThread, false /*nonblock*/) + if err != nil { + failCloneAfterTaskCreation(nt) + return 0, nil, err + } + defer vfsfd.DecRef(t) + + fd, err := t.NewFDFrom(0, vfsfd, FDFlags{CloseOnExec: true}) + if err != nil { + failCloneAfterTaskCreation(nt) + return 0, nil, err + } + pidFD = fd + if _, err := primitive.CopyInt32Out(t, hostarch.Addr(args.Pidfd), fd); err != nil { + failCloneAfterTaskCreation(nt) + if oldFile := t.FDTable().Remove(t, fd); oldFile != nil { + oldFile.DecRef(t) + } + return 0, nil, err + } + } + if seccheck.Global.Enabled(seccheck.PointClone) { mask, info := getCloneSeccheckInfo(t, nt, args.Flags) if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error { diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index a630296e16..73ce8e3555 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" - "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" @@ -33,13 +32,6 @@ import ( ) // pidFDMode is an enum that clone3 uses to support CLONE_PIDFD. -type pidFDMode int - -const ( - dontMakePIDFD pidFDMode = iota - makePIDFD - makeThreadedPIDFD -) // TaskConfig defines the configuration of a new Task (see below). type TaskConfig struct { @@ -126,9 +118,8 @@ type TaskConfig struct { // Origin indicates the origins of the new task. Origin TaskOrigin - // Used by clone3 to support CLONE_PIDFD. - pidFDMode pidFDMode - pidFDAddr hostarch.Addr + // used by clone(CLONE_PIDFD) + makePIDFD bool } // NewTask creates a new task defined by cfg. @@ -138,13 +129,6 @@ type TaskConfig struct { // If successful, NewTask transfers references held by cfg to the new task. // Otherwise, NewTask releases them. func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { - t, _, err := ts.cloneNewTask(ctx, cfg) - return t, err -} - -// cloneNewTask is a version of NewTask that the clone() syscall uses. -// Unlike NewTask, cloneNewTask will create and return a pidFD if requested. -func (ts *TaskSet) cloneNewTask(ctx context.Context, cfg *TaskConfig) (*Task, int32, error) { var err error cleanup := func() { cfg.TaskImage.release(ctx) @@ -160,20 +144,20 @@ func (ts *TaskSet) cloneNewTask(ctx context.Context, cfg *TaskConfig) (*Task, in } if err := cfg.UserCounters.incRLimitNProc(ctx); err != nil { cleanup() - return nil, -1, err + return nil, err } - t, fd, err := ts.newTask(ctx, cfg) + t, err := ts.newTask(ctx, cfg) if err != nil { cfg.UserCounters.decRLimitNProc() cleanup() - return nil, -1, err + return nil, err } - return t, fd, nil + return t, nil } // newTask is a helper for TaskSet.NewTask that only takes ownership of parts // of cfg if it succeeds. -func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, int32, error) { +func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { srcT := TaskFromContext(ctx) tg := cfg.ThreadGroup image := cfg.TaskImage @@ -220,33 +204,9 @@ func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, int32, var cu cleanup.Cleanup defer cu.Clean() - pidFD := int32(-1) - - if cfg.pidFDMode != dontMakePIDFD { - isThread := cfg.pidFDMode == makeThreadedPIDFD + if cfg.makePIDFD { t.pid = &pid{} t.pid.t.Store(t) - - vfsfd, err := srcT.pidFDOpen(t.pid, isThread, false /*nonblock*/) - if err != nil { - return nil, -1, err - } - defer vfsfd.DecRef(t) - - fd, err := srcT.NewFDFrom(0, vfsfd, FDFlags{CloseOnExec: true}) - if err != nil { - return nil, -1, err - } - pidFD = fd - cu.Add(func() { - if oldFile := srcT.FDTable().Remove(ctx, fd); oldFile != nil { - oldFile.DecRef(t) - } - }) - - if _, err := primitive.CopyUint32Out(srcT, cfg.pidFDAddr, uint32(fd)); err != nil { - return nil, -1, err - } } var ( @@ -264,7 +224,7 @@ func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, int32, if srcT != nil { var err error if charged, cg, err = srcT.ChargeFor(t, CgroupControllerPIDs, CgroupResourcePID, 1); err != nil { - return nil, -1, err + return nil, err } if charged { defer func() { @@ -302,16 +262,16 @@ func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, int32, // doesn't matter too much since the caller will exit before it returns // to userspace. If the caller isn't in the same thread group, then // we're in uncharted territory and can return whatever we want. - return nil, -1, linuxerr.EINTR + return nil, linuxerr.EINTR } if ts.liveTasks == 0 && ts.noNewTasksIfZeroLive { // Since liveTasks == 0, our caller cannot be a task goroutine invoking // a syscall, so it's safe to return a non-errno error that is more // explanatory. - return nil, -1, fmt.Errorf("task creation disabled after Kernel.WaitExited() may have returned") + return nil, fmt.Errorf("task creation disabled after Kernel.WaitExited() may have returned") } if err := ts.assignTIDsLocked(t); err != nil { - return nil, -1, err + return nil, err } // Below this point, newTask is expected not to fail (there is no rollback // of assignTIDsLocked or any of the following). @@ -375,7 +335,7 @@ func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, int32, t.p = cfg.Kernel.Platform.NewContext(t.AsyncContext()) cu.Release() - return t, pidFD, nil + return t, nil } // assignTIDsLocked ensures that new task t is visible in all PID namespaces in diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 488ed5721d..dd487a7b3d 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -5055,6 +5055,7 @@ cc_binary( "//test/util:memory_util", "//test/util:multiprocess_util", "//test/util:posix_error", + "//test/util:save_util", "//test/util:test_main", "//test/util:test_util", "//test/util:thread_util", diff --git a/test/syscalls/linux/pidfd_test.cc b/test/syscalls/linux/pidfd_test.cc index a9d695d887..4e25ab7208 100644 --- a/test/syscalls/linux/pidfd_test.cc +++ b/test/syscalls/linux/pidfd_test.cc @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -50,6 +51,7 @@ #include "test/util/memory_util.h" #include "test/util/multiprocess_util.h" #include "test/util/posix_error.h" +#include "test/util/save_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -320,17 +322,35 @@ TEST(PidfdTest, SendSignalViaPidfdOpenPidfdToDeadChildFails) { } TEST(PidfdTest, Clone3InvalidFdAddrFails) { - auto mapping = - ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE)); + // Unmapped or PROT_NONE pointer EFAULTs. + { + auto mapping = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE)); + + clone_args ca = {}; + ca.flags = CLONE_PIDFD; + ca.exit_signal = SIGCHLD; + ca.pidfd = static_cast(mapping.addr()); + + pid_t child = syscall(SYS_clone3, &ca, sizeof(ca)); + ASSERT_LT(child, 0); + EXPECT_EQ(errno, EFAULT); + } - clone_args ca = {}; - ca.flags = CLONE_PIDFD; - ca.exit_signal = SIGCHLD; - ca.pidfd = static_cast(mapping.addr()); + // Read-only pointer EFAULTs. + { + auto mapping = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_READ, MAP_PRIVATE)); + + clone_args ca = {}; + ca.flags = CLONE_PIDFD; + ca.exit_signal = SIGCHLD; + ca.pidfd = static_cast(mapping.addr()); - pid_t child = syscall(SYS_clone3, &ca, sizeof(ca)); - ASSERT_LT(child, 0); - EXPECT_EQ(errno, EFAULT); + pid_t child = syscall(SYS_clone3, &ca, sizeof(ca)); + ASSERT_LT(child, 0); + EXPECT_EQ(errno, EFAULT); + } } TEST(PidfdTest, SendSignalToDeadChildFails) { @@ -1212,6 +1232,59 @@ TEST(PidfdTest, FasyncIsUnsupported) { SyscallFailsWithErrno(want_errno)); } +// This reproduces the race condition where pidfd_send_signal can target +// a cloned task whose struct task is initialized but has not +// yet completed `updateInfoLocked()` (initializing `logPrefix`), triggering +// a nil pointer dereference on the target task's Debugf/Infof logging paths. +TEST(PidfdTest, SendSignalToStartingTaskRace) { + DisableSave ds; // Violent test that hopes to trigger a crash. + std::atomic shared_pidfd{-1}; + std::atomic stop_signal_thread{false}; + + // These threads will try to race with clone(CLONE_PIDFD) + // to get at the pidfd FD before the corresponding task is + // fully initialized. + std::unique_ptr signallers[4]; + for (int i = 0; i < 4; ++i) { + signallers[i] = + std::make_unique([&shared_pidfd, &stop_signal_thread]() { + while (!stop_signal_thread.load(std::memory_order_relaxed)) { + int fd = shared_pidfd.load(std::memory_order_relaxed); + if (fd != -1) { + syscall(SYS_pidfd_send_signal, fd, SIGUSR1, nullptr, 0); + } + } + }); + } + + for (int iter = 0; iter < 20; ++iter) { + shared_pidfd.store(-1, std::memory_order_relaxed); + + clone_args ca = {}; + ca.flags = CLONE_PIDFD; + ca.exit_signal = SIGCHLD; + ca.pidfd = reinterpret_cast(&shared_pidfd); + + pid_t child = syscall(SYS_clone3, &ca, sizeof(ca)); + if (child == 0) { + // Child immediately exits. + _exit(0); + } + ASSERT_GT(child, 0); + + int status; + EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), + SyscallSucceedsWithValue(child)); + + int fd = shared_pidfd.load(std::memory_order_relaxed); + if (fd != -1) { + close(fd); + } + } + + stop_signal_thread.store(true, std::memory_order_relaxed); +} + } // namespace } // namespace testing } // namespace gvisor