| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821 |
- package libcontainer
- import (
- "encoding/json"
- "errors"
- "fmt"
- "io"
- "net"
- "os"
- "os/exec"
- "path/filepath"
- "strconv"
- "time"
- "github.com/opencontainers/runc/libcontainer/cgroups"
- "github.com/opencontainers/runc/libcontainer/cgroups/fs2"
- "github.com/opencontainers/runc/libcontainer/configs"
- "github.com/opencontainers/runc/libcontainer/intelrdt"
- "github.com/opencontainers/runc/libcontainer/logs"
- "github.com/opencontainers/runc/libcontainer/system"
- "github.com/opencontainers/runc/libcontainer/utils"
- "github.com/opencontainers/runtime-spec/specs-go"
- "github.com/sirupsen/logrus"
- "golang.org/x/sys/unix"
- )
// parentProcess is the set of operations the container code needs from a
// process it has spawned. Both setnsProcess and initProcess implement it.
type parentProcess interface {
	// pid reports the pid of the running process.
	pid() int

	// start begins execution of the process.
	start() error

	// terminate sends SIGKILL to the process and waits for it to exit.
	terminate() error

	// wait blocks until the process exits and returns its final state.
	wait() (*os.ProcessState, error)

	// startTime reports the start time of the process.
	startTime() (uint64, error)

	// signal delivers sig to the process.
	signal(os.Signal) error

	// externalDescriptors returns the descriptor names previously stored
	// with setExternalDescriptors.
	externalDescriptors() []string

	// setExternalDescriptors stores the descriptor names for later
	// retrieval through externalDescriptors.
	setExternalDescriptors(fds []string)

	// forwardChildLogs starts forwarding the child's log output and returns
	// a channel carrying the result of that forwarding.
	forwardChildLogs() chan error
}
// filePair holds the two ends of a connected pair of files: the end kept by
// this (parent) process and the end handed to the child.
type filePair struct {
	parent, child *os.File
}
- type setnsProcess struct {
- cmd *exec.Cmd
- messageSockPair filePair
- logFilePair filePair
- cgroupPaths map[string]string
- rootlessCgroups bool
- manager cgroups.Manager
- intelRdtPath string
- config *initConfig
- fds []string
- process *Process
- bootstrapData io.Reader
- initProcessPid int
- }
- func (p *setnsProcess) startTime() (uint64, error) {
- stat, err := system.Stat(p.pid())
- return stat.StartTime, err
- }
- func (p *setnsProcess) signal(sig os.Signal) error {
- s, ok := sig.(unix.Signal)
- if !ok {
- return errors.New("os: unsupported signal type")
- }
- return unix.Kill(p.pid(), s)
- }
- func (p *setnsProcess) start() (retErr error) {
- defer p.messageSockPair.parent.Close()
- // get the "before" value of oom kill count
- oom, _ := p.manager.OOMKillCount()
- err := p.cmd.Start()
- // close the write-side of the pipes (controlled by child)
- p.messageSockPair.child.Close()
- p.logFilePair.child.Close()
- if err != nil {
- return fmt.Errorf("error starting setns process: %w", err)
- }
- waitInit := initWaiter(p.messageSockPair.parent)
- defer func() {
- if retErr != nil {
- if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom {
- // Someone in this cgroup was killed, this _might_ be us.
- retErr = fmt.Errorf("%w (possibly OOM-killed)", retErr)
- }
- werr := <-waitInit
- if werr != nil {
- logrus.WithError(werr).Warn()
- }
- err := ignoreTerminateErrors(p.terminate())
- if err != nil {
- logrus.WithError(err).Warn("unable to terminate setnsProcess")
- }
- }
- }()
- if p.bootstrapData != nil {
- if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
- return fmt.Errorf("error copying bootstrap data to pipe: %w", err)
- }
- }
- err = <-waitInit
- if err != nil {
- return err
- }
- if err := p.execSetns(); err != nil {
- return fmt.Errorf("error executing setns process: %w", err)
- }
- for _, path := range p.cgroupPaths {
- if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups {
- // On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY.
- // https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
- // Try to join the cgroup of InitProcessPid.
- if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 {
- initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
- initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
- if initCgErr == nil {
- if initCgPath, ok := initCg[""]; ok {
- initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
- logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)",
- p.pid(), p.cgroupPaths, err, initCg, initCgDirpath)
- // NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container.
- err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
- }
- }
- }
- if err != nil {
- return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
- }
- }
- }
- if p.intelRdtPath != "" {
- // if Intel RDT "resource control" filesystem path exists
- _, err := os.Stat(p.intelRdtPath)
- if err == nil {
- if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
- return fmt.Errorf("error adding pid %d to Intel RDT: %w", p.pid(), err)
- }
- }
- }
- // set rlimits, this has to be done here because we lose permissions
- // to raise the limits once we enter a user-namespace
- if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
- return fmt.Errorf("error setting rlimits for process: %w", err)
- }
- if err := utils.WriteJSON(p.messageSockPair.parent, p.config); err != nil {
- return fmt.Errorf("error writing config to pipe: %w", err)
- }
- ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
- switch sync.Type {
- case procReady:
- // This shouldn't happen.
- panic("unexpected procReady in setns")
- case procHooks:
- // This shouldn't happen.
- panic("unexpected procHooks in setns")
- case procSeccomp:
- if p.config.Config.Seccomp.ListenerPath == "" {
- return errors.New("listenerPath is not set")
- }
- seccompFd, err := recvSeccompFd(uintptr(p.pid()), uintptr(sync.Fd))
- if err != nil {
- return err
- }
- defer unix.Close(seccompFd)
- bundle, annotations := utils.Annotations(p.config.Config.Labels)
- containerProcessState := &specs.ContainerProcessState{
- Version: specs.Version,
- Fds: []string{specs.SeccompFdName},
- Pid: p.cmd.Process.Pid,
- Metadata: p.config.Config.Seccomp.ListenerMetadata,
- State: specs.State{
- Version: specs.Version,
- ID: p.config.ContainerId,
- Status: specs.StateRunning,
- Pid: p.initProcessPid,
- Bundle: bundle,
- Annotations: annotations,
- },
- }
- if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
- containerProcessState, seccompFd); err != nil {
- return err
- }
- // Sync with child.
- if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
- return err
- }
- return nil
- default:
- return errors.New("invalid JSON payload from child")
- }
- })
- if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
- return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err}
- }
- // Must be done after Shutdown so the child will exit and we can wait for it.
- if ierr != nil {
- _, _ = p.wait()
- return ierr
- }
- return nil
- }
- // execSetns runs the process that executes C code to perform the setns calls
- // because setns support requires the C process to fork off a child and perform the setns
- // before the go runtime boots, we wait on the process to die and receive the child's pid
- // over the provided pipe.
- func (p *setnsProcess) execSetns() error {
- status, err := p.cmd.Process.Wait()
- if err != nil {
- _ = p.cmd.Wait()
- return fmt.Errorf("error waiting on setns process to finish: %w", err)
- }
- if !status.Success() {
- _ = p.cmd.Wait()
- return &exec.ExitError{ProcessState: status}
- }
- var pid *pid
- if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
- _ = p.cmd.Wait()
- return fmt.Errorf("error reading pid from init pipe: %w", err)
- }
- // Clean up the zombie parent process
- // On Unix systems FindProcess always succeeds.
- firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
- // Ignore the error in case the child has already been reaped for any reason
- _, _ = firstChildProcess.Wait()
- process, err := os.FindProcess(pid.Pid)
- if err != nil {
- return err
- }
- p.cmd.Process = process
- p.process.ops = p
- return nil
- }
- // terminate sends a SIGKILL to the forked process for the setns routine then waits to
- // avoid the process becoming a zombie.
- func (p *setnsProcess) terminate() error {
- if p.cmd.Process == nil {
- return nil
- }
- err := p.cmd.Process.Kill()
- if _, werr := p.wait(); err == nil {
- err = werr
- }
- return err
- }
- func (p *setnsProcess) wait() (*os.ProcessState, error) {
- err := p.cmd.Wait()
- // Return actual ProcessState even on Wait error
- return p.cmd.ProcessState, err
- }
- func (p *setnsProcess) pid() int {
- return p.cmd.Process.Pid
- }
- func (p *setnsProcess) externalDescriptors() []string {
- return p.fds
- }
- func (p *setnsProcess) setExternalDescriptors(newFds []string) {
- p.fds = newFds
- }
- func (p *setnsProcess) forwardChildLogs() chan error {
- return logs.ForwardLogs(p.logFilePair.parent)
- }
- type initProcess struct {
- cmd *exec.Cmd
- messageSockPair filePair
- logFilePair filePair
- config *initConfig
- manager cgroups.Manager
- intelRdtManager intelrdt.Manager
- container *linuxContainer
- fds []string
- process *Process
- bootstrapData io.Reader
- sharePidns bool
- }
- func (p *initProcess) pid() int {
- return p.cmd.Process.Pid
- }
- func (p *initProcess) externalDescriptors() []string {
- return p.fds
- }
- // getChildPid receives the final child's pid over the provided pipe.
- func (p *initProcess) getChildPid() (int, error) {
- var pid pid
- if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
- _ = p.cmd.Wait()
- return -1, err
- }
- // Clean up the zombie parent process
- // On Unix systems FindProcess always succeeds.
- firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
- // Ignore the error in case the child has already been reaped for any reason
- _, _ = firstChildProcess.Wait()
- return pid.Pid, nil
- }
- func (p *initProcess) waitForChildExit(childPid int) error {
- status, err := p.cmd.Process.Wait()
- if err != nil {
- _ = p.cmd.Wait()
- return err
- }
- if !status.Success() {
- _ = p.cmd.Wait()
- return &exec.ExitError{ProcessState: status}
- }
- process, err := os.FindProcess(childPid)
- if err != nil {
- return err
- }
- p.cmd.Process = process
- p.process.ops = p
- return nil
- }
- func (p *initProcess) start() (retErr error) {
- defer p.messageSockPair.parent.Close() //nolint: errcheck
- err := p.cmd.Start()
- p.process.ops = p
- // close the write-side of the pipes (controlled by child)
- _ = p.messageSockPair.child.Close()
- _ = p.logFilePair.child.Close()
- if err != nil {
- p.process.ops = nil
- return fmt.Errorf("unable to start init: %w", err)
- }
- waitInit := initWaiter(p.messageSockPair.parent)
- defer func() {
- if retErr != nil {
- // Find out if init is killed by the kernel's OOM killer.
- // Get the count before killing init as otherwise cgroup
- // might be removed by systemd.
- oom, err := p.manager.OOMKillCount()
- if err != nil {
- logrus.WithError(err).Warn("unable to get oom kill count")
- } else if oom > 0 {
- // Does not matter what the particular error was,
- // its cause is most probably OOM, so report that.
- const oomError = "container init was OOM-killed (memory limit too low?)"
- if logrus.GetLevel() >= logrus.DebugLevel {
- // Only show the original error if debug is set,
- // as it is not generally very useful.
- retErr = fmt.Errorf(oomError+": %w", retErr)
- } else {
- retErr = errors.New(oomError)
- }
- }
- werr := <-waitInit
- if werr != nil {
- logrus.WithError(werr).Warn()
- }
- // Terminate the process to ensure we can remove cgroups.
- if err := ignoreTerminateErrors(p.terminate()); err != nil {
- logrus.WithError(err).Warn("unable to terminate initProcess")
- }
- _ = p.manager.Destroy()
- if p.intelRdtManager != nil {
- _ = p.intelRdtManager.Destroy()
- }
- }
- }()
- // Do this before syncing with child so that no children can escape the
- // cgroup. We don't need to worry about not doing this and not being root
- // because we'd be using the rootless cgroup manager in that case.
- if err := p.manager.Apply(p.pid()); err != nil {
- return fmt.Errorf("unable to apply cgroup configuration: %w", err)
- }
- if p.intelRdtManager != nil {
- if err := p.intelRdtManager.Apply(p.pid()); err != nil {
- return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)
- }
- }
- if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
- return fmt.Errorf("can't copy bootstrap data to pipe: %w", err)
- }
- err = <-waitInit
- if err != nil {
- return err
- }
- childPid, err := p.getChildPid()
- if err != nil {
- return fmt.Errorf("can't get final child's PID from pipe: %w", err)
- }
- // Save the standard descriptor names before the container process
- // can potentially move them (e.g., via dup2()). If we don't do this now,
- // we won't know at checkpoint time which file descriptor to look up.
- fds, err := getPipeFds(childPid)
- if err != nil {
- return fmt.Errorf("error getting pipe fds for pid %d: %w", childPid, err)
- }
- p.setExternalDescriptors(fds)
- // Wait for our first child to exit
- if err := p.waitForChildExit(childPid); err != nil {
- return fmt.Errorf("error waiting for our first child to exit: %w", err)
- }
- if err := p.createNetworkInterfaces(); err != nil {
- return fmt.Errorf("error creating network interfaces: %w", err)
- }
- if err := p.updateSpecState(); err != nil {
- return fmt.Errorf("error updating spec state: %w", err)
- }
- if err := p.sendConfig(); err != nil {
- return fmt.Errorf("error sending config to init process: %w", err)
- }
- var (
- sentRun bool
- sentResume bool
- )
- ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
- switch sync.Type {
- case procSeccomp:
- if p.config.Config.Seccomp.ListenerPath == "" {
- return errors.New("listenerPath is not set")
- }
- seccompFd, err := recvSeccompFd(uintptr(childPid), uintptr(sync.Fd))
- if err != nil {
- return err
- }
- defer unix.Close(seccompFd)
- s, err := p.container.currentOCIState()
- if err != nil {
- return err
- }
- // initProcessStartTime hasn't been set yet.
- s.Pid = p.cmd.Process.Pid
- s.Status = specs.StateCreating
- containerProcessState := &specs.ContainerProcessState{
- Version: specs.Version,
- Fds: []string{specs.SeccompFdName},
- Pid: s.Pid,
- Metadata: p.config.Config.Seccomp.ListenerMetadata,
- State: *s,
- }
- if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
- containerProcessState, seccompFd); err != nil {
- return err
- }
- // Sync with child.
- if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
- return err
- }
- case procReady:
- // set rlimits, this has to be done here because we lose permissions
- // to raise the limits once we enter a user-namespace
- if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
- return fmt.Errorf("error setting rlimits for ready process: %w", err)
- }
- // call prestart and CreateRuntime hooks
- if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
- // Setup cgroup before the hook, so that the prestart and CreateRuntime hook could apply cgroup permissions.
- if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
- return fmt.Errorf("error setting cgroup config for ready process: %w", err)
- }
- if p.intelRdtManager != nil {
- if err := p.intelRdtManager.Set(p.config.Config); err != nil {
- return fmt.Errorf("error setting Intel RDT config for ready process: %w", err)
- }
- }
- if len(p.config.Config.Hooks) != 0 {
- s, err := p.container.currentOCIState()
- if err != nil {
- return err
- }
- // initProcessStartTime hasn't been set yet.
- s.Pid = p.cmd.Process.Pid
- s.Status = specs.StateCreating
- hooks := p.config.Config.Hooks
- if err := hooks[configs.Prestart].RunHooks(s); err != nil {
- return err
- }
- if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
- return err
- }
- }
- }
- // generate a timestamp indicating when the container was started
- p.container.created = time.Now().UTC()
- p.container.state = &createdState{
- c: p.container,
- }
- // NOTE: If the procRun state has been synced and the
- // runc-create process has been killed for some reason,
- // the runc-init[2:stage] process will be leaky. And
- // the runc command also fails to parse root directory
- // because the container doesn't have state.json.
- //
- // In order to cleanup the runc-init[2:stage] by
- // runc-delete/stop, we should store the status before
- // procRun sync.
- state, uerr := p.container.updateState(p)
- if uerr != nil {
- return fmt.Errorf("unable to store init state: %w", err)
- }
- p.container.initProcessStartTime = state.InitProcessStartTime
- // Sync with child.
- if err := writeSync(p.messageSockPair.parent, procRun); err != nil {
- return err
- }
- sentRun = true
- case procHooks:
- // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
- if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
- return fmt.Errorf("error setting cgroup config for procHooks process: %w", err)
- }
- if p.intelRdtManager != nil {
- if err := p.intelRdtManager.Set(p.config.Config); err != nil {
- return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err)
- }
- }
- if len(p.config.Config.Hooks) != 0 {
- s, err := p.container.currentOCIState()
- if err != nil {
- return err
- }
- // initProcessStartTime hasn't been set yet.
- s.Pid = p.cmd.Process.Pid
- s.Status = specs.StateCreating
- hooks := p.config.Config.Hooks
- if err := hooks[configs.Prestart].RunHooks(s); err != nil {
- return err
- }
- if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
- return err
- }
- }
- // Sync with child.
- if err := writeSync(p.messageSockPair.parent, procResume); err != nil {
- return err
- }
- sentResume = true
- default:
- return errors.New("invalid JSON payload from child")
- }
- return nil
- })
- if !sentRun {
- return fmt.Errorf("error during container init: %w", ierr)
- }
- if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
- return errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process")
- }
- if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
- return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err}
- }
- // Must be done after Shutdown so the child will exit and we can wait for it.
- if ierr != nil {
- _, _ = p.wait()
- return ierr
- }
- return nil
- }
- func (p *initProcess) wait() (*os.ProcessState, error) {
- err := p.cmd.Wait()
- // we should kill all processes in cgroup when init is died if we use host PID namespace
- if p.sharePidns {
- _ = signalAllProcesses(p.manager, unix.SIGKILL)
- }
- return p.cmd.ProcessState, err
- }
- func (p *initProcess) terminate() error {
- if p.cmd.Process == nil {
- return nil
- }
- err := p.cmd.Process.Kill()
- if _, werr := p.wait(); err == nil {
- err = werr
- }
- return err
- }
- func (p *initProcess) startTime() (uint64, error) {
- stat, err := system.Stat(p.pid())
- return stat.StartTime, err
- }
- func (p *initProcess) updateSpecState() error {
- s, err := p.container.currentOCIState()
- if err != nil {
- return err
- }
- p.config.SpecState = s
- return nil
- }
- func (p *initProcess) sendConfig() error {
- // send the config to the container's init process, we don't use JSON Encode
- // here because there might be a problem in JSON decoder in some cases, see:
- // https://github.com/docker/docker/issues/14203#issuecomment-174177790
- return utils.WriteJSON(p.messageSockPair.parent, p.config)
- }
- func (p *initProcess) createNetworkInterfaces() error {
- for _, config := range p.config.Config.Networks {
- strategy, err := getStrategy(config.Type)
- if err != nil {
- return err
- }
- n := &network{
- Network: *config,
- }
- if err := strategy.create(n, p.pid()); err != nil {
- return err
- }
- p.config.Networks = append(p.config.Networks, n)
- }
- return nil
- }
- func (p *initProcess) signal(sig os.Signal) error {
- s, ok := sig.(unix.Signal)
- if !ok {
- return errors.New("os: unsupported signal type")
- }
- return unix.Kill(p.pid(), s)
- }
- func (p *initProcess) setExternalDescriptors(newFds []string) {
- p.fds = newFds
- }
- func (p *initProcess) forwardChildLogs() chan error {
- return logs.ForwardLogs(p.logFilePair.parent)
- }
- func recvSeccompFd(childPid, childFd uintptr) (int, error) {
- pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, childPid, 0, 0)
- if errno != 0 {
- return -1, fmt.Errorf("performing SYS_PIDFD_OPEN syscall: %w", errno)
- }
- defer unix.Close(int(pidfd))
- seccompFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, childFd, 0)
- if errno != 0 {
- return -1, fmt.Errorf("performing SYS_PIDFD_GETFD syscall: %w", errno)
- }
- return int(seccompFd), nil
- }
- func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, fd int) error {
- conn, err := net.Dial("unix", listenerPath)
- if err != nil {
- return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err)
- }
- socket, err := conn.(*net.UnixConn).File()
- if err != nil {
- return fmt.Errorf("cannot get seccomp socket: %w", err)
- }
- defer socket.Close()
- b, err := json.Marshal(state)
- if err != nil {
- return fmt.Errorf("cannot marshall seccomp state: %w", err)
- }
- err = utils.SendFds(socket, b, fd)
- if err != nil {
- return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err)
- }
- return nil
- }
// getPipeFds resolves the symlinks /proc/<pid>/fd/{0,1,2} and returns their
// targets in a three-element slice indexed by descriptor number.
//
// A permission error for a descriptor leaves its entry empty and processing
// continues (this happens for rootless containers and other non-dumpable
// processes). Any other error stops processing and is returned together with
// the entries filled in so far.
func getPipeFds(pid int) ([]string, error) {
	const stdioCount = 3

	fdDir := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
	targets := make([]string, stdioCount)
	for fd := range targets {
		// XXX: This breaks if the path is not a valid symlink (which can
		// happen in certain particularly unlucky mount namespace setups).
		link, err := os.Readlink(filepath.Join(fdDir, strconv.Itoa(fd)))
		switch {
		case err == nil:
			targets[fd] = link
		case os.IsPermission(err):
			// Nothing more can be done for this descriptor; leave it empty.
		default:
			return targets, err
		}
	}
	return targets, nil
}
- // InitializeIO creates pipes for use with the process's stdio and returns the
- // opposite side for each. Do not use this if you want to have a pseudoterminal
- // set up for you by libcontainer (TODO: fix that too).
- // TODO: This is mostly unnecessary, and should be handled by clients.
- func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
- var fds []uintptr
- i = &IO{}
- // cleanup in case of an error
- defer func() {
- if err != nil {
- for _, fd := range fds {
- _ = unix.Close(int(fd))
- }
- }
- }()
- // STDIN
- r, w, err := os.Pipe()
- if err != nil {
- return nil, err
- }
- fds = append(fds, r.Fd(), w.Fd())
- p.Stdin, i.Stdin = r, w
- // STDOUT
- if r, w, err = os.Pipe(); err != nil {
- return nil, err
- }
- fds = append(fds, r.Fd(), w.Fd())
- p.Stdout, i.Stdout = w, r
- // STDERR
- if r, w, err = os.Pipe(); err != nil {
- return nil, err
- }
- fds = append(fds, r.Fd(), w.Fd())
- p.Stderr, i.Stderr = w, r
- // change ownership of the pipes in case we are in a user namespace
- for _, fd := range fds {
- if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
- return nil, &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
- }
- }
- return i, nil
- }
// initWaiter returns a channel to wait on for making sure runc init has
// finished the initial setup.
//
// A goroutine reads a single byte from r. If that byte is read successfully
// and is zero, nil is sent on the channel. Otherwise an error prefixed with
// "waiting for init preliminary setup" is sent. The channel has room for the
// one result and is closed afterwards, so later receives yield nil.
func initWaiter(r io.Reader) chan error {
	done := make(chan error, 1)
	go func() {
		defer close(done)

		var buf [1]byte
		n, err := r.Read(buf[:])
		switch {
		case err != nil:
			// Read failed; report it below.
		case n < 1:
			err = errors.New("short read")
		case buf[0] != 0:
			err = fmt.Errorf("unexpected %d != 0", buf[0])
		default:
			done <- nil
			return
		}
		done <- fmt.Errorf("waiting for init preliminary setup: %w", err)
	}()
	return done
}
|