standard_init_linux.go 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. package libcontainer
  2. import (
  3. "errors"
  4. "fmt"
  5. "os"
  6. "os/exec"
  7. "strconv"
  8. "github.com/opencontainers/runtime-spec/specs-go"
  9. "github.com/opencontainers/selinux/go-selinux"
  10. "github.com/sirupsen/logrus"
  11. "golang.org/x/sys/unix"
  12. "github.com/opencontainers/runc/libcontainer/apparmor"
  13. "github.com/opencontainers/runc/libcontainer/configs"
  14. "github.com/opencontainers/runc/libcontainer/keys"
  15. "github.com/opencontainers/runc/libcontainer/seccomp"
  16. "github.com/opencontainers/runc/libcontainer/system"
  17. )
  18. type linuxStandardInit struct {
  19. pipe *os.File
  20. consoleSocket *os.File
  21. parentPid int
  22. fifoFd int
  23. logFd int
  24. mountFds []int
  25. config *initConfig
  26. }
  27. func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
  28. var newperms uint32
  29. if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
  30. // With user ns we need 'other' search permissions.
  31. newperms = 0x8
  32. } else {
  33. // Without user ns we need 'UID' search permissions.
  34. newperms = 0x80000
  35. }
  36. // Create a unique per session container name that we can join in setns;
  37. // However, other containers can also join it.
  38. return "_ses." + l.config.ContainerId, 0xffffffff, newperms
  39. }
  40. func (l *linuxStandardInit) Init() error {
  41. if !l.config.Config.NoNewKeyring {
  42. if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
  43. return err
  44. }
  45. defer selinux.SetKeyLabel("") //nolint: errcheck
  46. ringname, keepperms, newperms := l.getSessionRingParams()
  47. // Do not inherit the parent's session keyring.
  48. if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
  49. // If keyrings aren't supported then it is likely we are on an
  50. // older kernel (or inside an LXC container). While we could bail,
  51. // the security feature we are using here is best-effort (it only
  52. // really provides marginal protection since VFS credentials are
  53. // the only significant protection of keyrings).
  54. //
  55. // TODO(cyphar): Log this so people know what's going on, once we
  56. // have proper logging in 'runc init'.
  57. if !errors.Is(err, unix.ENOSYS) {
  58. return fmt.Errorf("unable to join session keyring: %w", err)
  59. }
  60. } else {
  61. // Make session keyring searchable. If we've gotten this far we
  62. // bail on any error -- we don't want to have a keyring with bad
  63. // permissions.
  64. if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
  65. return fmt.Errorf("unable to mod keyring permissions: %w", err)
  66. }
  67. }
  68. }
  69. if err := setupNetwork(l.config); err != nil {
  70. return err
  71. }
  72. if err := setupRoute(l.config.Config); err != nil {
  73. return err
  74. }
  75. // initialises the labeling system
  76. selinux.GetEnabled()
  77. // We don't need the mountFds after prepareRootfs() nor if it fails.
  78. err := prepareRootfs(l.pipe, l.config, l.mountFds)
  79. for _, m := range l.mountFds {
  80. if m == -1 {
  81. continue
  82. }
  83. if err := unix.Close(m); err != nil {
  84. return fmt.Errorf("Unable to close mountFds fds: %w", err)
  85. }
  86. }
  87. if err != nil {
  88. return err
  89. }
  90. // Set up the console. This has to be done *before* we finalize the rootfs,
  91. // but *after* we've given the user the chance to set up all of the mounts
  92. // they wanted.
  93. if l.config.CreateConsole {
  94. if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
  95. return err
  96. }
  97. if err := system.Setctty(); err != nil {
  98. return &os.SyscallError{Syscall: "ioctl(setctty)", Err: err}
  99. }
  100. }
  101. // Finish the rootfs setup.
  102. if l.config.Config.Namespaces.Contains(configs.NEWNS) {
  103. if err := finalizeRootfs(l.config.Config); err != nil {
  104. return err
  105. }
  106. }
  107. if hostname := l.config.Config.Hostname; hostname != "" {
  108. if err := unix.Sethostname([]byte(hostname)); err != nil {
  109. return &os.SyscallError{Syscall: "sethostname", Err: err}
  110. }
  111. }
  112. if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
  113. return fmt.Errorf("unable to apply apparmor profile: %w", err)
  114. }
  115. for key, value := range l.config.Config.Sysctl {
  116. if err := writeSystemProperty(key, value); err != nil {
  117. return err
  118. }
  119. }
  120. for _, path := range l.config.Config.ReadonlyPaths {
  121. if err := readonlyPath(path); err != nil {
  122. return fmt.Errorf("can't make %q read-only: %w", path, err)
  123. }
  124. }
  125. for _, path := range l.config.Config.MaskPaths {
  126. if err := maskPath(path, l.config.Config.MountLabel); err != nil {
  127. return fmt.Errorf("can't mask path %s: %w", path, err)
  128. }
  129. }
  130. pdeath, err := system.GetParentDeathSignal()
  131. if err != nil {
  132. return fmt.Errorf("can't get pdeath signal: %w", err)
  133. }
  134. if l.config.NoNewPrivileges {
  135. if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
  136. return &os.SyscallError{Syscall: "prctl(SET_NO_NEW_PRIVS)", Err: err}
  137. }
  138. }
  139. // Tell our parent that we're ready to Execv. This must be done before the
  140. // Seccomp rules have been applied, because we need to be able to read and
  141. // write to a socket.
  142. if err := syncParentReady(l.pipe); err != nil {
  143. return fmt.Errorf("sync ready: %w", err)
  144. }
  145. if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
  146. return fmt.Errorf("can't set process label: %w", err)
  147. }
  148. defer selinux.SetExecLabel("") //nolint: errcheck
  149. // Without NoNewPrivileges seccomp is a privileged operation, so we need to
  150. // do this before dropping capabilities; otherwise do it as late as possible
  151. // just before execve so as few syscalls take place after it as possible.
  152. if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
  153. seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
  154. if err != nil {
  155. return err
  156. }
  157. if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
  158. return err
  159. }
  160. }
  161. if err := finalizeNamespace(l.config); err != nil {
  162. return err
  163. }
  164. // finalizeNamespace can change user/group which clears the parent death
  165. // signal, so we restore it here.
  166. if err := pdeath.Restore(); err != nil {
  167. return fmt.Errorf("can't restore pdeath signal: %w", err)
  168. }
  169. // Compare the parent from the initial start of the init process and make
  170. // sure that it did not change. if the parent changes that means it died
  171. // and we were reparented to something else so we should just kill ourself
  172. // and not cause problems for someone else.
  173. if unix.Getppid() != l.parentPid {
  174. return unix.Kill(unix.Getpid(), unix.SIGKILL)
  175. }
  176. // Check for the arg before waiting to make sure it exists and it is
  177. // returned as a create time error.
  178. name, err := exec.LookPath(l.config.Args[0])
  179. if err != nil {
  180. return err
  181. }
  182. // exec.LookPath might return no error for an executable residing on a
  183. // file system mounted with noexec flag, so perform this extra check
  184. // now while we can still return a proper error.
  185. if err := system.Eaccess(name); err != nil {
  186. return &os.PathError{Op: "exec", Path: name, Err: err}
  187. }
  188. // Set seccomp as close to execve as possible, so as few syscalls take
  189. // place afterward (reducing the amount of syscalls that users need to
  190. // enable in their seccomp profiles). However, this needs to be done
  191. // before closing the pipe since we need it to pass the seccompFd to
  192. // the parent.
  193. if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
  194. seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
  195. if err != nil {
  196. return fmt.Errorf("unable to init seccomp: %w", err)
  197. }
  198. if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
  199. return err
  200. }
  201. }
  202. // Close the pipe to signal that we have completed our init.
  203. logrus.Debugf("init: closing the pipe to signal completion")
  204. _ = l.pipe.Close()
  205. // Close the log pipe fd so the parent's ForwardLogs can exit.
  206. if err := unix.Close(l.logFd); err != nil {
  207. return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
  208. }
  209. // Wait for the FIFO to be opened on the other side before exec-ing the
  210. // user process. We open it through /proc/self/fd/$fd, because the fd that
  211. // was given to us was an O_PATH fd to the fifo itself. Linux allows us to
  212. // re-open an O_PATH fd through /proc.
  213. fifoPath := "/proc/self/fd/" + strconv.Itoa(l.fifoFd)
  214. fd, err := unix.Open(fifoPath, unix.O_WRONLY|unix.O_CLOEXEC, 0)
  215. if err != nil {
  216. return &os.PathError{Op: "open exec fifo", Path: fifoPath, Err: err}
  217. }
  218. if _, err := unix.Write(fd, []byte("0")); err != nil {
  219. return &os.PathError{Op: "write exec fifo", Path: fifoPath, Err: err}
  220. }
  221. // Close the O_PATH fifofd fd before exec because the kernel resets
  222. // dumpable in the wrong order. This has been fixed in newer kernels, but
  223. // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
  224. // N.B. the core issue itself (passing dirfds to the host filesystem) has
  225. // since been resolved.
  226. // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
  227. _ = unix.Close(l.fifoFd)
  228. s := l.config.SpecState
  229. s.Pid = unix.Getpid()
  230. s.Status = specs.StateCreated
  231. if err := l.config.Config.Hooks[configs.StartContainer].RunHooks(s); err != nil {
  232. return err
  233. }
  234. return system.Exec(name, l.config.Args[0:], os.Environ())
  235. }