init_linux.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593
  1. package libcontainer
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "net"
  9. "os"
  10. "strings"
  11. "unsafe"
  12. "github.com/containerd/console"
  13. "github.com/opencontainers/runtime-spec/specs-go"
  14. "github.com/sirupsen/logrus"
  15. "github.com/vishvananda/netlink"
  16. "golang.org/x/sys/unix"
  17. "github.com/opencontainers/runc/libcontainer/capabilities"
  18. "github.com/opencontainers/runc/libcontainer/cgroups"
  19. "github.com/opencontainers/runc/libcontainer/configs"
  20. "github.com/opencontainers/runc/libcontainer/system"
  21. "github.com/opencontainers/runc/libcontainer/user"
  22. "github.com/opencontainers/runc/libcontainer/utils"
  23. )
  // initType names the kind of init process that newContainerInit should build.
type initType string

const (
	// initSetns selects the init used when joining an existing container
	// (no mounting is performed for this type).
	initSetns = initType("setns")
	// initStandard selects the init used for a regular container start.
	initStandard = initType("standard")
)
  // pid is the JSON message carrying a pair of process IDs.
// NOTE(review): the tags suggest these are the PIDs of two bootstrap stages;
// confirm against the code that produces this message.
type pid struct {
	// Pid is encoded under the "stage2_pid" key.
	Pid int `json:"stage2_pid"`
	// PidFirstChild is encoded under the "stage1_pid" key.
	PidFirstChild int `json:"stage1_pid"`
}
  33. // network is an internal struct used to setup container networks.
  34. type network struct {
  35. configs.Network
  36. // TempVethPeerName is a unique temporary veth peer name that was placed into
  37. // the container's namespace.
  38. TempVethPeerName string `json:"temp_veth_peer_name"`
  39. }
  40. // initConfig is used for transferring parameters from Exec() to Init()
  41. type initConfig struct {
  42. Args []string `json:"args"`
  43. Env []string `json:"env"`
  44. Cwd string `json:"cwd"`
  45. Capabilities *configs.Capabilities `json:"capabilities"`
  46. ProcessLabel string `json:"process_label"`
  47. AppArmorProfile string `json:"apparmor_profile"`
  48. NoNewPrivileges bool `json:"no_new_privileges"`
  49. User string `json:"user"`
  50. AdditionalGroups []string `json:"additional_groups"`
  51. Config *configs.Config `json:"config"`
  52. Networks []*network `json:"network"`
  53. PassedFilesCount int `json:"passed_files_count"`
  54. ContainerId string `json:"containerid"`
  55. Rlimits []configs.Rlimit `json:"rlimits"`
  56. CreateConsole bool `json:"create_console"`
  57. ConsoleWidth uint16 `json:"console_width"`
  58. ConsoleHeight uint16 `json:"console_height"`
  59. RootlessEUID bool `json:"rootless_euid,omitempty"`
  60. RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
  61. SpecState *specs.State `json:"spec_state,omitempty"`
  62. Cgroup2Path string `json:"cgroup2_path,omitempty"`
  63. }
  // initer is implemented by every init flavour that newContainerInit can
// return.
type initer interface {
	// Init runs the initialization and reports any failure.
	Init() error
}
  67. func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds []int) (initer, error) {
  68. var config *initConfig
  69. if err := json.NewDecoder(pipe).Decode(&config); err != nil {
  70. return nil, err
  71. }
  72. if err := populateProcessEnvironment(config.Env); err != nil {
  73. return nil, err
  74. }
  75. switch t {
  76. case initSetns:
  77. // mountFds must be nil in this case. We don't mount while doing runc exec.
  78. if mountFds != nil {
  79. return nil, errors.New("mountFds must be nil. Can't mount while doing runc exec.")
  80. }
  81. return &linuxSetnsInit{
  82. pipe: pipe,
  83. consoleSocket: consoleSocket,
  84. config: config,
  85. logFd: logFd,
  86. }, nil
  87. case initStandard:
  88. return &linuxStandardInit{
  89. pipe: pipe,
  90. consoleSocket: consoleSocket,
  91. parentPid: unix.Getppid(),
  92. config: config,
  93. fifoFd: fifoFd,
  94. logFd: logFd,
  95. mountFds: mountFds,
  96. }, nil
  97. }
  98. return nil, fmt.Errorf("unknown init type %q", t)
  99. }
  // populateProcessEnvironment exports every "NAME=value" entry of env into the
// environment of the current process.
//
// Entries are applied in order. The first entry that has no '=', an empty
// name, or a NUL byte in its name or value stops processing and is reported
// as an error; entries before it remain set.
func populateProcessEnvironment(env []string) error {
	for _, entry := range env {
		// Only the first '=' separates name from value, so values may
		// themselves contain '='.
		sep := strings.IndexByte(entry, '=')
		if sep < 0 {
			return fmt.Errorf("invalid environment variable: %q", entry)
		}
		key, value := entry[:sep], entry[sep+1:]

		switch {
		case key == "":
			return fmt.Errorf("environment variable name can't be empty: %q", entry)
		case strings.Contains(key, "\x00"):
			return fmt.Errorf("environment variable name can't contain null(\\x00): %q", entry)
		case strings.Contains(value, "\x00"):
			return fmt.Errorf("environment variable value can't contain null(\\x00): %q", entry)
		}

		if err := os.Setenv(key, value); err != nil {
			return err
		}
	}
	return nil
}
  124. // finalizeNamespace drops the caps, sets the correct user
  125. // and working dir, and closes any leaked file descriptors
  126. // before executing the command inside the namespace
  127. func finalizeNamespace(config *initConfig) error {
  128. // Ensure that all unwanted fds we may have accidentally
  129. // inherited are marked close-on-exec so they stay out of the
  130. // container
  131. if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
  132. return fmt.Errorf("error closing exec fds: %w", err)
  133. }
  134. // we only do chdir if it's specified
  135. doChdir := config.Cwd != ""
  136. if doChdir {
  137. // First, attempt the chdir before setting up the user.
  138. // This could allow us to access a directory that the user running runc can access
  139. // but the container user cannot.
  140. err := unix.Chdir(config.Cwd)
  141. switch {
  142. case err == nil:
  143. doChdir = false
  144. case os.IsPermission(err):
  145. // If we hit an EPERM, we should attempt again after setting up user.
  146. // This will allow us to successfully chdir if the container user has access
  147. // to the directory, but the user running runc does not.
  148. // This is useful in cases where the cwd is also a volume that's been chowned to the container user.
  149. default:
  150. return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
  151. }
  152. }
  153. caps := &configs.Capabilities{}
  154. if config.Capabilities != nil {
  155. caps = config.Capabilities
  156. } else if config.Config.Capabilities != nil {
  157. caps = config.Config.Capabilities
  158. }
  159. w, err := capabilities.New(caps)
  160. if err != nil {
  161. return err
  162. }
  163. // drop capabilities in bounding set before changing user
  164. if err := w.ApplyBoundingSet(); err != nil {
  165. return fmt.Errorf("unable to apply bounding set: %w", err)
  166. }
  167. // preserve existing capabilities while we change users
  168. if err := system.SetKeepCaps(); err != nil {
  169. return fmt.Errorf("unable to set keep caps: %w", err)
  170. }
  171. if err := setupUser(config); err != nil {
  172. return fmt.Errorf("unable to setup user: %w", err)
  173. }
  174. // Change working directory AFTER the user has been set up, if we haven't done it yet.
  175. if doChdir {
  176. if err := unix.Chdir(config.Cwd); err != nil {
  177. return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
  178. }
  179. }
  180. if err := system.ClearKeepCaps(); err != nil {
  181. return fmt.Errorf("unable to clear keep caps: %w", err)
  182. }
  183. if err := w.ApplyCaps(); err != nil {
  184. return fmt.Errorf("unable to apply caps: %w", err)
  185. }
  186. return nil
  187. }
  188. // setupConsole sets up the console from inside the container, and sends the
  189. // master pty fd to the config.Pipe (using cmsg). This is done to ensure that
  190. // consoles are scoped to a container properly (see runc#814 and the many
  191. // issues related to that). This has to be run *after* we've pivoted to the new
  192. // rootfs (and the users' configuration is entirely set up).
  193. func setupConsole(socket *os.File, config *initConfig, mount bool) error {
  194. defer socket.Close()
  195. // At this point, /dev/ptmx points to something that we would expect. We
  196. // used to change the owner of the slave path, but since the /dev/pts mount
  197. // can have gid=X set (at the users' option). So touching the owner of the
  198. // slave PTY is not necessary, as the kernel will handle that for us. Note
  199. // however, that setupUser (specifically fixStdioPermissions) *will* change
  200. // the UID owner of the console to be the user the process will run as (so
  201. // they can actually control their console).
  202. pty, slavePath, err := console.NewPty()
  203. if err != nil {
  204. return err
  205. }
  206. // After we return from here, we don't need the console anymore.
  207. defer pty.Close()
  208. if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
  209. err = pty.Resize(console.WinSize{
  210. Height: config.ConsoleHeight,
  211. Width: config.ConsoleWidth,
  212. })
  213. if err != nil {
  214. return err
  215. }
  216. }
  217. // Mount the console inside our rootfs.
  218. if mount {
  219. if err := mountConsole(slavePath); err != nil {
  220. return err
  221. }
  222. }
  223. // While we can access console.master, using the API is a good idea.
  224. if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil {
  225. return err
  226. }
  227. // Now, dup over all the things.
  228. return dupStdio(slavePath)
  229. }
  230. // syncParentReady sends to the given pipe a JSON payload which indicates that
  231. // the init is ready to Exec the child process. It then waits for the parent to
  232. // indicate that it is cleared to Exec.
  233. func syncParentReady(pipe io.ReadWriter) error {
  234. // Tell parent.
  235. if err := writeSync(pipe, procReady); err != nil {
  236. return err
  237. }
  238. // Wait for parent to give the all-clear.
  239. return readSync(pipe, procRun)
  240. }
  241. // syncParentHooks sends to the given pipe a JSON payload which indicates that
  242. // the parent should execute pre-start hooks. It then waits for the parent to
  243. // indicate that it is cleared to resume.
  244. func syncParentHooks(pipe io.ReadWriter) error {
  245. // Tell parent.
  246. if err := writeSync(pipe, procHooks); err != nil {
  247. return err
  248. }
  249. // Wait for parent to give the all-clear.
  250. return readSync(pipe, procResume)
  251. }
  252. // syncParentSeccomp sends to the given pipe a JSON payload which
  253. // indicates that the parent should pick up the seccomp fd with pidfd_getfd()
  254. // and send it to the seccomp agent over a unix socket. It then waits for
  255. // the parent to indicate that it is cleared to resume and closes the seccompFd.
  256. // If the seccompFd is -1, there isn't anything to sync with the parent, so it
  257. // returns no error.
  258. func syncParentSeccomp(pipe io.ReadWriter, seccompFd int) error {
  259. if seccompFd == -1 {
  260. return nil
  261. }
  262. // Tell parent.
  263. if err := writeSyncWithFd(pipe, procSeccomp, seccompFd); err != nil {
  264. unix.Close(seccompFd)
  265. return err
  266. }
  267. // Wait for parent to give the all-clear.
  268. if err := readSync(pipe, procSeccompDone); err != nil {
  269. unix.Close(seccompFd)
  270. return fmt.Errorf("sync parent seccomp: %w", err)
  271. }
  272. if err := unix.Close(seccompFd); err != nil {
  273. return fmt.Errorf("close seccomp fd: %w", err)
  274. }
  275. return nil
  276. }
  277. // setupUser changes the groups, gid, and uid for the user inside the container
  278. func setupUser(config *initConfig) error {
  279. // Set up defaults.
  280. defaultExecUser := user.ExecUser{
  281. Uid: 0,
  282. Gid: 0,
  283. Home: "/",
  284. }
  285. passwdPath, err := user.GetPasswdPath()
  286. if err != nil {
  287. return err
  288. }
  289. groupPath, err := user.GetGroupPath()
  290. if err != nil {
  291. return err
  292. }
  293. execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
  294. if err != nil {
  295. return err
  296. }
  297. var addGroups []int
  298. if len(config.AdditionalGroups) > 0 {
  299. addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
  300. if err != nil {
  301. return err
  302. }
  303. }
  304. // Rather than just erroring out later in setuid(2) and setgid(2), check
  305. // that the user is mapped here.
  306. if _, err := config.Config.HostUID(execUser.Uid); err != nil {
  307. return errors.New("cannot set uid to unmapped user in user namespace")
  308. }
  309. if _, err := config.Config.HostGID(execUser.Gid); err != nil {
  310. return errors.New("cannot set gid to unmapped user in user namespace")
  311. }
  312. if config.RootlessEUID {
  313. // We cannot set any additional groups in a rootless container and thus
  314. // we bail if the user asked us to do so. TODO: We currently can't do
  315. // this check earlier, but if libcontainer.Process.User was typesafe
  316. // this might work.
  317. if len(addGroups) > 0 {
  318. return errors.New("cannot set any additional groups in a rootless container")
  319. }
  320. }
  321. // Before we change to the container's user make sure that the processes
  322. // STDIO is correctly owned by the user that we are switching to.
  323. if err := fixStdioPermissions(execUser); err != nil {
  324. return err
  325. }
  326. setgroups, err := os.ReadFile("/proc/self/setgroups")
  327. if err != nil && !os.IsNotExist(err) {
  328. return err
  329. }
  330. // This isn't allowed in an unprivileged user namespace since Linux 3.19.
  331. // There's nothing we can do about /etc/group entries, so we silently
  332. // ignore setting groups here (since the user didn't explicitly ask us to
  333. // set the group).
  334. allowSupGroups := !config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny"
  335. if allowSupGroups {
  336. suppGroups := append(execUser.Sgids, addGroups...)
  337. if err := unix.Setgroups(suppGroups); err != nil {
  338. return &os.SyscallError{Syscall: "setgroups", Err: err}
  339. }
  340. }
  341. if err := system.Setgid(execUser.Gid); err != nil {
  342. return err
  343. }
  344. if err := system.Setuid(execUser.Uid); err != nil {
  345. return err
  346. }
  347. // if we didn't get HOME already, set it based on the user's HOME
  348. if envHome := os.Getenv("HOME"); envHome == "" {
  349. if err := os.Setenv("HOME", execUser.Home); err != nil {
  350. return err
  351. }
  352. }
  353. return nil
  354. }
  355. // fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
  356. // The ownership needs to match because it is created outside of the container and needs to be
  357. // localized.
  358. func fixStdioPermissions(u *user.ExecUser) error {
  359. var null unix.Stat_t
  360. if err := unix.Stat("/dev/null", &null); err != nil {
  361. return &os.PathError{Op: "stat", Path: "/dev/null", Err: err}
  362. }
  363. for _, file := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
  364. var s unix.Stat_t
  365. if err := unix.Fstat(int(file.Fd()), &s); err != nil {
  366. return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
  367. }
  368. // Skip chown if uid is already the one we want or any of the STDIO descriptors
  369. // were redirected to /dev/null.
  370. if int(s.Uid) == u.Uid || s.Rdev == null.Rdev {
  371. continue
  372. }
  373. // We only change the uid (as it is possible for the mount to
  374. // prefer a different gid, and there's no reason for us to change it).
  375. // The reason why we don't just leave the default uid=X mount setup is
  376. // that users expect to be able to actually use their console. Without
  377. // this code, you couldn't effectively run as a non-root user inside a
  378. // container and also have a console set up.
  379. if err := file.Chown(u.Uid, int(s.Gid)); err != nil {
  380. // If we've hit an EINVAL then s.Gid isn't mapped in the user
  381. // namespace. If we've hit an EPERM then the inode's current owner
  382. // is not mapped in our user namespace (in particular,
  383. // privileged_wrt_inode_uidgid() has failed). Read-only
  384. // /dev can result in EROFS error. In any case, it's
  385. // better for us to just not touch the stdio rather
  386. // than bail at this point.
  387. if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
  388. continue
  389. }
  390. return err
  391. }
  392. }
  393. return nil
  394. }
  395. // setupNetwork sets up and initializes any network interface inside the container.
  396. func setupNetwork(config *initConfig) error {
  397. for _, config := range config.Networks {
  398. strategy, err := getStrategy(config.Type)
  399. if err != nil {
  400. return err
  401. }
  402. if err := strategy.initialize(config); err != nil {
  403. return err
  404. }
  405. }
  406. return nil
  407. }
  408. func setupRoute(config *configs.Config) error {
  409. for _, config := range config.Routes {
  410. _, dst, err := net.ParseCIDR(config.Destination)
  411. if err != nil {
  412. return err
  413. }
  414. src := net.ParseIP(config.Source)
  415. if src == nil {
  416. return fmt.Errorf("Invalid source for route: %s", config.Source)
  417. }
  418. gw := net.ParseIP(config.Gateway)
  419. if gw == nil {
  420. return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
  421. }
  422. l, err := netlink.LinkByName(config.InterfaceName)
  423. if err != nil {
  424. return err
  425. }
  426. route := &netlink.Route{
  427. Scope: netlink.SCOPE_UNIVERSE,
  428. Dst: dst,
  429. Src: src,
  430. Gw: gw,
  431. LinkIndex: l.Attrs().Index,
  432. }
  433. if err := netlink.RouteAdd(route); err != nil {
  434. return err
  435. }
  436. }
  437. return nil
  438. }
  439. func setupRlimits(limits []configs.Rlimit, pid int) error {
  440. for _, rlimit := range limits {
  441. if err := unix.Prlimit(pid, rlimit.Type, &unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}, nil); err != nil {
  442. return fmt.Errorf("error setting rlimit type %v: %w", rlimit.Type, err)
  443. }
  444. }
  445. return nil
  446. }
  // _P_PID is the waitid(2) idtype that selects a child by process ID.
const _P_PID = 1

// siginfo mirrors the part of the C siginfo_t that waitid(2) fills in.
//
// After si_code the C structure continues with a union that contains
// pointers and is therefore pointer-aligned: on 64-bit platforms it starts at
// offset 16 (4 bytes of padding after si_code), on 32-bit platforms at offset
// 12. The blank field reproduces that padding so that si_pid is read from the
// location the kernel actually writes to. The trailing pad brings the total
// size to 128 bytes, the size of siginfo_t.
//
//nolint:structcheck,unused
type siginfo struct {
	si_signo int32
	si_errno int32
	si_code  int32
	// Alignment padding in front of the union: 4 bytes on 64-bit, none on
	// 32-bit.
	_ [unsafe.Sizeof(uintptr(0)) - 4]byte
	// below here is a union; si_pid is the only field we use
	si_pid int32
	// Pad to 128 bytes as detailed in blockUntilWaitable
	pad [116 - unsafe.Sizeof(uintptr(0))]byte
}
  458. // isWaitable returns true if the process has exited false otherwise.
  459. // Its based off blockUntilWaitable in src/os/wait_waitid.go
  460. func isWaitable(pid int) (bool, error) {
  461. si := &siginfo{}
  462. _, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0)
  463. if e != 0 {
  464. return false, &os.SyscallError{Syscall: "waitid", Err: e}
  465. }
  466. return si.si_pid != 0, nil
  467. }
  468. // signalAllProcesses freezes then iterates over all the processes inside the
  469. // manager's cgroups sending the signal s to them.
  470. // If s is SIGKILL then it will wait for each process to exit.
  471. // For all other signals it will check if the process is ready to report its
  472. // exit status and only if it is will a wait be performed.
  473. func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
  474. var procs []*os.Process
  475. if err := m.Freeze(configs.Frozen); err != nil {
  476. logrus.Warn(err)
  477. }
  478. pids, err := m.GetAllPids()
  479. if err != nil {
  480. if err := m.Freeze(configs.Thawed); err != nil {
  481. logrus.Warn(err)
  482. }
  483. return err
  484. }
  485. for _, pid := range pids {
  486. p, err := os.FindProcess(pid)
  487. if err != nil {
  488. logrus.Warn(err)
  489. continue
  490. }
  491. procs = append(procs, p)
  492. if err := p.Signal(s); err != nil {
  493. logrus.Warn(err)
  494. }
  495. }
  496. if err := m.Freeze(configs.Thawed); err != nil {
  497. logrus.Warn(err)
  498. }
  499. subreaper, err := system.GetSubreaper()
  500. if err != nil {
  501. // The error here means that PR_GET_CHILD_SUBREAPER is not
  502. // supported because this code might run on a kernel older
  503. // than 3.4. We don't want to throw an error in that case,
  504. // and we simplify things, considering there is no subreaper
  505. // set.
  506. subreaper = 0
  507. }
  508. for _, p := range procs {
  509. if s != unix.SIGKILL {
  510. if ok, err := isWaitable(p.Pid); err != nil {
  511. if !errors.Is(err, unix.ECHILD) {
  512. logrus.Warn("signalAllProcesses: ", p.Pid, err)
  513. }
  514. continue
  515. } else if !ok {
  516. // Not ready to report so don't wait
  517. continue
  518. }
  519. }
  520. // In case a subreaper has been setup, this code must not
  521. // wait for the process. Otherwise, we cannot be sure the
  522. // current process will be reaped by the subreaper, while
  523. // the subreaper might be waiting for this process in order
  524. // to retrieve its exit code.
  525. if subreaper == 0 {
  526. if _, err := p.Wait(); err != nil {
  527. if !errors.Is(err, unix.ECHILD) {
  528. logrus.Warn("wait: ", err)
  529. }
  530. }
  531. }
  532. }
  533. return nil
  534. }