config.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414
  1. package configs
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "os/exec"
  7. "time"
  8. "github.com/sirupsen/logrus"
  9. "github.com/opencontainers/runc/libcontainer/devices"
  10. "github.com/opencontainers/runtime-spec/specs-go"
  11. )
  // Rlimit is one resource-limit entry with a soft and a hard value.
  // NOTE(review): Type presumably holds an RLIMIT_* constant number; confirm
  // against the code that applies these limits.
  type Rlimit struct {
  	// Type identifies which resource the limit applies to.
  	Type int `json:"type"`

  	// Hard is the hard limit value.
  	Hard uint64 `json:"hard"`

  	// Soft is the soft limit value.
  	Soft uint64 `json:"soft"`
  }
  // IDMap is a single UID or GID mapping entry used with user namespaces.
  type IDMap struct {
  	// ContainerID is the ID on the container side of the mapping.
  	ContainerID int `json:"container_id"`

  	// HostID is the ID on the host side of the mapping.
  	HostID int `json:"host_id"`

  	// Size is the size of the mapping.
  	// NOTE(review): presumably the number of consecutive IDs mapped; confirm.
  	Size int `json:"size"`
  }
  23. // Seccomp represents syscall restrictions
  24. // By default, only the native architecture of the kernel is allowed to be used
  25. // for syscalls. Additional architectures can be added by specifying them in
  26. // Architectures.
  27. type Seccomp struct {
  28. DefaultAction Action `json:"default_action"`
  29. Architectures []string `json:"architectures"`
  30. Syscalls []*Syscall `json:"syscalls"`
  31. DefaultErrnoRet *uint `json:"default_errno_ret"`
  32. ListenerPath string `json:"listener_path,omitempty"`
  33. ListenerMetadata string `json:"listener_metadata,omitempty"`
  34. }
  // Action is the action taken when a Seccomp rule matches.
  type Action int

  // Seccomp actions. Numbering starts at 1, so the zero value of Action does
  // not correspond to any of the named actions.
  const (
  	Kill Action = iota + 1
  	Errno
  	Trap
  	Allow
  	Trace
  	Log
  	Notify
  	KillThread
  	KillProcess
  )
  // Operator is the comparison applied to a syscall argument when matching a
  // Seccomp rule.
  type Operator int

  // Seccomp argument comparison operators. Numbering starts at 1, so the zero
  // value of Operator does not correspond to any of the named operators.
  const (
  	EqualTo Operator = iota + 1
  	NotEqualTo
  	GreaterThan
  	GreaterThanOrEqualTo
  	LessThan
  	LessThanOrEqualTo
  	MaskEqualTo
  )
  59. // Arg is a rule to match a specific syscall argument in Seccomp
  60. type Arg struct {
  61. Index uint `json:"index"`
  62. Value uint64 `json:"value"`
  63. ValueTwo uint64 `json:"value_two"`
  64. Op Operator `json:"op"`
  65. }
  66. // Syscall is a rule to match a syscall in Seccomp
  67. type Syscall struct {
  68. Name string `json:"name"`
  69. Action Action `json:"action"`
  70. ErrnoRet *uint `json:"errnoRet"`
  71. Args []*Arg `json:"args"`
  72. }
  73. // TODO Windows. Many of these fields should be factored out into those parts
  74. // which are common across platforms, and those which are platform specific.
  75. // Config defines configuration options for executing a process inside a contained environment.
  76. type Config struct {
  77. // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
  78. // This is a common option when the container is running in ramdisk
  79. NoPivotRoot bool `json:"no_pivot_root"`
  80. // ParentDeathSignal specifies the signal that is sent to the container's process in the case
  81. // that the parent process dies.
  82. ParentDeathSignal int `json:"parent_death_signal"`
  83. // Path to a directory containing the container's root filesystem.
  84. Rootfs string `json:"rootfs"`
  85. // Umask is the umask to use inside of the container.
  86. Umask *uint32 `json:"umask"`
  87. // Readonlyfs will remount the container's rootfs as readonly where only externally mounted
  88. // bind mounts are writtable.
  89. Readonlyfs bool `json:"readonlyfs"`
  90. // Specifies the mount propagation flags to be applied to /.
  91. RootPropagation int `json:"rootPropagation"`
  92. // Mounts specify additional source and destination paths that will be mounted inside the container's
  93. // rootfs and mount namespace if specified
  94. Mounts []*Mount `json:"mounts"`
  95. // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well!
  96. Devices []*devices.Device `json:"devices"`
  97. MountLabel string `json:"mount_label"`
  98. // Hostname optionally sets the container's hostname if provided
  99. Hostname string `json:"hostname"`
  100. // Namespaces specifies the container's namespaces that it should setup when cloning the init process
  101. // If a namespace is not provided that namespace is shared from the container's parent process
  102. Namespaces Namespaces `json:"namespaces"`
  103. // Capabilities specify the capabilities to keep when executing the process inside the container
  104. // All capabilities not specified will be dropped from the processes capability mask
  105. Capabilities *Capabilities `json:"capabilities"`
  106. // Networks specifies the container's network setup to be created
  107. Networks []*Network `json:"networks"`
  108. // Routes can be specified to create entries in the route table as the container is started
  109. Routes []*Route `json:"routes"`
  110. // Cgroups specifies specific cgroup settings for the various subsystems that the container is
  111. // placed into to limit the resources the container has available
  112. Cgroups *Cgroup `json:"cgroups"`
  113. // AppArmorProfile specifies the profile to apply to the process running in the container and is
  114. // change at the time the process is execed
  115. AppArmorProfile string `json:"apparmor_profile,omitempty"`
  116. // ProcessLabel specifies the label to apply to the process running in the container. It is
  117. // commonly used by selinux
  118. ProcessLabel string `json:"process_label,omitempty"`
  119. // Rlimits specifies the resource limits, such as max open files, to set in the container
  120. // If Rlimits are not set, the container will inherit rlimits from the parent process
  121. Rlimits []Rlimit `json:"rlimits,omitempty"`
  122. // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
  123. // for a process. Valid values are between the range [-1000, '1000'], where processes with
  124. // higher scores are preferred for being killed. If it is unset then we don't touch the current
  125. // value.
  126. // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
  127. OomScoreAdj *int `json:"oom_score_adj,omitempty"`
  128. // UidMappings is an array of User ID mappings for User Namespaces
  129. UidMappings []IDMap `json:"uid_mappings"`
  130. // GidMappings is an array of Group ID mappings for User Namespaces
  131. GidMappings []IDMap `json:"gid_mappings"`
  132. // MaskPaths specifies paths within the container's rootfs to mask over with a bind
  133. // mount pointing to /dev/null as to prevent reads of the file.
  134. MaskPaths []string `json:"mask_paths"`
  135. // ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
  136. // so that these files prevent any writes.
  137. ReadonlyPaths []string `json:"readonly_paths"`
  138. // Sysctl is a map of properties and their values. It is the equivalent of using
  139. // sysctl -w my.property.name value in Linux.
  140. Sysctl map[string]string `json:"sysctl"`
  141. // Seccomp allows actions to be taken whenever a syscall is made within the container.
  142. // A number of rules are given, each having an action to be taken if a syscall matches it.
  143. // A default action to be taken if no rules match is also given.
  144. Seccomp *Seccomp `json:"seccomp"`
  145. // NoNewPrivileges controls whether processes in the container can gain additional privileges.
  146. NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
  147. // Hooks are a collection of actions to perform at various container lifecycle events.
  148. // CommandHooks are serialized to JSON, but other hooks are not.
  149. Hooks Hooks
  150. // Version is the version of opencontainer specification that is supported.
  151. Version string `json:"version"`
  152. // Labels are user defined metadata that is stored in the config and populated on the state
  153. Labels []string `json:"labels"`
  154. // NoNewKeyring will not allocated a new session keyring for the container. It will use the
  155. // callers keyring in this case.
  156. NoNewKeyring bool `json:"no_new_keyring"`
  157. // IntelRdt specifies settings for Intel RDT group that the container is placed into
  158. // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available
  159. IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
  160. // RootlessEUID is set when the runc was launched with non-zero EUID.
  161. // Note that RootlessEUID is set to false when launched with EUID=0 in userns.
  162. // When RootlessEUID is set, runc creates a new userns for the container.
  163. // (config.json needs to contain userns settings)
  164. RootlessEUID bool `json:"rootless_euid,omitempty"`
  165. // RootlessCgroups is set when unlikely to have the full access to cgroups.
  166. // When RootlessCgroups is set, cgroups errors are ignored.
  167. RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
  168. }
  169. type (
  170. HookName string
  171. HookList []Hook
  172. Hooks map[HookName]HookList
  173. )
  174. const (
  175. // Prestart commands are executed after the container namespaces are created,
  176. // but before the user supplied command is executed from init.
  177. // Note: This hook is now deprecated
  178. // Prestart commands are called in the Runtime namespace.
  179. Prestart HookName = "prestart"
  180. // CreateRuntime commands MUST be called as part of the create operation after
  181. // the runtime environment has been created but before the pivot_root has been executed.
  182. // CreateRuntime is called immediately after the deprecated Prestart hook.
  183. // CreateRuntime commands are called in the Runtime Namespace.
  184. CreateRuntime HookName = "createRuntime"
  185. // CreateContainer commands MUST be called as part of the create operation after
  186. // the runtime environment has been created but before the pivot_root has been executed.
  187. // CreateContainer commands are called in the Container namespace.
  188. CreateContainer HookName = "createContainer"
  189. // StartContainer commands MUST be called as part of the start operation and before
  190. // the container process is started.
  191. // StartContainer commands are called in the Container namespace.
  192. StartContainer HookName = "startContainer"
  193. // Poststart commands are executed after the container init process starts.
  194. // Poststart commands are called in the Runtime Namespace.
  195. Poststart HookName = "poststart"
  196. // Poststop commands are executed after the container init process exits.
  197. // Poststop commands are called in the Runtime Namespace.
  198. Poststop HookName = "poststop"
  199. )
  200. // KnownHookNames returns the known hook names.
  201. // Used by `runc features`.
  202. func KnownHookNames() []string {
  203. return []string{
  204. string(Prestart), // deprecated
  205. string(CreateRuntime),
  206. string(CreateContainer),
  207. string(StartContainer),
  208. string(Poststart),
  209. string(Poststop),
  210. }
  211. }
  // Capabilities groups the capability sets configured for a container
  // process. The fields carry no JSON tags, so they are serialized under
  // their Go field names.
  type Capabilities struct {
  	// Bounding is the set of capabilities checked by the kernel.
  	Bounding []string

  	// Effective is the set of capabilities checked by the kernel.
  	Effective []string

  	// Inheritable is the set of capabilities preserved across execve.
  	Inheritable []string

  	// Permitted is the limiting superset for the effective capabilities.
  	Permitted []string

  	// Ambient is the ambient set of capabilities that are kept.
  	Ambient []string
  }
  224. func (hooks HookList) RunHooks(state *specs.State) error {
  225. for i, h := range hooks {
  226. if err := h.Run(state); err != nil {
  227. return fmt.Errorf("error running hook #%d: %w", i, err)
  228. }
  229. }
  230. return nil
  231. }
  232. func (hooks *Hooks) UnmarshalJSON(b []byte) error {
  233. var state map[HookName][]CommandHook
  234. if err := json.Unmarshal(b, &state); err != nil {
  235. return err
  236. }
  237. *hooks = Hooks{}
  238. for n, commandHooks := range state {
  239. if len(commandHooks) == 0 {
  240. continue
  241. }
  242. (*hooks)[n] = HookList{}
  243. for _, h := range commandHooks {
  244. (*hooks)[n] = append((*hooks)[n], h)
  245. }
  246. }
  247. return nil
  248. }
  249. func (hooks *Hooks) MarshalJSON() ([]byte, error) {
  250. serialize := func(hooks []Hook) (serializableHooks []CommandHook) {
  251. for _, hook := range hooks {
  252. switch chook := hook.(type) {
  253. case CommandHook:
  254. serializableHooks = append(serializableHooks, chook)
  255. default:
  256. logrus.Warnf("cannot serialize hook of type %T, skipping", hook)
  257. }
  258. }
  259. return serializableHooks
  260. }
  261. return json.Marshal(map[string]interface{}{
  262. "prestart": serialize((*hooks)[Prestart]),
  263. "createRuntime": serialize((*hooks)[CreateRuntime]),
  264. "createContainer": serialize((*hooks)[CreateContainer]),
  265. "startContainer": serialize((*hooks)[StartContainer]),
  266. "poststart": serialize((*hooks)[Poststart]),
  267. "poststop": serialize((*hooks)[Poststop]),
  268. })
  269. }
  270. type Hook interface {
  271. // Run executes the hook with the provided state.
  272. Run(*specs.State) error
  273. }
  274. // NewFunctionHook will call the provided function when the hook is run.
  275. func NewFunctionHook(f func(*specs.State) error) FuncHook {
  276. return FuncHook{
  277. run: f,
  278. }
  279. }
  280. type FuncHook struct {
  281. run func(*specs.State) error
  282. }
  283. func (f FuncHook) Run(s *specs.State) error {
  284. return f.run(s)
  285. }
  // Command describes an external program to be executed as a hook.
  type Command struct {
  	// Path is the path of the program to execute.
  	Path string `json:"path"`

  	// Args is the argument list handed to the program.
  	Args []string `json:"args"`

  	// Env is the environment handed to the program.
  	Env []string `json:"env"`

  	// Dir is a directory associated with the command.
  	// NOTE(review): presumably the intended working directory; confirm how
  	// it is consumed.
  	Dir string `json:"dir"`

  	// Timeout optionally bounds how long the command may run; nil means no
  	// timeout. It is serialized as a time.Duration (integer nanoseconds).
  	Timeout *time.Duration `json:"timeout"`
  }
  293. // NewCommandHook will execute the provided command when the hook is run.
  294. func NewCommandHook(cmd Command) CommandHook {
  295. return CommandHook{
  296. Command: cmd,
  297. }
  298. }
  299. type CommandHook struct {
  300. Command
  301. }
  302. func (c Command) Run(s *specs.State) error {
  303. b, err := json.Marshal(s)
  304. if err != nil {
  305. return err
  306. }
  307. var stdout, stderr bytes.Buffer
  308. cmd := exec.Cmd{
  309. Path: c.Path,
  310. Args: c.Args,
  311. Env: c.Env,
  312. Stdin: bytes.NewReader(b),
  313. Stdout: &stdout,
  314. Stderr: &stderr,
  315. }
  316. if err := cmd.Start(); err != nil {
  317. return err
  318. }
  319. errC := make(chan error, 1)
  320. go func() {
  321. err := cmd.Wait()
  322. if err != nil {
  323. err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
  324. }
  325. errC <- err
  326. }()
  327. var timerCh <-chan time.Time
  328. if c.Timeout != nil {
  329. timer := time.NewTimer(*c.Timeout)
  330. defer timer.Stop()
  331. timerCh = timer.C
  332. }
  333. select {
  334. case err := <-errC:
  335. return err
  336. case <-timerCh:
  337. _ = cmd.Process.Kill()
  338. <-errC
  339. return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
  340. }
  341. }