v2.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
  1. package systemd
  2. import (
  3. "bufio"
  4. "errors"
  5. "fmt"
  6. "math"
  7. "os"
  8. "path/filepath"
  9. "strconv"
  10. "strings"
  11. "sync"
  12. systemdDbus "github.com/coreos/go-systemd/v22/dbus"
  13. securejoin "github.com/cyphar/filepath-securejoin"
  14. "github.com/sirupsen/logrus"
  15. "github.com/opencontainers/runc/libcontainer/cgroups"
  16. "github.com/opencontainers/runc/libcontainer/cgroups/fs2"
  17. "github.com/opencontainers/runc/libcontainer/configs"
  18. )
  19. type unifiedManager struct {
  20. mu sync.Mutex
  21. cgroups *configs.Cgroup
  22. // path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
  23. path string
  24. dbus *dbusConnManager
  25. fsMgr cgroups.Manager
  26. }
  27. func NewUnifiedManager(config *configs.Cgroup, path string) (cgroups.Manager, error) {
  28. m := &unifiedManager{
  29. cgroups: config,
  30. path: path,
  31. dbus: newDbusConnManager(config.Rootless),
  32. }
  33. if err := m.initPath(); err != nil {
  34. return nil, err
  35. }
  36. fsMgr, err := fs2.NewManager(config, m.path)
  37. if err != nil {
  38. return nil, err
  39. }
  40. m.fsMgr = fsMgr
  41. return m, nil
  42. }
  43. // unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified
  44. // key/value map (where key is cgroupfs file name) to systemd unit properties.
  45. // This is on a best-effort basis, so the properties that are not known
  46. // (to this function and/or systemd) are ignored (but logged with "debug"
  47. // log level).
  48. //
  49. // For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt
  50. //
  51. // For the list of systemd unit properties, see systemd.resource-control(5).
  52. func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) {
  53. var err error
  54. for k, v := range res {
  55. if strings.Contains(k, "/") {
  56. return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
  57. }
  58. sk := strings.SplitN(k, ".", 2)
  59. if len(sk) != 2 {
  60. return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
  61. }
  62. // Kernel is quite forgiving to extra whitespace
  63. // around the value, and so should we.
  64. v = strings.TrimSpace(v)
  65. // Please keep cases in alphabetical order.
  66. switch k {
  67. case "cpu.max":
  68. // value: quota [period]
  69. quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set
  70. period := defCPUQuotaPeriod
  71. sv := strings.Fields(v)
  72. if len(sv) < 1 || len(sv) > 2 {
  73. return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v)
  74. }
  75. // quota
  76. if sv[0] != "max" {
  77. quota, err = strconv.ParseInt(sv[0], 10, 64)
  78. if err != nil {
  79. return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err)
  80. }
  81. }
  82. // period
  83. if len(sv) == 2 {
  84. period, err = strconv.ParseUint(sv[1], 10, 64)
  85. if err != nil {
  86. return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err)
  87. }
  88. }
  89. addCpuQuota(cm, &props, quota, period)
  90. case "cpu.weight":
  91. num, err := strconv.ParseUint(v, 10, 64)
  92. if err != nil {
  93. return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
  94. }
  95. props = append(props,
  96. newProp("CPUWeight", num))
  97. case "cpuset.cpus", "cpuset.mems":
  98. bits, err := RangeToBits(v)
  99. if err != nil {
  100. return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err)
  101. }
  102. m := map[string]string{
  103. "cpuset.cpus": "AllowedCPUs",
  104. "cpuset.mems": "AllowedMemoryNodes",
  105. }
  106. // systemd only supports these properties since v244
  107. sdVer := systemdVersion(cm)
  108. if sdVer >= 244 {
  109. props = append(props,
  110. newProp(m[k], bits))
  111. } else {
  112. logrus.Debugf("systemd v%d is too old to support %s"+
  113. " (setting will still be applied to cgroupfs)",
  114. sdVer, m[k])
  115. }
  116. case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max":
  117. num := uint64(math.MaxUint64)
  118. if v != "max" {
  119. num, err = strconv.ParseUint(v, 10, 64)
  120. if err != nil {
  121. return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
  122. }
  123. }
  124. m := map[string]string{
  125. "memory.high": "MemoryHigh",
  126. "memory.low": "MemoryLow",
  127. "memory.min": "MemoryMin",
  128. "memory.max": "MemoryMax",
  129. "memory.swap.max": "MemorySwapMax",
  130. }
  131. props = append(props,
  132. newProp(m[k], num))
  133. case "pids.max":
  134. num := uint64(math.MaxUint64)
  135. if v != "max" {
  136. var err error
  137. num, err = strconv.ParseUint(v, 10, 64)
  138. if err != nil {
  139. return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
  140. }
  141. }
  142. props = append(props,
  143. newProp("TasksMax", num))
  144. case "memory.oom.group":
  145. // Setting this to 1 is roughly equivalent to OOMPolicy=kill
  146. // (as per systemd.service(5) and
  147. // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html),
  148. // but it's not clear what to do if it is unset or set
  149. // to 0 in runc update, as there are two other possible
  150. // values for OOMPolicy (continue/stop).
  151. fallthrough
  152. default:
  153. // Ignore the unknown resource here -- will still be
  154. // applied in Set which calls fs2.Set.
  155. logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v)
  156. }
  157. }
  158. return props, nil
  159. }
  160. func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
  161. var properties []systemdDbus.Property
  162. // NOTE: This is of questionable correctness because we insert our own
  163. // devices eBPF program later. Two programs with identical rules
  164. // aren't the end of the world, but it is a bit concerning. However
  165. // it's unclear if systemd removes all eBPF programs attached when
  166. // doing SetUnitProperties...
  167. deviceProperties, err := generateDeviceProperties(r)
  168. if err != nil {
  169. return nil, err
  170. }
  171. properties = append(properties, deviceProperties...)
  172. if r.Memory != 0 {
  173. properties = append(properties,
  174. newProp("MemoryMax", uint64(r.Memory)))
  175. }
  176. if r.MemoryReservation != 0 {
  177. properties = append(properties,
  178. newProp("MemoryLow", uint64(r.MemoryReservation)))
  179. }
  180. swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
  181. if err != nil {
  182. return nil, err
  183. }
  184. if swap != 0 {
  185. properties = append(properties,
  186. newProp("MemorySwapMax", uint64(swap)))
  187. }
  188. if r.CpuWeight != 0 {
  189. properties = append(properties,
  190. newProp("CPUWeight", r.CpuWeight))
  191. }
  192. addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
  193. if r.PidsLimit > 0 || r.PidsLimit == -1 {
  194. properties = append(properties,
  195. newProp("TasksMax", uint64(r.PidsLimit)))
  196. }
  197. err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
  198. if err != nil {
  199. return nil, err
  200. }
  201. // ignore r.KernelMemory
  202. // convert Resources.Unified map to systemd properties
  203. if r.Unified != nil {
  204. unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified)
  205. if err != nil {
  206. return nil, err
  207. }
  208. properties = append(properties, unifiedProps...)
  209. }
  210. return properties, nil
  211. }
  212. func (m *unifiedManager) Apply(pid int) error {
  213. var (
  214. c = m.cgroups
  215. unitName = getUnitName(c)
  216. properties []systemdDbus.Property
  217. )
  218. slice := "system.slice"
  219. if m.cgroups.Rootless {
  220. slice = "user.slice"
  221. }
  222. if c.Parent != "" {
  223. slice = c.Parent
  224. }
  225. properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
  226. if strings.HasSuffix(unitName, ".slice") {
  227. // If we create a slice, the parent is defined via a Wants=.
  228. properties = append(properties, systemdDbus.PropWants(slice))
  229. } else {
  230. // Otherwise it's a scope, which we put into a Slice=.
  231. properties = append(properties, systemdDbus.PropSlice(slice))
  232. // Assume scopes always support delegation (supported since systemd v218).
  233. properties = append(properties, newProp("Delegate", true))
  234. }
  235. // only add pid if its valid, -1 is used w/ general slice creation.
  236. if pid != -1 {
  237. properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
  238. }
  239. // Always enable accounting, this gets us the same behaviour as the fs implementation,
  240. // plus the kernel has some problems with joining the memory cgroup at a later time.
  241. properties = append(properties,
  242. newProp("MemoryAccounting", true),
  243. newProp("CPUAccounting", true),
  244. newProp("IOAccounting", true),
  245. newProp("TasksAccounting", true),
  246. )
  247. // Assume DefaultDependencies= will always work (the check for it was previously broken.)
  248. properties = append(properties,
  249. newProp("DefaultDependencies", false))
  250. properties = append(properties, c.SystemdProps...)
  251. if err := startUnit(m.dbus, unitName, properties); err != nil {
  252. return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err)
  253. }
  254. if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
  255. return err
  256. }
  257. if c.OwnerUID != nil {
  258. // The directory itself must be chowned.
  259. err := os.Chown(m.path, *c.OwnerUID, -1)
  260. if err != nil {
  261. return err
  262. }
  263. filesToChown, err := cgroupFilesToChown()
  264. if err != nil {
  265. return err
  266. }
  267. for _, v := range filesToChown {
  268. err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
  269. // Some files might not be present.
  270. if err != nil && !errors.Is(err, os.ErrNotExist) {
  271. return err
  272. }
  273. }
  274. }
  275. return nil
  276. }
  277. // The kernel exposes a list of files that should be chowned to the delegate
  278. // uid in /sys/kernel/cgroup/delegate. If the file is not present
  279. // (Linux < 4.15), use the initial values mentioned in cgroups(7).
  280. func cgroupFilesToChown() ([]string, error) {
  281. const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"
  282. f, err := os.Open(cgroupDelegateFile)
  283. if err != nil {
  284. return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil
  285. }
  286. defer f.Close()
  287. filesToChown := []string{}
  288. scanner := bufio.NewScanner(f)
  289. for scanner.Scan() {
  290. filesToChown = append(filesToChown, scanner.Text())
  291. }
  292. if err := scanner.Err(); err != nil {
  293. return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
  294. }
  295. return filesToChown, nil
  296. }
  297. func (m *unifiedManager) Destroy() error {
  298. m.mu.Lock()
  299. defer m.mu.Unlock()
  300. unitName := getUnitName(m.cgroups)
  301. if err := stopUnit(m.dbus, unitName); err != nil {
  302. return err
  303. }
  304. // systemd 239 do not remove sub-cgroups.
  305. err := m.fsMgr.Destroy()
  306. // fsMgr.Destroy has handled ErrNotExist
  307. if err != nil {
  308. return err
  309. }
  310. return nil
  311. }
  312. func (m *unifiedManager) Path(_ string) string {
  313. return m.path
  314. }
  315. // getSliceFull value is used in initPath.
  316. // The value is incompatible with systemdDbus.PropSlice.
  317. func (m *unifiedManager) getSliceFull() (string, error) {
  318. c := m.cgroups
  319. slice := "system.slice"
  320. if c.Rootless {
  321. slice = "user.slice"
  322. }
  323. if c.Parent != "" {
  324. var err error
  325. slice, err = ExpandSlice(c.Parent)
  326. if err != nil {
  327. return "", err
  328. }
  329. }
  330. if c.Rootless {
  331. // managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service".
  332. managerCG, err := getManagerProperty(m.dbus, "ControlGroup")
  333. if err != nil {
  334. return "", err
  335. }
  336. slice = filepath.Join(managerCG, slice)
  337. }
  338. // an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice"
  339. // NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified.
  340. return slice, nil
  341. }
  342. func (m *unifiedManager) initPath() error {
  343. if m.path != "" {
  344. return nil
  345. }
  346. sliceFull, err := m.getSliceFull()
  347. if err != nil {
  348. return err
  349. }
  350. c := m.cgroups
  351. path := filepath.Join(sliceFull, getUnitName(c))
  352. path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path)
  353. if err != nil {
  354. return err
  355. }
  356. // an example of the final path in rootless:
  357. // "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope"
  358. m.path = path
  359. return nil
  360. }
  361. func (m *unifiedManager) Freeze(state configs.FreezerState) error {
  362. return m.fsMgr.Freeze(state)
  363. }
  364. func (m *unifiedManager) GetPids() ([]int, error) {
  365. return cgroups.GetPids(m.path)
  366. }
  367. func (m *unifiedManager) GetAllPids() ([]int, error) {
  368. return cgroups.GetAllPids(m.path)
  369. }
  370. func (m *unifiedManager) GetStats() (*cgroups.Stats, error) {
  371. return m.fsMgr.GetStats()
  372. }
  373. func (m *unifiedManager) Set(r *configs.Resources) error {
  374. if r == nil {
  375. return nil
  376. }
  377. properties, err := genV2ResourcesProperties(r, m.dbus)
  378. if err != nil {
  379. return err
  380. }
  381. if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil {
  382. return fmt.Errorf("unable to set unit properties: %w", err)
  383. }
  384. return m.fsMgr.Set(r)
  385. }
  386. func (m *unifiedManager) GetPaths() map[string]string {
  387. paths := make(map[string]string, 1)
  388. paths[""] = m.path
  389. return paths
  390. }
  391. func (m *unifiedManager) GetCgroups() (*configs.Cgroup, error) {
  392. return m.cgroups, nil
  393. }
  394. func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
  395. return m.fsMgr.GetFreezerState()
  396. }
  397. func (m *unifiedManager) Exists() bool {
  398. return cgroups.PathExists(m.path)
  399. }
  400. func (m *unifiedManager) OOMKillCount() (uint64, error) {
  401. return m.fsMgr.OOMKillCount()
  402. }