| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472 |
- package systemd
- import (
- "bufio"
- "errors"
- "fmt"
- "math"
- "os"
- "path/filepath"
- "strconv"
- "strings"
- "sync"
- systemdDbus "github.com/coreos/go-systemd/v22/dbus"
- securejoin "github.com/cyphar/filepath-securejoin"
- "github.com/sirupsen/logrus"
- "github.com/opencontainers/runc/libcontainer/cgroups"
- "github.com/opencontainers/runc/libcontainer/cgroups/fs2"
- "github.com/opencontainers/runc/libcontainer/configs"
- )
// unifiedManager holds the state shared by the methods in this file: the
// cgroup configuration, the resolved cgroup path, a systemd D-Bus connection
// manager, and an fs2-backed cgroups.Manager created for the same path.
type unifiedManager struct {
	// mu is held by Destroy for the duration of the call.
	mu sync.Mutex
	// cgroups is the configuration passed to NewUnifiedManager.
	cgroups *configs.Cgroup
	// path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
	path string
	// dbus is the connection manager handed to every systemd helper call.
	dbus *dbusConnManager
	// fsMgr is the manager returned by fs2.NewManager for path; operations
	// that systemd does not cover are delegated to it.
	fsMgr cgroups.Manager
}
- func NewUnifiedManager(config *configs.Cgroup, path string) (cgroups.Manager, error) {
- m := &unifiedManager{
- cgroups: config,
- path: path,
- dbus: newDbusConnManager(config.Rootless),
- }
- if err := m.initPath(); err != nil {
- return nil, err
- }
- fsMgr, err := fs2.NewManager(config, m.path)
- if err != nil {
- return nil, err
- }
- m.fsMgr = fsMgr
- return m, nil
- }
- // unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified
- // key/value map (where key is cgroupfs file name) to systemd unit properties.
- // This is on a best-effort basis, so the properties that are not known
- // (to this function and/or systemd) are ignored (but logged with "debug"
- // log level).
- //
- // For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt
- //
- // For the list of systemd unit properties, see systemd.resource-control(5).
- func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) {
- var err error
- for k, v := range res {
- if strings.Contains(k, "/") {
- return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
- }
- sk := strings.SplitN(k, ".", 2)
- if len(sk) != 2 {
- return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
- }
- // Kernel is quite forgiving to extra whitespace
- // around the value, and so should we.
- v = strings.TrimSpace(v)
- // Please keep cases in alphabetical order.
- switch k {
- case "cpu.max":
- // value: quota [period]
- quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set
- period := defCPUQuotaPeriod
- sv := strings.Fields(v)
- if len(sv) < 1 || len(sv) > 2 {
- return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v)
- }
- // quota
- if sv[0] != "max" {
- quota, err = strconv.ParseInt(sv[0], 10, 64)
- if err != nil {
- return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err)
- }
- }
- // period
- if len(sv) == 2 {
- period, err = strconv.ParseUint(sv[1], 10, 64)
- if err != nil {
- return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err)
- }
- }
- addCpuQuota(cm, &props, quota, period)
- case "cpu.weight":
- num, err := strconv.ParseUint(v, 10, 64)
- if err != nil {
- return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
- }
- props = append(props,
- newProp("CPUWeight", num))
- case "cpuset.cpus", "cpuset.mems":
- bits, err := RangeToBits(v)
- if err != nil {
- return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err)
- }
- m := map[string]string{
- "cpuset.cpus": "AllowedCPUs",
- "cpuset.mems": "AllowedMemoryNodes",
- }
- // systemd only supports these properties since v244
- sdVer := systemdVersion(cm)
- if sdVer >= 244 {
- props = append(props,
- newProp(m[k], bits))
- } else {
- logrus.Debugf("systemd v%d is too old to support %s"+
- " (setting will still be applied to cgroupfs)",
- sdVer, m[k])
- }
- case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max":
- num := uint64(math.MaxUint64)
- if v != "max" {
- num, err = strconv.ParseUint(v, 10, 64)
- if err != nil {
- return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
- }
- }
- m := map[string]string{
- "memory.high": "MemoryHigh",
- "memory.low": "MemoryLow",
- "memory.min": "MemoryMin",
- "memory.max": "MemoryMax",
- "memory.swap.max": "MemorySwapMax",
- }
- props = append(props,
- newProp(m[k], num))
- case "pids.max":
- num := uint64(math.MaxUint64)
- if v != "max" {
- var err error
- num, err = strconv.ParseUint(v, 10, 64)
- if err != nil {
- return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
- }
- }
- props = append(props,
- newProp("TasksMax", num))
- case "memory.oom.group":
- // Setting this to 1 is roughly equivalent to OOMPolicy=kill
- // (as per systemd.service(5) and
- // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html),
- // but it's not clear what to do if it is unset or set
- // to 0 in runc update, as there are two other possible
- // values for OOMPolicy (continue/stop).
- fallthrough
- default:
- // Ignore the unknown resource here -- will still be
- // applied in Set which calls fs2.Set.
- logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v)
- }
- }
- return props, nil
- }
- func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
- var properties []systemdDbus.Property
- // NOTE: This is of questionable correctness because we insert our own
- // devices eBPF program later. Two programs with identical rules
- // aren't the end of the world, but it is a bit concerning. However
- // it's unclear if systemd removes all eBPF programs attached when
- // doing SetUnitProperties...
- deviceProperties, err := generateDeviceProperties(r)
- if err != nil {
- return nil, err
- }
- properties = append(properties, deviceProperties...)
- if r.Memory != 0 {
- properties = append(properties,
- newProp("MemoryMax", uint64(r.Memory)))
- }
- if r.MemoryReservation != 0 {
- properties = append(properties,
- newProp("MemoryLow", uint64(r.MemoryReservation)))
- }
- swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
- if err != nil {
- return nil, err
- }
- if swap != 0 {
- properties = append(properties,
- newProp("MemorySwapMax", uint64(swap)))
- }
- if r.CpuWeight != 0 {
- properties = append(properties,
- newProp("CPUWeight", r.CpuWeight))
- }
- addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
- if r.PidsLimit > 0 || r.PidsLimit == -1 {
- properties = append(properties,
- newProp("TasksMax", uint64(r.PidsLimit)))
- }
- err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
- if err != nil {
- return nil, err
- }
- // ignore r.KernelMemory
- // convert Resources.Unified map to systemd properties
- if r.Unified != nil {
- unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified)
- if err != nil {
- return nil, err
- }
- properties = append(properties, unifiedProps...)
- }
- return properties, nil
- }
- func (m *unifiedManager) Apply(pid int) error {
- var (
- c = m.cgroups
- unitName = getUnitName(c)
- properties []systemdDbus.Property
- )
- slice := "system.slice"
- if m.cgroups.Rootless {
- slice = "user.slice"
- }
- if c.Parent != "" {
- slice = c.Parent
- }
- properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
- if strings.HasSuffix(unitName, ".slice") {
- // If we create a slice, the parent is defined via a Wants=.
- properties = append(properties, systemdDbus.PropWants(slice))
- } else {
- // Otherwise it's a scope, which we put into a Slice=.
- properties = append(properties, systemdDbus.PropSlice(slice))
- // Assume scopes always support delegation (supported since systemd v218).
- properties = append(properties, newProp("Delegate", true))
- }
- // only add pid if its valid, -1 is used w/ general slice creation.
- if pid != -1 {
- properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
- }
- // Always enable accounting, this gets us the same behaviour as the fs implementation,
- // plus the kernel has some problems with joining the memory cgroup at a later time.
- properties = append(properties,
- newProp("MemoryAccounting", true),
- newProp("CPUAccounting", true),
- newProp("IOAccounting", true),
- newProp("TasksAccounting", true),
- )
- // Assume DefaultDependencies= will always work (the check for it was previously broken.)
- properties = append(properties,
- newProp("DefaultDependencies", false))
- properties = append(properties, c.SystemdProps...)
- if err := startUnit(m.dbus, unitName, properties); err != nil {
- return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err)
- }
- if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
- return err
- }
- if c.OwnerUID != nil {
- // The directory itself must be chowned.
- err := os.Chown(m.path, *c.OwnerUID, -1)
- if err != nil {
- return err
- }
- filesToChown, err := cgroupFilesToChown()
- if err != nil {
- return err
- }
- for _, v := range filesToChown {
- err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
- // Some files might not be present.
- if err != nil && !errors.Is(err, os.ErrNotExist) {
- return err
- }
- }
- }
- return nil
- }
// cgroupFilesToChown returns the names of the cgroup files that should be
// chowned to the delegate uid.
//
// The kernel exposes this list, one name per line, in
// /sys/kernel/cgroup/delegate. If that file cannot be opened (it is not
// present on Linux < 4.15), the initial values mentioned in cgroups(7) are
// returned instead. An error is returned only when reading an opened file
// fails.
func cgroupFilesToChown() ([]string, error) {
	const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"

	delegate, openErr := os.Open(cgroupDelegateFile)
	if openErr != nil {
		// Any failure to open is treated as "file not available".
		return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil
	}
	defer delegate.Close()

	// Start from an empty, non-nil slice so an empty file yields [] rather than nil.
	names := make([]string, 0)
	lines := bufio.NewScanner(delegate)
	for lines.Scan() {
		names = append(names, lines.Text())
	}
	if scanErr := lines.Err(); scanErr != nil {
		return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, scanErr)
	}

	return names, nil
}
- func (m *unifiedManager) Destroy() error {
- m.mu.Lock()
- defer m.mu.Unlock()
- unitName := getUnitName(m.cgroups)
- if err := stopUnit(m.dbus, unitName); err != nil {
- return err
- }
- // systemd 239 do not remove sub-cgroups.
- err := m.fsMgr.Destroy()
- // fsMgr.Destroy has handled ErrNotExist
- if err != nil {
- return err
- }
- return nil
- }
// Path returns the cgroup path of this manager. The string argument is
// ignored: the same path is returned regardless of its value.
func (m *unifiedManager) Path(_ string) string {
	return m.path
}
- // getSliceFull value is used in initPath.
- // The value is incompatible with systemdDbus.PropSlice.
- func (m *unifiedManager) getSliceFull() (string, error) {
- c := m.cgroups
- slice := "system.slice"
- if c.Rootless {
- slice = "user.slice"
- }
- if c.Parent != "" {
- var err error
- slice, err = ExpandSlice(c.Parent)
- if err != nil {
- return "", err
- }
- }
- if c.Rootless {
- // managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service".
- managerCG, err := getManagerProperty(m.dbus, "ControlGroup")
- if err != nil {
- return "", err
- }
- slice = filepath.Join(managerCG, slice)
- }
- // an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice"
- // NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified.
- return slice, nil
- }
- func (m *unifiedManager) initPath() error {
- if m.path != "" {
- return nil
- }
- sliceFull, err := m.getSliceFull()
- if err != nil {
- return err
- }
- c := m.cgroups
- path := filepath.Join(sliceFull, getUnitName(c))
- path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path)
- if err != nil {
- return err
- }
- // an example of the final path in rootless:
- // "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope"
- m.path = path
- return nil
- }
// Freeze passes the requested freezer state to the fs manager and returns
// its result; systemd is not involved.
func (m *unifiedManager) Freeze(state configs.FreezerState) error {
	return m.fsMgr.Freeze(state)
}
// GetPids returns the result of cgroups.GetPids for this manager's cgroup
// path.
func (m *unifiedManager) GetPids() ([]int, error) {
	return cgroups.GetPids(m.path)
}
// GetAllPids returns the result of cgroups.GetAllPids for this manager's
// cgroup path.
// NOTE(review): presumably this also covers sub-cgroups, unlike GetPids --
// confirm against cgroups.GetAllPids.
func (m *unifiedManager) GetAllPids() ([]int, error) {
	return cgroups.GetAllPids(m.path)
}
// GetStats returns the statistics reported by the fs manager.
func (m *unifiedManager) GetStats() (*cgroups.Stats, error) {
	return m.fsMgr.GetStats()
}
- func (m *unifiedManager) Set(r *configs.Resources) error {
- if r == nil {
- return nil
- }
- properties, err := genV2ResourcesProperties(r, m.dbus)
- if err != nil {
- return err
- }
- if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil {
- return fmt.Errorf("unable to set unit properties: %w", err)
- }
- return m.fsMgr.Set(r)
- }
- func (m *unifiedManager) GetPaths() map[string]string {
- paths := make(map[string]string, 1)
- paths[""] = m.path
- return paths
- }
// GetCgroups returns the cgroup configuration this manager was created
// with. The pointer is returned as is (no copy), and the error is always nil.
func (m *unifiedManager) GetCgroups() (*configs.Cgroup, error) {
	return m.cgroups, nil
}
// GetFreezerState returns the freezer state reported by the fs manager.
func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
	return m.fsMgr.GetFreezerState()
}
// Exists reports the result of cgroups.PathExists for this manager's cgroup
// path.
func (m *unifiedManager) Exists() bool {
	return cgroups.PathExists(m.path)
}
// OOMKillCount returns the OOM kill count reported by the fs manager.
func (m *unifiedManager) OOMKillCount() (uint64, error) {
	return m.fsMgr.OOMKillCount()
}
|