v1.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477
  1. package systemd
  2. import (
  3. "errors"
  4. "os"
  5. "path/filepath"
  6. "reflect"
  7. "strings"
  8. "sync"
  9. systemdDbus "github.com/coreos/go-systemd/v22/dbus"
  10. "github.com/godbus/dbus/v5"
  11. "github.com/sirupsen/logrus"
  12. "github.com/opencontainers/runc/libcontainer/cgroups"
  13. "github.com/opencontainers/runc/libcontainer/cgroups/fs"
  14. "github.com/opencontainers/runc/libcontainer/configs"
  15. )
  16. type legacyManager struct {
  17. mu sync.Mutex
  18. cgroups *configs.Cgroup
  19. paths map[string]string
  20. dbus *dbusConnManager
  21. }
  22. func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
  23. if cg.Rootless {
  24. return nil, errors.New("cannot use rootless systemd cgroups manager on cgroup v1")
  25. }
  26. if cg.Resources != nil && cg.Resources.Unified != nil {
  27. return nil, cgroups.ErrV1NoUnified
  28. }
  29. if paths == nil {
  30. var err error
  31. paths, err = initPaths(cg)
  32. if err != nil {
  33. return nil, err
  34. }
  35. }
  36. return &legacyManager{
  37. cgroups: cg,
  38. paths: paths,
  39. dbus: newDbusConnManager(false),
  40. }, nil
  41. }
  42. type subsystem interface {
  43. // Name returns the name of the subsystem.
  44. Name() string
  45. // Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
  46. GetStats(path string, stats *cgroups.Stats) error
  47. // Set sets cgroup resource limits.
  48. Set(path string, r *configs.Resources) error
  49. }
  50. var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
  51. var legacySubsystems = []subsystem{
  52. &fs.CpusetGroup{},
  53. &fs.DevicesGroup{},
  54. &fs.MemoryGroup{},
  55. &fs.CpuGroup{},
  56. &fs.CpuacctGroup{},
  57. &fs.PidsGroup{},
  58. &fs.BlkioGroup{},
  59. &fs.HugetlbGroup{},
  60. &fs.PerfEventGroup{},
  61. &fs.FreezerGroup{},
  62. &fs.NetPrioGroup{},
  63. &fs.NetClsGroup{},
  64. &fs.NameGroup{GroupName: "name=systemd"},
  65. &fs.RdmaGroup{},
  66. }
  67. func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
  68. var properties []systemdDbus.Property
  69. deviceProperties, err := generateDeviceProperties(r)
  70. if err != nil {
  71. return nil, err
  72. }
  73. properties = append(properties, deviceProperties...)
  74. if r.Memory != 0 {
  75. properties = append(properties,
  76. newProp("MemoryLimit", uint64(r.Memory)))
  77. }
  78. if r.CpuShares != 0 {
  79. properties = append(properties,
  80. newProp("CPUShares", r.CpuShares))
  81. }
  82. addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
  83. if r.BlkioWeight != 0 {
  84. properties = append(properties,
  85. newProp("BlockIOWeight", uint64(r.BlkioWeight)))
  86. }
  87. if r.PidsLimit > 0 || r.PidsLimit == -1 {
  88. properties = append(properties,
  89. newProp("TasksMax", uint64(r.PidsLimit)))
  90. }
  91. err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
  92. if err != nil {
  93. return nil, err
  94. }
  95. return properties, nil
  96. }
  97. // initPaths figures out and returns paths to cgroups.
  98. func initPaths(c *configs.Cgroup) (map[string]string, error) {
  99. slice := "system.slice"
  100. if c.Parent != "" {
  101. var err error
  102. slice, err = ExpandSlice(c.Parent)
  103. if err != nil {
  104. return nil, err
  105. }
  106. }
  107. unit := getUnitName(c)
  108. paths := make(map[string]string)
  109. for _, s := range legacySubsystems {
  110. subsystemPath, err := getSubsystemPath(slice, unit, s.Name())
  111. if err != nil {
  112. // Even if it's `not found` error, we'll return err
  113. // because devices cgroup is hard requirement for
  114. // container security.
  115. if s.Name() == "devices" {
  116. return nil, err
  117. }
  118. // Don't fail if a cgroup hierarchy was not found, just skip this subsystem
  119. if cgroups.IsNotFound(err) {
  120. continue
  121. }
  122. return nil, err
  123. }
  124. paths[s.Name()] = subsystemPath
  125. }
  126. // If systemd is using cgroups-hybrid mode then add the slice path of
  127. // this container to the paths so the following process executed with
  128. // "runc exec" joins that cgroup as well.
  129. if cgroups.IsCgroup2HybridMode() {
  130. // "" means cgroup-hybrid path
  131. cgroupsHybridPath, err := getSubsystemPath(slice, unit, "")
  132. if err != nil && cgroups.IsNotFound(err) {
  133. return nil, err
  134. }
  135. paths[""] = cgroupsHybridPath
  136. }
  137. return paths, nil
  138. }
  139. func (m *legacyManager) Apply(pid int) error {
  140. var (
  141. c = m.cgroups
  142. unitName = getUnitName(c)
  143. slice = "system.slice"
  144. properties []systemdDbus.Property
  145. )
  146. m.mu.Lock()
  147. defer m.mu.Unlock()
  148. if c.Parent != "" {
  149. slice = c.Parent
  150. }
  151. properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
  152. if strings.HasSuffix(unitName, ".slice") {
  153. // If we create a slice, the parent is defined via a Wants=.
  154. properties = append(properties, systemdDbus.PropWants(slice))
  155. } else {
  156. // Otherwise it's a scope, which we put into a Slice=.
  157. properties = append(properties, systemdDbus.PropSlice(slice))
  158. // Assume scopes always support delegation (supported since systemd v218).
  159. properties = append(properties, newProp("Delegate", true))
  160. }
  161. // only add pid if its valid, -1 is used w/ general slice creation.
  162. if pid != -1 {
  163. properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
  164. }
  165. // Always enable accounting, this gets us the same behaviour as the fs implementation,
  166. // plus the kernel has some problems with joining the memory cgroup at a later time.
  167. properties = append(properties,
  168. newProp("MemoryAccounting", true),
  169. newProp("CPUAccounting", true),
  170. newProp("BlockIOAccounting", true),
  171. newProp("TasksAccounting", true),
  172. )
  173. // Assume DefaultDependencies= will always work (the check for it was previously broken.)
  174. properties = append(properties,
  175. newProp("DefaultDependencies", false))
  176. properties = append(properties, c.SystemdProps...)
  177. if err := startUnit(m.dbus, unitName, properties); err != nil {
  178. return err
  179. }
  180. if err := m.joinCgroups(pid); err != nil {
  181. return err
  182. }
  183. return nil
  184. }
  185. func (m *legacyManager) Destroy() error {
  186. m.mu.Lock()
  187. defer m.mu.Unlock()
  188. stopErr := stopUnit(m.dbus, getUnitName(m.cgroups))
  189. // Both on success and on error, cleanup all the cgroups
  190. // we are aware of, as some of them were created directly
  191. // by Apply() and are not managed by systemd.
  192. if err := cgroups.RemovePaths(m.paths); err != nil && stopErr == nil {
  193. return err
  194. }
  195. return stopErr
  196. }
  197. func (m *legacyManager) Path(subsys string) string {
  198. m.mu.Lock()
  199. defer m.mu.Unlock()
  200. return m.paths[subsys]
  201. }
  202. func (m *legacyManager) joinCgroups(pid int) error {
  203. for _, sys := range legacySubsystems {
  204. name := sys.Name()
  205. switch name {
  206. case "name=systemd":
  207. // let systemd handle this
  208. case "cpuset":
  209. if path, ok := m.paths[name]; ok {
  210. s := &fs.CpusetGroup{}
  211. if err := s.ApplyDir(path, m.cgroups.Resources, pid); err != nil {
  212. return err
  213. }
  214. }
  215. default:
  216. if path, ok := m.paths[name]; ok {
  217. if err := os.MkdirAll(path, 0o755); err != nil {
  218. return err
  219. }
  220. if err := cgroups.WriteCgroupProc(path, pid); err != nil {
  221. return err
  222. }
  223. }
  224. }
  225. }
  226. return nil
  227. }
  228. func getSubsystemPath(slice, unit, subsystem string) (string, error) {
  229. mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem)
  230. if err != nil {
  231. return "", err
  232. }
  233. initPath, err := cgroups.GetInitCgroup(subsystem)
  234. if err != nil {
  235. return "", err
  236. }
  237. // if pid 1 is systemd 226 or later, it will be in init.scope, not the root
  238. initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope")
  239. return filepath.Join(mountpoint, initPath, slice, unit), nil
  240. }
  241. func (m *legacyManager) Freeze(state configs.FreezerState) error {
  242. err := m.doFreeze(state)
  243. if err == nil {
  244. m.cgroups.Resources.Freezer = state
  245. }
  246. return err
  247. }
  248. // doFreeze is the same as Freeze but without
  249. // changing the m.cgroups.Resources.Frozen field.
  250. func (m *legacyManager) doFreeze(state configs.FreezerState) error {
  251. path, ok := m.paths["freezer"]
  252. if !ok {
  253. return errSubsystemDoesNotExist
  254. }
  255. freezer := &fs.FreezerGroup{}
  256. resources := &configs.Resources{Freezer: state}
  257. return freezer.Set(path, resources)
  258. }
  259. func (m *legacyManager) GetPids() ([]int, error) {
  260. path, ok := m.paths["devices"]
  261. if !ok {
  262. return nil, errSubsystemDoesNotExist
  263. }
  264. return cgroups.GetPids(path)
  265. }
  266. func (m *legacyManager) GetAllPids() ([]int, error) {
  267. path, ok := m.paths["devices"]
  268. if !ok {
  269. return nil, errSubsystemDoesNotExist
  270. }
  271. return cgroups.GetAllPids(path)
  272. }
  273. func (m *legacyManager) GetStats() (*cgroups.Stats, error) {
  274. m.mu.Lock()
  275. defer m.mu.Unlock()
  276. stats := cgroups.NewStats()
  277. for _, sys := range legacySubsystems {
  278. path := m.paths[sys.Name()]
  279. if path == "" {
  280. continue
  281. }
  282. if err := sys.GetStats(path, stats); err != nil {
  283. return nil, err
  284. }
  285. }
  286. return stats, nil
  287. }
  288. // freezeBeforeSet answers whether there is a need to freeze the cgroup before
  289. // applying its systemd unit properties, and thaw after, while avoiding
  290. // unnecessary freezer state changes.
  291. //
  292. // The reason why we have to freeze is that systemd's application of device
  293. // rules is done disruptively, resulting in spurious errors to common devices
  294. // (unlike our fs driver, they will happily write deny-all rules to running
  295. // containers). So we have to freeze the container to avoid the container get
  296. // an occasional "permission denied" error.
  297. func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) (needsFreeze, needsThaw bool, err error) {
  298. // Special case for SkipDevices, as used by Kubernetes to create pod
  299. // cgroups with allow-all device policy).
  300. if r.SkipDevices {
  301. if r.SkipFreezeOnSet {
  302. // Both needsFreeze and needsThaw are false.
  303. return
  304. }
  305. // No need to freeze if SkipDevices is set, and either
  306. // (1) systemd unit does not (yet) exist, or
  307. // (2) it has DevicePolicy=auto and empty DeviceAllow list.
  308. //
  309. // Interestingly, (1) and (2) are the same here because
  310. // a non-existent unit returns default properties,
  311. // and settings in (2) are the defaults.
  312. //
  313. // Do not return errors from getUnitTypeProperty, as they alone
  314. // should not prevent Set from working.
  315. unitType := getUnitType(unitName)
  316. devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy")
  317. if e == nil && devPolicy.Value == dbus.MakeVariant("auto") {
  318. devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow")
  319. if e == nil {
  320. if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 {
  321. needsFreeze = false
  322. needsThaw = false
  323. return
  324. }
  325. }
  326. }
  327. }
  328. needsFreeze = true
  329. needsThaw = true
  330. // Check the current freezer state.
  331. freezerState, err := m.GetFreezerState()
  332. if err != nil {
  333. return
  334. }
  335. if freezerState == configs.Frozen {
  336. // Already frozen, and should stay frozen.
  337. needsFreeze = false
  338. needsThaw = false
  339. }
  340. if r.Freezer == configs.Frozen {
  341. // Will be frozen anyway -- no need to thaw.
  342. needsThaw = false
  343. }
  344. return
  345. }
  346. func (m *legacyManager) Set(r *configs.Resources) error {
  347. if r == nil {
  348. return nil
  349. }
  350. if r.Unified != nil {
  351. return cgroups.ErrV1NoUnified
  352. }
  353. properties, err := genV1ResourcesProperties(r, m.dbus)
  354. if err != nil {
  355. return err
  356. }
  357. unitName := getUnitName(m.cgroups)
  358. needsFreeze, needsThaw, err := m.freezeBeforeSet(unitName, r)
  359. if err != nil {
  360. return err
  361. }
  362. if needsFreeze {
  363. if err := m.doFreeze(configs.Frozen); err != nil {
  364. // If freezer cgroup isn't supported, we just warn about it.
  365. logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
  366. }
  367. }
  368. setErr := setUnitProperties(m.dbus, unitName, properties...)
  369. if needsThaw {
  370. if err := m.doFreeze(configs.Thawed); err != nil {
  371. logrus.Infof("thaw container after SetUnitProperties failed: %v", err)
  372. }
  373. }
  374. if setErr != nil {
  375. return setErr
  376. }
  377. for _, sys := range legacySubsystems {
  378. // Get the subsystem path, but don't error out for not found cgroups.
  379. path, ok := m.paths[sys.Name()]
  380. if !ok {
  381. continue
  382. }
  383. if err := sys.Set(path, r); err != nil {
  384. return err
  385. }
  386. }
  387. return nil
  388. }
  389. func (m *legacyManager) GetPaths() map[string]string {
  390. m.mu.Lock()
  391. defer m.mu.Unlock()
  392. return m.paths
  393. }
  394. func (m *legacyManager) GetCgroups() (*configs.Cgroup, error) {
  395. return m.cgroups, nil
  396. }
  397. func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
  398. path, ok := m.paths["freezer"]
  399. if !ok {
  400. return configs.Undefined, nil
  401. }
  402. freezer := &fs.FreezerGroup{}
  403. return freezer.GetState(path)
  404. }
  405. func (m *legacyManager) Exists() bool {
  406. return cgroups.PathExists(m.Path("devices"))
  407. }
  408. func (m *legacyManager) OOMKillCount() (uint64, error) {
  409. return fs.OOMKillCount(m.Path("memory"))
  410. }