common.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546
  1. package systemd
  2. import (
  3. "bufio"
  4. "context"
  5. "errors"
  6. "fmt"
  7. "math"
  8. "os"
  9. "regexp"
  10. "strconv"
  11. "strings"
  12. "sync"
  13. "time"
  14. systemdDbus "github.com/coreos/go-systemd/v22/dbus"
  15. dbus "github.com/godbus/dbus/v5"
  16. "github.com/sirupsen/logrus"
  17. cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
  18. "github.com/opencontainers/runc/libcontainer/configs"
  19. "github.com/opencontainers/runc/libcontainer/devices"
  20. )
const (
	// defCPUQuotaPeriod is the CPU quota period assumed when a quota is
	// configured without an explicit period.
	//
	// Default kernel value for cpu quota period is 100000 us (100 ms), same for v1 and v2.
	// v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and
	// v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
	defCPUQuotaPeriod = uint64(100000)
)
var (
	// versionOnce guards the one-time initialisation of version, which is
	// performed by systemdVersion.
	versionOnce sync.Once
	// version caches the value computed by systemdVersion; systemdVersion
	// sets it to -1 before attempting to query systemd.
	version int

	// isRunningSystemdOnce guards the one-time initialisation of
	// isRunningSystemd, which is performed by IsRunningSystemd.
	isRunningSystemdOnce sync.Once
	// isRunningSystemd caches the result of the /run/systemd/system check.
	isRunningSystemd bool
)
  33. // NOTE: This function comes from package github.com/coreos/go-systemd/util
  34. // It was borrowed here to avoid a dependency on cgo.
  35. //
  36. // IsRunningSystemd checks whether the host was booted with systemd as its init
  37. // system. This functions similarly to systemd's `sd_booted(3)`: internally, it
  38. // checks whether /run/systemd/system/ exists and is a directory.
  39. // http://www.freedesktop.org/software/systemd/man/sd_booted.html
  40. func IsRunningSystemd() bool {
  41. isRunningSystemdOnce.Do(func() {
  42. fi, err := os.Lstat("/run/systemd/system")
  43. isRunningSystemd = err == nil && fi.IsDir()
  44. })
  45. return isRunningSystemd
  46. }
  47. // systemd represents slice hierarchy using `-`, so we need to follow suit when
  48. // generating the path of slice. Essentially, test-a-b.slice becomes
  49. // /test.slice/test-a.slice/test-a-b.slice.
  50. func ExpandSlice(slice string) (string, error) {
  51. suffix := ".slice"
  52. // Name has to end with ".slice", but can't be just ".slice".
  53. if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
  54. return "", fmt.Errorf("invalid slice name: %s", slice)
  55. }
  56. // Path-separators are not allowed.
  57. if strings.Contains(slice, "/") {
  58. return "", fmt.Errorf("invalid slice name: %s", slice)
  59. }
  60. var path, prefix string
  61. sliceName := strings.TrimSuffix(slice, suffix)
  62. // if input was -.slice, we should just return root now
  63. if sliceName == "-" {
  64. return "/", nil
  65. }
  66. for _, component := range strings.Split(sliceName, "-") {
  67. // test--a.slice isn't permitted, nor is -test.slice.
  68. if component == "" {
  69. return "", fmt.Errorf("invalid slice name: %s", slice)
  70. }
  71. // Append the component to the path and to the prefix.
  72. path += "/" + prefix + component + suffix
  73. prefix += component + "-"
  74. }
  75. return path, nil
  76. }
  77. func groupPrefix(ruleType devices.Type) (string, error) {
  78. switch ruleType {
  79. case devices.BlockDevice:
  80. return "block-", nil
  81. case devices.CharDevice:
  82. return "char-", nil
  83. default:
  84. return "", fmt.Errorf("device type %v has no group prefix", ruleType)
  85. }
  86. }
  87. // findDeviceGroup tries to find the device group name (as listed in
  88. // /proc/devices) with the type prefixed as required for DeviceAllow, for a
  89. // given (type, major) combination. If more than one device group exists, an
  90. // arbitrary one is chosen.
  91. func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
  92. fh, err := os.Open("/proc/devices")
  93. if err != nil {
  94. return "", err
  95. }
  96. defer fh.Close()
  97. prefix, err := groupPrefix(ruleType)
  98. if err != nil {
  99. return "", err
  100. }
  101. scanner := bufio.NewScanner(fh)
  102. var currentType devices.Type
  103. for scanner.Scan() {
  104. // We need to strip spaces because the first number is column-aligned.
  105. line := strings.TrimSpace(scanner.Text())
  106. // Handle the "header" lines.
  107. switch line {
  108. case "Block devices:":
  109. currentType = devices.BlockDevice
  110. continue
  111. case "Character devices:":
  112. currentType = devices.CharDevice
  113. continue
  114. case "":
  115. continue
  116. }
  117. // Skip lines unrelated to our type.
  118. if currentType != ruleType {
  119. continue
  120. }
  121. // Parse out the (major, name).
  122. var (
  123. currMajor int64
  124. currName string
  125. )
  126. if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
  127. if err == nil {
  128. err = errors.New("wrong number of fields")
  129. }
  130. return "", fmt.Errorf("scan /proc/devices line %q: %w", line, err)
  131. }
  132. if currMajor == ruleMajor {
  133. return prefix + currName, nil
  134. }
  135. }
  136. if err := scanner.Err(); err != nil {
  137. return "", fmt.Errorf("reading /proc/devices: %w", err)
  138. }
  139. // Couldn't find the device group.
  140. return "", nil
  141. }
// deviceAllowEntry is a single element of the systemd DeviceAllow property.
//
// DeviceAllow is the dbus type "a(ss)" which means we need a struct
// to represent it in Go.
type deviceAllowEntry struct {
	// Path is the device specifier; generateDeviceProperties fills it with a
	// /dev/{block,char}/MAJOR:MINOR path, a prefixed device group name, or a
	// "block-*"/"char-*" glob.
	Path string
	// Perms is the permission string, taken from the rule's Permissions.
	Perms string
}
  148. func allowAllDevices() []systemdDbus.Property {
  149. // Setting mode to auto and removing all DeviceAllow rules
  150. // results in allowing access to all devices.
  151. return []systemdDbus.Property{
  152. newProp("DevicePolicy", "auto"),
  153. newProp("DeviceAllow", []deviceAllowEntry{}),
  154. }
  155. }
// generateDeviceProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
//
// It returns (nil, nil) when r.SkipDevices is set. Otherwise the rules in
// r.Devices are fed through a cgroupdevices.Emulator and the outcome is one of:
//
//   - an allow-all configuration: the properties from allowAllDevices;
//   - any other blacklist configuration: DevicePolicy=strict with an empty
//     DeviceAllow (a deny-all whitelist), plus a logged warning;
//   - a whitelist configuration: DevicePolicy=strict, an empty DeviceAllow
//     that clears the list, and a DeviceAllow holding the translated rules.
//
// Rules that cannot be expressed for systemd ("*:n" rules, unknown device
// groups, device paths that do not exist) are skipped with a warning rather
// than treated as errors.
func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, error) {
	if r.SkipDevices {
		return nil, nil
	}

	properties := []systemdDbus.Property{
		// Always run in the strictest white-list mode.
		newProp("DevicePolicy", "strict"),
		// Empty the DeviceAllow array before filling it.
		newProp("DeviceAllow", []deviceAllowEntry{}),
	}

	// Figure out the set of rules.
	configEmu := &cgroupdevices.Emulator{}
	for _, rule := range r.Devices {
		if err := configEmu.Apply(*rule); err != nil {
			return nil, fmt.Errorf("unable to apply rule for systemd: %w", err)
		}
	}
	// systemd doesn't support blacklists. So we log a warning, and tell
	// systemd to act as a deny-all whitelist. This ruleset will be replaced
	// with our normal fallback code. This may result in spurious errors, but
	// the only other option is to error out here.
	if configEmu.IsBlacklist() {
		// However, if we're dealing with an allow-all rule then we can do it.
		if configEmu.IsAllowAll() {
			return allowAllDevices(), nil
		}
		logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
		return properties, nil
	}

	// Now generate the set of rules we actually need to apply. Unlike the
	// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
	// whitelist which is the default for devices.Emulator.
	finalRules, err := configEmu.Rules()
	if err != nil {
		return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err)
	}
	var deviceAllowList []deviceAllowEntry
	for _, rule := range finalRules {
		if !rule.Allow {
			// Should never happen.
			return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
		}
		// Only block and character devices can be expressed in DeviceAllow.
		switch rule.Type {
		case devices.BlockDevice, devices.CharDevice:
		default:
			// Should never happen.
			return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
		}

		entry := deviceAllowEntry{
			Perms: string(rule.Permissions),
		}

		// systemd has a fairly odd (though understandable) syntax here, and
		// because of the OCI configuration format we have to do quite a bit of
		// trickery to convert things:
		//
		//  * Concrete rules with non-wildcard major/minor numbers have to use
		//    /dev/{block,char} paths. This is slightly odd because it means
		//    that we cannot add whitelist rules for devices that don't exist,
		//    but there's not too much we can do about that.
		//
		//    However, path globbing is not supported for path-based rules so we
		//    need to handle wildcards in some other manner.
		//
		//  * Wildcard-minor rules have to specify a "device group name" (the
		//    second column in /proc/devices).
		//
		//  * Wildcard (major and minor) rules can just specify a glob with the
		//    type ("char-*" or "block-*").
		//
		// The only type of rule we can't handle is wildcard-major rules, and
		// so we'll give a warning in that case (note that the fallback code
		// will insert any rules systemd couldn't handle). What amazing fun.

		if rule.Major == devices.Wildcard {
			// "_ *:n _" rules aren't supported by systemd.
			if rule.Minor != devices.Wildcard {
				logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
				continue
			}

			// "_ *:* _" rules just wildcard everything.
			prefix, err := groupPrefix(rule.Type)
			if err != nil {
				return nil, err
			}
			entry.Path = prefix + "*"
		} else if rule.Minor == devices.Wildcard {
			// "_ n:* _" rules require a device group from /proc/devices.
			group, err := findDeviceGroup(rule.Type, rule.Major)
			if err != nil {
				return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err)
			}
			if group == "" {
				// Couldn't find a group.
				logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
				continue
			}
			entry.Path = group
		} else {
			// "_ n:m _" rules are just a path in /dev/{block,char}/.
			switch rule.Type {
			case devices.BlockDevice:
				entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
			case devices.CharDevice:
				entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
			}
			// systemd will issue a warning if the path we give here doesn't exist.
			// Since all of this logic is best-effort anyway (we manually set these
			// rules separately to systemd) we can safely skip entries that don't
			// have a corresponding path.
			if _, err := os.Stat(entry.Path); err != nil {
				// Also check /sys/dev so that we don't depend on /dev/{block,char}
				// being populated. (/dev/{block,char} is populated by udev, which
				// isn't strictly required for systemd). Ironically, this happens most
				// easily when starting containerd within a runc created container
				// itself.

				// We don't bother with securejoin here because we create entry.Path
				// right above here, so we know it's safe.
				if _, err := os.Stat("/sys" + entry.Path); err != nil {
					logrus.Warnf("skipping device %s for systemd: %s", entry.Path, err)
					continue
				}
			}
		}
		deviceAllowList = append(deviceAllowList, entry)
	}

	// This second DeviceAllow property carries the translated rules; the
	// empty one at the start of properties clears the list first.
	properties = append(properties, newProp("DeviceAllow", deviceAllowList))
	return properties, nil
}
  285. func newProp(name string, units interface{}) systemdDbus.Property {
  286. return systemdDbus.Property{
  287. Name: name,
  288. Value: dbus.MakeVariant(units),
  289. }
  290. }
  291. func getUnitName(c *configs.Cgroup) string {
  292. // by default, we create a scope unless the user explicitly asks for a slice.
  293. if !strings.HasSuffix(c.Name, ".slice") {
  294. return c.ScopePrefix + "-" + c.Name + ".scope"
  295. }
  296. return c.Name
  297. }
  298. // This code should be in sync with getUnitName.
  299. func getUnitType(unitName string) string {
  300. if strings.HasSuffix(unitName, ".slice") {
  301. return "Slice"
  302. }
  303. return "Scope"
  304. }
  305. // isDbusError returns true if the error is a specific dbus error.
  306. func isDbusError(err error, name string) bool {
  307. if err != nil {
  308. var derr dbus.Error
  309. if errors.As(err, &derr) {
  310. return strings.Contains(derr.Name, name)
  311. }
  312. }
  313. return false
  314. }
  315. // isUnitExists returns true if the error is that a systemd unit already exists.
  316. func isUnitExists(err error) bool {
  317. return isDbusError(err, "org.freedesktop.systemd1.UnitExists")
  318. }
  319. func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property) error {
  320. statusChan := make(chan string, 1)
  321. err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
  322. _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan)
  323. return err
  324. })
  325. if err == nil {
  326. timeout := time.NewTimer(30 * time.Second)
  327. defer timeout.Stop()
  328. select {
  329. case s := <-statusChan:
  330. close(statusChan)
  331. // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
  332. if s != "done" {
  333. resetFailedUnit(cm, unitName)
  334. return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
  335. }
  336. case <-timeout.C:
  337. resetFailedUnit(cm, unitName)
  338. return errors.New("Timeout waiting for systemd to create " + unitName)
  339. }
  340. } else if !isUnitExists(err) {
  341. return err
  342. }
  343. return nil
  344. }
  345. func stopUnit(cm *dbusConnManager, unitName string) error {
  346. statusChan := make(chan string, 1)
  347. err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
  348. _, err := c.StopUnitContext(context.TODO(), unitName, "replace", statusChan)
  349. return err
  350. })
  351. if err == nil {
  352. timeout := time.NewTimer(30 * time.Second)
  353. defer timeout.Stop()
  354. select {
  355. case s := <-statusChan:
  356. close(statusChan)
  357. // Please refer to https://godoc.org/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
  358. if s != "done" {
  359. logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s)
  360. }
  361. case <-timeout.C:
  362. return errors.New("Timed out while waiting for systemd to remove " + unitName)
  363. }
  364. }
  365. return nil
  366. }
  367. func resetFailedUnit(cm *dbusConnManager, name string) {
  368. err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
  369. return c.ResetFailedUnitContext(context.TODO(), name)
  370. })
  371. if err != nil {
  372. logrus.Warnf("unable to reset failed unit: %v", err)
  373. }
  374. }
  375. func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) {
  376. var prop *systemdDbus.Property
  377. err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) {
  378. prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName)
  379. return Err
  380. })
  381. return prop, err
  382. }
  383. func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error {
  384. return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
  385. return c.SetUnitPropertiesContext(context.TODO(), name, true, properties...)
  386. })
  387. }
  388. func getManagerProperty(cm *dbusConnManager, name string) (string, error) {
  389. str := ""
  390. err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
  391. var err error
  392. str, err = c.GetManagerProperty(name)
  393. return err
  394. })
  395. if err != nil {
  396. return "", err
  397. }
  398. return strconv.Unquote(str)
  399. }
  400. func systemdVersion(cm *dbusConnManager) int {
  401. versionOnce.Do(func() {
  402. version = -1
  403. verStr, err := getManagerProperty(cm, "Version")
  404. if err == nil {
  405. version, err = systemdVersionAtoi(verStr)
  406. }
  407. if err != nil {
  408. logrus.WithError(err).Error("unable to get systemd version")
  409. }
  410. })
  411. return version
  412. }
  413. func systemdVersionAtoi(verStr string) (int, error) {
  414. // verStr should be of the form:
  415. // "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" (without quotes).
  416. // The result for all of the above should be 245.
  417. // Thus, we unconditionally remove the "v" prefix
  418. // and then match on the first integer we can grab.
  419. re := regexp.MustCompile(`v?([0-9]+)`)
  420. matches := re.FindStringSubmatch(verStr)
  421. if len(matches) < 2 {
  422. return 0, fmt.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches)
  423. }
  424. ver, err := strconv.Atoi(matches[1])
  425. if err != nil {
  426. return -1, fmt.Errorf("can't parse version: %w", err)
  427. }
  428. return ver, nil
  429. }
  430. func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota int64, period uint64) {
  431. if period != 0 {
  432. // systemd only supports CPUQuotaPeriodUSec since v242
  433. sdVer := systemdVersion(cm)
  434. if sdVer >= 242 {
  435. *properties = append(*properties,
  436. newProp("CPUQuotaPeriodUSec", period))
  437. } else {
  438. logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodSec "+
  439. " (setting will still be applied to cgroupfs)", sdVer)
  440. }
  441. }
  442. if quota != 0 || period != 0 {
  443. // corresponds to USEC_INFINITY in systemd
  444. cpuQuotaPerSecUSec := uint64(math.MaxUint64)
  445. if quota > 0 {
  446. if period == 0 {
  447. // assume the default
  448. period = defCPUQuotaPeriod
  449. }
  450. // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
  451. // (integer percentage of CPU) internally. This means that if a fractional percent of
  452. // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
  453. // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
  454. cpuQuotaPerSecUSec = uint64(quota*1000000) / period
  455. if cpuQuotaPerSecUSec%10000 != 0 {
  456. cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
  457. }
  458. }
  459. *properties = append(*properties,
  460. newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
  461. }
  462. }
  463. func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems string) error {
  464. if cpus == "" && mems == "" {
  465. return nil
  466. }
  467. // systemd only supports AllowedCPUs/AllowedMemoryNodes since v244
  468. sdVer := systemdVersion(cm)
  469. if sdVer < 244 {
  470. logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+
  471. " (settings will still be applied to cgroupfs)", sdVer)
  472. return nil
  473. }
  474. if cpus != "" {
  475. bits, err := RangeToBits(cpus)
  476. if err != nil {
  477. return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w",
  478. cpus, err)
  479. }
  480. *props = append(*props,
  481. newProp("AllowedCPUs", bits))
  482. }
  483. if mems != "" {
  484. bits, err := RangeToBits(mems)
  485. if err != nil {
  486. return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w",
  487. mems, err)
  488. }
  489. *props = append(*props,
  490. newProp("AllowedMemoryNodes", bits))
  491. }
  492. return nil
  493. }