freezer.go 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. package fs
  2. import (
  3. "errors"
  4. "fmt"
  5. "os"
  6. "strings"
  7. "time"
  8. "github.com/opencontainers/runc/libcontainer/cgroups"
  9. "github.com/opencontainers/runc/libcontainer/configs"
  10. "github.com/sirupsen/logrus"
  11. "golang.org/x/sys/unix"
  12. )
  13. type FreezerGroup struct{}
  14. func (s *FreezerGroup) Name() string {
  15. return "freezer"
  16. }
  17. func (s *FreezerGroup) Apply(path string, _ *configs.Resources, pid int) error {
  18. return apply(path, pid)
  19. }
  20. func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
  21. switch r.Freezer {
  22. case configs.Frozen:
  23. defer func() {
  24. if Err != nil {
  25. // Freezing failed, and it is bad and dangerous
  26. // to leave the cgroup in FROZEN or FREEZING
  27. // state, so (try to) thaw it back.
  28. _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
  29. }
  30. }()
  31. // As per older kernel docs (freezer-subsystem.txt before
  32. // kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
  33. // userspace should either retry or thaw. While current
  34. // kernel cgroup v1 docs no longer mention a need to retry,
  35. // even a recent kernel (v5.4, Ubuntu 20.04) can't reliably
  36. // freeze a cgroup v1 while new processes keep appearing in it
  37. // (either via fork/clone or by writing new PIDs to
  38. // cgroup.procs).
  39. //
  40. // The numbers below are empirically chosen to have a decent
  41. // chance to succeed in various scenarios ("runc pause/unpause
  42. // with parallel runc exec" and "bare freeze/unfreeze on a very
  43. // slow system"), tested on RHEL7 and Ubuntu 20.04 kernels.
  44. //
  45. // Adding any amount of sleep in between retries did not
  46. // increase the chances of successful freeze in "pause/unpause
  47. // with parallel exec" reproducer. OTOH, adding an occasional
  48. // sleep helped for the case where the system is extremely slow
  49. // (CentOS 7 VM on GHA CI).
  50. //
  51. // Alas, this is still a game of chances, since the real fix
  52. // belong to the kernel (cgroup v2 do not have this bug).
  53. for i := 0; i < 1000; i++ {
  54. if i%50 == 49 {
  55. // Occasional thaw and sleep improves
  56. // the chances to succeed in freezing
  57. // in case new processes keep appearing
  58. // in the cgroup.
  59. _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
  60. time.Sleep(10 * time.Millisecond)
  61. }
  62. if err := cgroups.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil {
  63. return err
  64. }
  65. if i%25 == 24 {
  66. // Occasional short sleep before reading
  67. // the state back also improves the chances to
  68. // succeed in freezing in case of a very slow
  69. // system.
  70. time.Sleep(10 * time.Microsecond)
  71. }
  72. state, err := cgroups.ReadFile(path, "freezer.state")
  73. if err != nil {
  74. return err
  75. }
  76. state = strings.TrimSpace(state)
  77. switch state {
  78. case "FREEZING":
  79. continue
  80. case string(configs.Frozen):
  81. if i > 1 {
  82. logrus.Debugf("frozen after %d retries", i)
  83. }
  84. return nil
  85. default:
  86. // should never happen
  87. return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state))
  88. }
  89. }
  90. // Despite our best efforts, it got stuck in FREEZING.
  91. return errors.New("unable to freeze")
  92. case configs.Thawed:
  93. return cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
  94. case configs.Undefined:
  95. return nil
  96. default:
  97. return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer))
  98. }
  99. }
  100. func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
  101. return nil
  102. }
  103. func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) {
  104. for {
  105. state, err := cgroups.ReadFile(path, "freezer.state")
  106. if err != nil {
  107. // If the kernel is too old, then we just treat the freezer as
  108. // being in an "undefined" state.
  109. if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) {
  110. err = nil
  111. }
  112. return configs.Undefined, err
  113. }
  114. switch strings.TrimSpace(state) {
  115. case "THAWED":
  116. return configs.Thawed, nil
  117. case "FROZEN":
  118. // Find out whether the cgroup is frozen directly,
  119. // or indirectly via an ancestor.
  120. self, err := cgroups.ReadFile(path, "freezer.self_freezing")
  121. if err != nil {
  122. // If the kernel is too old, then we just treat
  123. // it as being frozen.
  124. if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) {
  125. err = nil
  126. }
  127. return configs.Frozen, err
  128. }
  129. switch self {
  130. case "0\n":
  131. return configs.Thawed, nil
  132. case "1\n":
  133. return configs.Frozen, nil
  134. default:
  135. return configs.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self)
  136. }
  137. case "FREEZING":
  138. // Make sure we get a stable freezer state, so retry if the cgroup
  139. // is still undergoing freezing. This should be a temporary delay.
  140. time.Sleep(1 * time.Millisecond)
  141. continue
  142. default:
  143. return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state)
  144. }
  145. }
  146. }