collector.go 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. //go:build linux
  2. // +build linux
  3. // Copyright 2021 Google Inc. All Rights Reserved.
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License");
  6. // you may not use this file except in compliance with the License.
  7. // You may obtain a copy of the License at
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS,
  13. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. // See the License for the specific language governing permissions and
  15. // limitations under the License.
  16. // Collector of resctrl for a container.
  17. package resctrl
  18. import (
  19. "fmt"
  20. "os"
  21. "path/filepath"
  22. "strings"
  23. "sync"
  24. "time"
  25. "k8s.io/klog/v2"
  26. info "github.com/google/cadvisor/info/v1"
  27. )
  28. const noInterval = 0
  29. type collector struct {
  30. id string
  31. interval time.Duration
  32. getContainerPids func() ([]string, error)
  33. resctrlPath string
  34. running bool
  35. destroyed bool
  36. numberOfNUMANodes int
  37. vendorID string
  38. mu sync.Mutex
  39. inHostNamespace bool
  40. }
  41. func newCollector(id string, getContainerPids func() ([]string, error), interval time.Duration, numberOfNUMANodes int, vendorID string, inHostNamespace bool) *collector {
  42. return &collector{id: id, interval: interval, getContainerPids: getContainerPids, numberOfNUMANodes: numberOfNUMANodes,
  43. vendorID: vendorID, mu: sync.Mutex{}, inHostNamespace: inHostNamespace}
  44. }
  45. func (c *collector) setup() error {
  46. var err error
  47. c.resctrlPath, err = prepareMonitoringGroup(c.id, c.getContainerPids, c.inHostNamespace)
  48. if c.interval != noInterval {
  49. if err != nil {
  50. klog.Errorf("Failed to setup container %q resctrl collector: %s \n Trying again in next intervals.", c.id, err)
  51. } else {
  52. c.running = true
  53. }
  54. go func() {
  55. for {
  56. time.Sleep(c.interval)
  57. c.mu.Lock()
  58. if c.destroyed {
  59. break
  60. }
  61. klog.V(5).Infof("Trying to check %q containers control group.", c.id)
  62. if c.running {
  63. err = c.checkMonitoringGroup()
  64. if err != nil {
  65. c.running = false
  66. klog.Errorf("Failed to check %q resctrl collector control group: %s \n Trying again in next intervals.", c.id, err)
  67. }
  68. } else {
  69. c.resctrlPath, err = prepareMonitoringGroup(c.id, c.getContainerPids, c.inHostNamespace)
  70. if err != nil {
  71. c.running = false
  72. klog.Errorf("Failed to setup container %q resctrl collector: %s \n Trying again in next intervals.", c.id, err)
  73. }
  74. }
  75. c.mu.Unlock()
  76. }
  77. }()
  78. } else {
  79. // There is no interval set, if setup fail, stop.
  80. if err != nil {
  81. return fmt.Errorf("failed to setup container %q resctrl collector: %w", c.id, err)
  82. }
  83. c.running = true
  84. }
  85. return nil
  86. }
  87. func (c *collector) checkMonitoringGroup() error {
  88. newPath, err := prepareMonitoringGroup(c.id, c.getContainerPids, c.inHostNamespace)
  89. if err != nil {
  90. return fmt.Errorf("couldn't obtain mon_group path: %v", err)
  91. }
  92. // Check if container moved between control groups.
  93. if newPath != c.resctrlPath {
  94. err = c.clear()
  95. if err != nil {
  96. return fmt.Errorf("couldn't clear previous monitoring group: %w", err)
  97. }
  98. c.resctrlPath = newPath
  99. }
  100. return nil
  101. }
  102. func (c *collector) UpdateStats(stats *info.ContainerStats) error {
  103. c.mu.Lock()
  104. defer c.mu.Unlock()
  105. if c.running {
  106. stats.Resctrl = info.ResctrlStats{}
  107. resctrlStats, err := getIntelRDTStatsFrom(c.resctrlPath, c.vendorID)
  108. if err != nil {
  109. return err
  110. }
  111. stats.Resctrl.MemoryBandwidth = make([]info.MemoryBandwidthStats, 0, c.numberOfNUMANodes)
  112. stats.Resctrl.Cache = make([]info.CacheStats, 0, c.numberOfNUMANodes)
  113. for _, numaNodeStats := range *resctrlStats.MBMStats {
  114. stats.Resctrl.MemoryBandwidth = append(stats.Resctrl.MemoryBandwidth,
  115. info.MemoryBandwidthStats{
  116. TotalBytes: numaNodeStats.MBMTotalBytes,
  117. LocalBytes: numaNodeStats.MBMLocalBytes,
  118. })
  119. }
  120. for _, numaNodeStats := range *resctrlStats.CMTStats {
  121. stats.Resctrl.Cache = append(stats.Resctrl.Cache,
  122. info.CacheStats{LLCOccupancy: numaNodeStats.LLCOccupancy})
  123. }
  124. }
  125. return nil
  126. }
  127. func (c *collector) Destroy() {
  128. c.mu.Lock()
  129. defer c.mu.Unlock()
  130. c.running = false
  131. err := c.clear()
  132. if err != nil {
  133. klog.Errorf("trying to destroy %q resctrl collector but: %v", c.id, err)
  134. }
  135. c.destroyed = true
  136. }
  137. func (c *collector) clear() error {
  138. // Not allowed to remove root or undefined resctrl directory.
  139. if c.id != rootContainer && c.resctrlPath != "" {
  140. // Remove only own prepared mon group.
  141. if strings.HasPrefix(filepath.Base(c.resctrlPath), monGroupPrefix) {
  142. err := os.RemoveAll(c.resctrlPath)
  143. if err != nil {
  144. return fmt.Errorf("couldn't clear mon_group: %v", err)
  145. }
  146. }
  147. }
  148. return nil
  149. }