container.go 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762
  1. // Copyright 2014 Google Inc. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package manager
  15. import (
  16. "flag"
  17. "fmt"
  18. "io/ioutil"
  19. "math"
  20. "math/rand"
  21. "os/exec"
  22. "path"
  23. "regexp"
  24. "sort"
  25. "strconv"
  26. "strings"
  27. "sync"
  28. "sync/atomic"
  29. "time"
  30. "github.com/google/cadvisor/cache/memory"
  31. "github.com/google/cadvisor/collector"
  32. "github.com/google/cadvisor/container"
  33. info "github.com/google/cadvisor/info/v1"
  34. v2 "github.com/google/cadvisor/info/v2"
  35. "github.com/google/cadvisor/stats"
  36. "github.com/google/cadvisor/summary"
  37. "github.com/google/cadvisor/utils/cpuload"
  38. "github.com/docker/go-units"
  39. "k8s.io/klog/v2"
  40. "k8s.io/utils/clock"
  41. )
// Housekeeping interval.
// enableLoadReader gates creation of the cpu load reader in newContainerData.
var enableLoadReader = flag.Bool("enable_load_reader", false, "Whether to enable cpu load reader")

// HousekeepingInterval is the baseline interval between housekeeping passes;
// dynamic housekeeping may raise it up to maxHousekeepingInterval.
var HousekeepingInterval = flag.Duration("housekeeping_interval", 1*time.Second, "Interval between container housekeepings")

// TODO: replace regular expressions with something simpler, such as strings.Split().
// cgroup type chosen to fetch the cgroup path of a process.
// Memory has been chosen, as it is one of the default cgroups that is enabled for most containers...
var cgroupMemoryPathRegExp = regexp.MustCompile(`memory[^:]*:(.*?)[,;$]`)

// ... but there are systems (e.g. Raspberry Pi 4) where memory cgroup controller is disabled by default.
// We should check cpu cgroup then.
var cgroupCPUPathRegExp = regexp.MustCompile(`cpu[^:]*:(.*?)[,;$]`)
// containerInfo bundles a container's identity (embedded reference) with
// its subcontainer list and resource spec; it is the payload returned by
// containerData.GetInfo.
type containerInfo struct {
	info.ContainerReference
	Subcontainers []info.ContainerReference
	Spec          info.ContainerSpec
}
// containerData tracks a single container: its handler, cached info, and
// the state of its periodic housekeeping loop.
type containerData struct {
	// oomEvents is read atomically in updateStats; incremented elsewhere
	// (not visible in this file). NOTE(review): likely kept as the first
	// field for 64-bit atomic alignment on 32-bit platforms — confirm.
	oomEvents uint64
	handler   container.ContainerHandler
	info      containerInfo
	// memoryCache receives the stats samples pushed by updateStats.
	memoryCache *memory.InMemoryCache
	// lock guards info (Spec/Subcontainers) and statsLastUpdatedTime.
	lock          sync.Mutex
	loadReader    cpuload.CpuLoadReader
	summaryReader *summary.StatsSummary
	loadAvg       float64 // smoothed load average seen so far.
	// housekeepingInterval is the current (possibly dynamically raised)
	// interval; bounded by maxHousekeepingInterval.
	housekeepingInterval     time.Duration
	maxHousekeepingInterval  time.Duration
	allowDynamicHousekeeping bool
	// infoLastUpdatedTime drives GetInfo's 5-second spec-refresh cache.
	infoLastUpdatedTime  time.Time
	statsLastUpdatedTime time.Time
	// lastErrorTime rate-limits warning logs (see allowErrorLogging).
	lastErrorTime time.Time
	// used to track time
	clock clock.Clock
	// Decay value used for load average smoothing. Interval length of 10 seconds is used.
	loadDecay float64
	// Whether to log the usage of this container when it is updated.
	logUsage bool
	// Tells the container to stop.
	stop chan struct{}
	// Tells the container to immediately collect stats
	onDemandChan chan chan struct{}
	// Runs custom metric collectors.
	collectorManager collector.CollectorManager
	// nvidiaCollector updates stats for Nvidia GPUs attached to the container.
	nvidiaCollector stats.Collector
	// perfCollector updates stats for perf_event cgroup controller.
	perfCollector stats.Collector
	// resctrlCollector updates stats for resctrl controller.
	resctrlCollector stats.Collector
}
  91. // jitter returns a time.Duration between duration and duration + maxFactor * duration,
  92. // to allow clients to avoid converging on periodic behavior. If maxFactor is 0.0, a
  93. // suggested default value will be chosen.
  94. func jitter(duration time.Duration, maxFactor float64) time.Duration {
  95. if maxFactor <= 0.0 {
  96. maxFactor = 1.0
  97. }
  98. wait := duration + time.Duration(rand.Float64()*maxFactor*float64(duration))
  99. return wait
  100. }
// Start launches the periodic housekeeping goroutine for this container
// and returns immediately. The goroutine runs until Stop closes cd.stop.
func (cd *containerData) Start() error {
	go cd.housekeeping()
	return nil
}
// Stop removes the container from the memory cache, signals the
// housekeeping goroutine to exit, and tears down the perf/resctrl
// collectors. If the cache removal fails, the stop channel is left
// open and housekeeping keeps running.
func (cd *containerData) Stop() error {
	err := cd.memoryCache.RemoveContainer(cd.info.Name)
	if err != nil {
		return err
	}
	// Closing stop wakes housekeepingTick and OnDemandHousekeeping waiters.
	close(cd.stop)
	cd.perfCollector.Destroy()
	cd.resctrlCollector.Destroy()
	return nil
}
  115. func (cd *containerData) allowErrorLogging() bool {
  116. if cd.clock.Since(cd.lastErrorTime) > time.Minute {
  117. cd.lastErrorTime = cd.clock.Now()
  118. return true
  119. }
  120. return false
  121. }
// OnDemandHousekeeping performs housekeeping on the container and blocks until it has completed.
// It is designed to be used in conjunction with periodic housekeeping, and will cause the timer for
// periodic housekeeping to reset. This should be used sparingly, as calling OnDemandHousekeeping frequently
// can have serious performance costs.
func (cd *containerData) OnDemandHousekeeping(maxAge time.Duration) {
	// statsLastUpdatedTime is written by housekeepingTick under cd.lock,
	// so take the lock just long enough to read it consistently.
	cd.lock.Lock()
	timeSinceStatsLastUpdate := cd.clock.Since(cd.statsLastUpdatedTime)
	cd.lock.Unlock()
	if timeSinceStatsLastUpdate > maxAge {
		// Request an immediate collection and wait for housekeepingTick to
		// close our channel — or for the container to stop, so we never
		// block forever on a dead container.
		housekeepingFinishedChan := make(chan struct{})
		cd.onDemandChan <- housekeepingFinishedChan
		select {
		case <-cd.stop:
		case <-housekeepingFinishedChan:
		}
	}
}
  139. // notifyOnDemand notifies all calls to OnDemandHousekeeping that housekeeping is finished
  140. func (cd *containerData) notifyOnDemand() {
  141. for {
  142. select {
  143. case finishedChan := <-cd.onDemandChan:
  144. close(finishedChan)
  145. default:
  146. return
  147. }
  148. }
  149. }
// GetInfo returns a copy of the container's reference, spec, and
// subcontainer list. The spec is re-read from the handler at most once
// every 5 seconds, unless shouldUpdateSubcontainers forces a refresh
// (which also re-lists subcontainers).
func (cd *containerData) GetInfo(shouldUpdateSubcontainers bool) (*containerInfo, error) {
	// Get spec and subcontainers.
	if cd.clock.Since(cd.infoLastUpdatedTime) > 5*time.Second || shouldUpdateSubcontainers {
		err := cd.updateSpec()
		if err != nil {
			return nil, err
		}
		if shouldUpdateSubcontainers {
			err = cd.updateSubcontainers()
			if err != nil {
				return nil, err
			}
		}
		cd.infoLastUpdatedTime = cd.clock.Now()
	}
	// Copy cached info under the lock so concurrent writers
	// (updateSpec/updateSubcontainers) cannot tear the result.
	cd.lock.Lock()
	defer cd.lock.Unlock()
	cInfo := containerInfo{
		Subcontainers: cd.info.Subcontainers,
		Spec:          cd.info.Spec,
	}
	cInfo.Id = cd.info.Id
	cInfo.Name = cd.info.Name
	cInfo.Aliases = cd.info.Aliases
	cInfo.Namespace = cd.info.Namespace
	return &cInfo, nil
}
  177. func (cd *containerData) DerivedStats() (v2.DerivedStats, error) {
  178. if cd.summaryReader == nil {
  179. return v2.DerivedStats{}, fmt.Errorf("derived stats not enabled for container %q", cd.info.Name)
  180. }
  181. return cd.summaryReader.DerivedStats()
  182. }
  183. func (cd *containerData) getCgroupPath(cgroups string) string {
  184. if cgroups == "-" {
  185. return "/"
  186. }
  187. if strings.HasPrefix(cgroups, "0::") {
  188. return cgroups[3:]
  189. }
  190. matches := cgroupMemoryPathRegExp.FindSubmatch([]byte(cgroups))
  191. if len(matches) != 2 {
  192. klog.V(3).Infof(
  193. "failed to get memory cgroup path from %q, will try to get cpu cgroup path",
  194. cgroups,
  195. )
  196. // On some systems (e.g. Raspberry PI 4) cgroup memory controlled is disabled by default.
  197. matches = cgroupCPUPathRegExp.FindSubmatch([]byte(cgroups))
  198. if len(matches) != 2 {
  199. klog.V(3).Infof("failed to get cpu cgroup path from %q; assuming root cgroup", cgroups)
  200. // return root in case of failures - memory hierarchy might not be enabled.
  201. return "/"
  202. }
  203. }
  204. return string(matches[1])
  205. }
  206. // Returns contents of a file inside the container root.
  207. // Takes in a path relative to container root.
  208. func (cd *containerData) ReadFile(filepath string, inHostNamespace bool) ([]byte, error) {
  209. pids, err := cd.getContainerPids(inHostNamespace)
  210. if err != nil {
  211. return nil, err
  212. }
  213. // TODO(rjnagal): Optimize by just reading container's cgroup.proc file when in host namespace.
  214. rootfs := "/"
  215. if !inHostNamespace {
  216. rootfs = "/rootfs"
  217. }
  218. for _, pid := range pids {
  219. filePath := path.Join(rootfs, "/proc", pid, "/root", filepath)
  220. klog.V(3).Infof("Trying path %q", filePath)
  221. data, err := ioutil.ReadFile(filePath)
  222. if err == nil {
  223. return data, err
  224. }
  225. }
  226. // No process paths could be found. Declare config non-existent.
  227. return nil, fmt.Errorf("file %q does not exist", filepath)
  228. }
  229. // Return output for ps command in host /proc with specified format
  230. func (cd *containerData) getPsOutput(inHostNamespace bool, format string) ([]byte, error) {
  231. args := []string{}
  232. command := "ps"
  233. if !inHostNamespace {
  234. command = "/usr/sbin/chroot"
  235. args = append(args, "/rootfs", "ps")
  236. }
  237. args = append(args, "-e", "-o", format)
  238. out, err := exec.Command(command, args...).Output()
  239. if err != nil {
  240. return nil, fmt.Errorf("failed to execute %q command: %v", command, err)
  241. }
  242. return out, err
  243. }
  244. // Get pids of processes in this container.
  245. // A slightly lighterweight call than GetProcessList if other details are not required.
  246. func (cd *containerData) getContainerPids(inHostNamespace bool) ([]string, error) {
  247. format := "pid,cgroup"
  248. out, err := cd.getPsOutput(inHostNamespace, format)
  249. if err != nil {
  250. return nil, err
  251. }
  252. expectedFields := 2
  253. lines := strings.Split(string(out), "\n")
  254. pids := []string{}
  255. for _, line := range lines[1:] {
  256. if len(line) == 0 {
  257. continue
  258. }
  259. fields := strings.Fields(line)
  260. if len(fields) < expectedFields {
  261. return nil, fmt.Errorf("expected at least %d fields, found %d: output: %q", expectedFields, len(fields), line)
  262. }
  263. pid := fields[0]
  264. cgroup := cd.getCgroupPath(fields[1])
  265. if cd.info.Name == cgroup {
  266. pids = append(pids, pid)
  267. }
  268. }
  269. return pids, nil
  270. }
  271. func (cd *containerData) GetProcessList(cadvisorContainer string, inHostNamespace bool) ([]v2.ProcessInfo, error) {
  272. format := "user,pid,ppid,stime,pcpu,pmem,rss,vsz,stat,time,comm,psr,cgroup"
  273. out, err := cd.getPsOutput(inHostNamespace, format)
  274. if err != nil {
  275. return nil, err
  276. }
  277. return cd.parseProcessList(cadvisorContainer, inHostNamespace, out)
  278. }
  279. func (cd *containerData) parseProcessList(cadvisorContainer string, inHostNamespace bool, out []byte) ([]v2.ProcessInfo, error) {
  280. rootfs := "/"
  281. if !inHostNamespace {
  282. rootfs = "/rootfs"
  283. }
  284. processes := []v2.ProcessInfo{}
  285. lines := strings.Split(string(out), "\n")
  286. for _, line := range lines[1:] {
  287. processInfo, err := cd.parsePsLine(line, cadvisorContainer, inHostNamespace)
  288. if err != nil {
  289. return nil, fmt.Errorf("could not parse line %s: %v", line, err)
  290. }
  291. if processInfo == nil {
  292. continue
  293. }
  294. var fdCount int
  295. dirPath := path.Join(rootfs, "/proc", strconv.Itoa(processInfo.Pid), "fd")
  296. fds, err := ioutil.ReadDir(dirPath)
  297. if err != nil {
  298. klog.V(4).Infof("error while listing directory %q to measure fd count: %v", dirPath, err)
  299. continue
  300. }
  301. fdCount = len(fds)
  302. processInfo.FdCount = fdCount
  303. processes = append(processes, *processInfo)
  304. }
  305. return processes, nil
  306. }
// isRoot reports whether this containerData tracks the root cgroup ("/").
func (cd *containerData) isRoot() bool {
	return cd.info.Name == "/"
}
// parsePsLine parses one line of ps(1) output produced with the format
// "user,pid,ppid,stime,pcpu,pmem,rss,vsz,stat,time,comm,psr,cgroup" into a
// ProcessInfo. It returns (nil, nil) for empty lines and for processes
// that should not be reported (the ps command itself, or processes
// outside this container when a non-root container is requested).
func (cd *containerData) parsePsLine(line, cadvisorContainer string, inHostNamespace bool) (*v2.ProcessInfo, error) {
	const expectedFields = 13
	if len(line) == 0 {
		return nil, nil
	}
	info := v2.ProcessInfo{}
	var err error
	fields := strings.Fields(line)
	// comm may itself contain spaces, so there can be MORE than 13 fields;
	// fewer means the line is malformed.
	if len(fields) < expectedFields {
		return nil, fmt.Errorf("expected at least %d fields, found %d: output: %q", expectedFields, len(fields), line)
	}
	// Positional fields per the format string above:
	// 0=user 1=pid 2=ppid 3=stime 4=pcpu 5=pmem 6=rss 7=vsz 8=stat 9=time
	// 10..n-3=comm (may span fields) n-2=psr n-1=cgroup
	info.User = fields[0]
	info.StartTime = fields[3]
	info.Status = fields[8]
	info.RunningTime = fields[9]
	info.Pid, err = strconv.Atoi(fields[1])
	if err != nil {
		return nil, fmt.Errorf("invalid pid %q: %v", fields[1], err)
	}
	info.Ppid, err = strconv.Atoi(fields[2])
	if err != nil {
		return nil, fmt.Errorf("invalid ppid %q: %v", fields[2], err)
	}
	percentCPU, err := strconv.ParseFloat(fields[4], 32)
	if err != nil {
		return nil, fmt.Errorf("invalid cpu percent %q: %v", fields[4], err)
	}
	info.PercentCpu = float32(percentCPU)
	percentMem, err := strconv.ParseFloat(fields[5], 32)
	if err != nil {
		return nil, fmt.Errorf("invalid memory percent %q: %v", fields[5], err)
	}
	info.PercentMemory = float32(percentMem)
	info.RSS, err = strconv.ParseUint(fields[6], 0, 64)
	if err != nil {
		return nil, fmt.Errorf("invalid rss %q: %v", fields[6], err)
	}
	info.VirtualSize, err = strconv.ParseUint(fields[7], 0, 64)
	if err != nil {
		return nil, fmt.Errorf("invalid virtual size %q: %v", fields[7], err)
	}
	// convert to bytes (ps reports rss/vsz in KiB)
	info.RSS *= 1024
	info.VirtualSize *= 1024
	// According to `man ps`: The following user-defined format specifiers may contain spaces: args, cmd, comm, command,
	// fname, ucmd, ucomm, lstart, bsdstart, start.
	// Therefore we need to be able to parse comm that consists of multiple space-separated parts.
	info.Cmd = strings.Join(fields[10:len(fields)-2], " ")
	// These are last two parts of the line. We create a subslice of `fields` to handle comm that includes spaces.
	lastTwoFields := fields[len(fields)-2:]
	info.Psr, err = strconv.Atoi(lastTwoFields[0])
	if err != nil {
		return nil, fmt.Errorf("invalid psr %q: %v", lastTwoFields[0], err)
	}
	info.CgroupPath = cd.getCgroupPath(lastTwoFields[1])
	// Remove the ps command we just ran from cadvisor container.
	// Not necessary, but makes the cadvisor page look cleaner.
	if !inHostNamespace && cadvisorContainer == info.CgroupPath && info.Cmd == "ps" {
		return nil, nil
	}
	// Do not report processes from other containers when non-root container requested.
	if !cd.isRoot() && info.CgroupPath != cd.info.Name {
		return nil, nil
	}
	// Remove cgroup information when non-root container requested.
	if !cd.isRoot() {
		info.CgroupPath = ""
	}
	return &info, nil
}
// newContainerData builds the tracking state for a single container.
// It validates its collaborators, sets up the load reader (when enabled
// via -enable_load_reader), reads the initial spec from the handler, and
// attempts to create a summary reader (failure there is non-fatal).
// Housekeeping does not start until Start() is called.
func newContainerData(containerName string, memoryCache *memory.InMemoryCache, handler container.ContainerHandler, logUsage bool, collectorManager collector.CollectorManager, maxHousekeepingInterval time.Duration, allowDynamicHousekeeping bool, clock clock.Clock) (*containerData, error) {
	if memoryCache == nil {
		return nil, fmt.Errorf("nil memory storage")
	}
	if handler == nil {
		return nil, fmt.Errorf("nil container handler")
	}
	ref, err := handler.ContainerReference()
	if err != nil {
		return nil, err
	}
	cont := &containerData{
		handler:                  handler,
		memoryCache:              memoryCache,
		housekeepingInterval:     *HousekeepingInterval,
		maxHousekeepingInterval:  maxHousekeepingInterval,
		allowDynamicHousekeeping: allowDynamicHousekeeping,
		logUsage:                 logUsage,
		loadAvg:                  -1.0, // negative value indicates uninitialized.
		stop:                     make(chan struct{}),
		collectorManager:         collectorManager,
		// Buffered so up to 100 on-demand requests can queue without
		// blocking callers while a housekeeping pass is in flight.
		onDemandChan: make(chan chan struct{}, 100),
		clock:        clock,
		// Collectors default to no-ops; real collectors may be installed later.
		perfCollector:    &stats.NoopCollector{},
		nvidiaCollector:  &stats.NoopCollector{},
		resctrlCollector: &stats.NoopCollector{},
	}
	cont.info.ContainerReference = ref
	// Exponential decay tuned so the load average stabilizes within ~10s.
	cont.loadDecay = math.Exp(float64(-cont.housekeepingInterval.Seconds() / 10))
	if *enableLoadReader {
		// Create cpu load reader.
		loadReader, err := cpuload.New()
		if err != nil {
			klog.Warningf("Could not initialize cpu load reader for %q: %s", ref.Name, err)
		} else {
			cont.loadReader = loadReader
		}
	}
	err = cont.updateSpec()
	if err != nil {
		return nil, err
	}
	cont.summaryReader, err = summary.New(cont.info.Spec)
	if err != nil {
		// Summary stats are optional; continue without them.
		cont.summaryReader = nil
		klog.V(5).Infof("Failed to create summary reader for %q: %v", ref.Name, err)
	}
	return cont, nil
}
// Determine when the next housekeeping should occur.
// With dynamic housekeeping enabled, the interval doubles (capped at
// maxHousekeepingInterval) whenever the last two samples are identical,
// and snaps back to the baseline *HousekeepingInterval as soon as stats
// change. The returned value is jittered by up to 100% to spread load.
func (cd *containerData) nextHousekeepingInterval() time.Duration {
	if cd.allowDynamicHousekeeping {
		var empty time.Time
		// Fetch the two most recent samples to compare.
		stats, err := cd.memoryCache.RecentStats(cd.info.Name, empty, empty, 2)
		if err != nil {
			if cd.allowErrorLogging() {
				klog.Warningf("Failed to get RecentStats(%q) while determining the next housekeeping: %v", cd.info.Name, err)
			}
		} else if len(stats) == 2 {
			// TODO(vishnuk): Use no processes as a signal.
			// Raise the interval if usage hasn't changed in the last housekeeping.
			if stats[0].StatsEq(stats[1]) && (cd.housekeepingInterval < cd.maxHousekeepingInterval) {
				cd.housekeepingInterval *= 2
				if cd.housekeepingInterval > cd.maxHousekeepingInterval {
					cd.housekeepingInterval = cd.maxHousekeepingInterval
				}
			} else if cd.housekeepingInterval != *HousekeepingInterval {
				// Lower interval back to the baseline.
				cd.housekeepingInterval = *HousekeepingInterval
			}
		}
	}
	return jitter(cd.housekeepingInterval, 1.0)
}
// TODO(vmarmol): Implement stats collecting as a custom collector.
// housekeeping is the container's main loop: it repeatedly collects stats
// via housekeepingTick until the stop channel closes, optionally logging
// usage, and reschedules itself with a jittered (possibly dynamic)
// interval. It runs in its own goroutine (see Start).
func (cd *containerData) housekeeping() {
	// Start any background goroutines - must be cleaned up in cd.handler.Cleanup().
	cd.handler.Start()
	defer cd.handler.Cleanup()
	// Initialize cpuload reader - must be cleaned up in cd.loadReader.Stop()
	if cd.loadReader != nil {
		err := cd.loadReader.Start()
		if err != nil {
			klog.Warningf("Could not start cpu load stat collector for %q: %s", cd.info.Name, err)
		}
		defer cd.loadReader.Stop()
	}
	// Long housekeeping is either 100ms or half of the housekeeping interval.
	longHousekeeping := 100 * time.Millisecond
	if *HousekeepingInterval/2 < longHousekeeping {
		longHousekeeping = *HousekeepingInterval / 2
	}
	// Housekeep every second.
	klog.V(3).Infof("Start housekeeping for container %q\n", cd.info.Name)
	// Fire immediately on the first iteration (0s timer).
	houseKeepingTimer := cd.clock.NewTimer(0 * time.Second)
	defer houseKeepingTimer.Stop()
	for {
		if !cd.housekeepingTick(houseKeepingTimer.C(), longHousekeeping) {
			// Stop was signaled; exit the loop (deferred cleanup runs).
			return
		}
		// Stop and drain the timer so that it is safe to reset it
		if !houseKeepingTimer.Stop() {
			select {
			case <-houseKeepingTimer.C():
			default:
			}
		}
		// Log usage if asked to do so.
		if cd.logUsage {
			const numSamples = 60
			var empty time.Time
			stats, err := cd.memoryCache.RecentStats(cd.info.Name, empty, empty, numSamples)
			if err != nil {
				if cd.allowErrorLogging() {
					klog.Warningf("[%s] Failed to get recent stats for logging usage: %v", cd.info.Name, err)
				}
			} else if len(stats) < numSamples {
				// Ignore, not enough stats yet.
			} else {
				// Sum CPU deltas across the window for the average rate.
				usageCPUNs := uint64(0)
				for i := range stats {
					if i > 0 {
						usageCPUNs += stats[i].Cpu.Usage.Total - stats[i-1].Cpu.Usage.Total
					}
				}
				usageMemory := stats[numSamples-1].Memory.Usage
				// Instantaneous rate from the last two samples; average over the window.
				instantUsageInCores := float64(stats[numSamples-1].Cpu.Usage.Total-stats[numSamples-2].Cpu.Usage.Total) / float64(stats[numSamples-1].Timestamp.Sub(stats[numSamples-2].Timestamp).Nanoseconds())
				usageInCores := float64(usageCPUNs) / float64(stats[numSamples-1].Timestamp.Sub(stats[0].Timestamp).Nanoseconds())
				usageInHuman := units.HumanSize(float64(usageMemory))
				// Don't set verbosity since this is already protected by the logUsage flag.
				klog.Infof("[%s] %.3f cores (average: %.3f cores), %s of memory", cd.info.Name, instantUsageInCores, usageInCores, usageInHuman)
			}
		}
		houseKeepingTimer.Reset(cd.nextHousekeepingInterval())
	}
}
// housekeepingTick waits for the next trigger — stop signal, on-demand
// request, or timer — then collects stats once. It returns false when the
// container is stopping, true otherwise. On-demand callers are notified
// both via the triggering channel (deferred close) and via notifyOnDemand,
// which also releases requests that queued during the collection.
func (cd *containerData) housekeepingTick(timer <-chan time.Time, longHousekeeping time.Duration) bool {
	select {
	case <-cd.stop:
		// Stop housekeeping when signaled.
		return false
	case finishedChan := <-cd.onDemandChan:
		// notify the calling function once housekeeping has completed
		defer close(finishedChan)
	case <-timer:
	}
	start := cd.clock.Now()
	err := cd.updateStats()
	if err != nil {
		if cd.allowErrorLogging() {
			klog.Warningf("Failed to update stats for container \"%s\": %s", cd.info.Name, err)
		}
	}
	// Log if housekeeping took too long.
	duration := cd.clock.Since(start)
	if duration >= longHousekeeping {
		klog.V(3).Infof("[%s] Housekeeping took %s", cd.info.Name, duration)
	}
	// Release any additional waiters that queued while we were collecting.
	cd.notifyOnDemand()
	cd.lock.Lock()
	defer cd.lock.Unlock()
	cd.statsLastUpdatedTime = cd.clock.Now()
	return true
}
  544. func (cd *containerData) updateSpec() error {
  545. spec, err := cd.handler.GetSpec()
  546. if err != nil {
  547. // Ignore errors if the container is dead.
  548. if !cd.handler.Exists() {
  549. return nil
  550. }
  551. return err
  552. }
  553. customMetrics, err := cd.collectorManager.GetSpec()
  554. if err != nil {
  555. return err
  556. }
  557. if len(customMetrics) > 0 {
  558. spec.HasCustomMetrics = true
  559. spec.CustomMetrics = customMetrics
  560. }
  561. cd.lock.Lock()
  562. defer cd.lock.Unlock()
  563. cd.info.Spec = spec
  564. return nil
  565. }
  566. // Calculate new smoothed load average using the new sample of runnable threads.
  567. // The decay used ensures that the load will stabilize on a new constant value within
  568. // 10 seconds.
  569. func (cd *containerData) updateLoad(newLoad uint64) {
  570. if cd.loadAvg < 0 {
  571. cd.loadAvg = float64(newLoad) // initialize to the first seen sample for faster stabilization.
  572. } else {
  573. cd.loadAvg = cd.loadAvg*cd.loadDecay + float64(newLoad)*(1.0-cd.loadDecay)
  574. }
  575. }
  576. func (cd *containerData) updateStats() error {
  577. stats, statsErr := cd.handler.GetStats()
  578. if statsErr != nil {
  579. // Ignore errors if the container is dead.
  580. if !cd.handler.Exists() {
  581. return nil
  582. }
  583. // Stats may be partially populated, push those before we return an error.
  584. statsErr = fmt.Errorf("%v, continuing to push stats", statsErr)
  585. }
  586. if stats == nil {
  587. return statsErr
  588. }
  589. if cd.loadReader != nil {
  590. // TODO(vmarmol): Cache this path.
  591. path, err := cd.handler.GetCgroupPath("cpu")
  592. if err == nil {
  593. loadStats, err := cd.loadReader.GetCpuLoad(cd.info.Name, path)
  594. if err != nil {
  595. return fmt.Errorf("failed to get load stat for %q - path %q, error %s", cd.info.Name, path, err)
  596. }
  597. stats.TaskStats = loadStats
  598. cd.updateLoad(loadStats.NrRunning)
  599. // convert to 'milliLoad' to avoid floats and preserve precision.
  600. stats.Cpu.LoadAverage = int32(cd.loadAvg * 1000)
  601. }
  602. }
  603. if cd.summaryReader != nil {
  604. err := cd.summaryReader.AddSample(*stats)
  605. if err != nil {
  606. // Ignore summary errors for now.
  607. klog.V(2).Infof("Failed to add summary stats for %q: %v", cd.info.Name, err)
  608. }
  609. }
  610. stats.OOMEvents = atomic.LoadUint64(&cd.oomEvents)
  611. var customStatsErr error
  612. cm := cd.collectorManager.(*collector.GenericCollectorManager)
  613. if len(cm.Collectors) > 0 {
  614. if cm.NextCollectionTime.Before(cd.clock.Now()) {
  615. customStats, err := cd.updateCustomStats()
  616. if customStats != nil {
  617. stats.CustomMetrics = customStats
  618. }
  619. if err != nil {
  620. customStatsErr = err
  621. }
  622. }
  623. }
  624. var nvidiaStatsErr error
  625. if cd.nvidiaCollector != nil {
  626. // This updates the Accelerators field of the stats struct
  627. nvidiaStatsErr = cd.nvidiaCollector.UpdateStats(stats)
  628. }
  629. perfStatsErr := cd.perfCollector.UpdateStats(stats)
  630. resctrlStatsErr := cd.resctrlCollector.UpdateStats(stats)
  631. ref, err := cd.handler.ContainerReference()
  632. if err != nil {
  633. // Ignore errors if the container is dead.
  634. if !cd.handler.Exists() {
  635. return nil
  636. }
  637. return err
  638. }
  639. cInfo := info.ContainerInfo{
  640. ContainerReference: ref,
  641. }
  642. err = cd.memoryCache.AddStats(&cInfo, stats)
  643. if err != nil {
  644. return err
  645. }
  646. if statsErr != nil {
  647. return statsErr
  648. }
  649. if nvidiaStatsErr != nil {
  650. klog.Errorf("error occurred while collecting nvidia stats for container %s: %s", cInfo.Name, err)
  651. return nvidiaStatsErr
  652. }
  653. if perfStatsErr != nil {
  654. klog.Errorf("error occurred while collecting perf stats for container %s: %s", cInfo.Name, err)
  655. return perfStatsErr
  656. }
  657. if resctrlStatsErr != nil {
  658. klog.Errorf("error occurred while collecting resctrl stats for container %s: %s", cInfo.Name, resctrlStatsErr)
  659. return resctrlStatsErr
  660. }
  661. return customStatsErr
  662. }
  663. func (cd *containerData) updateCustomStats() (map[string][]info.MetricVal, error) {
  664. _, customStats, customStatsErr := cd.collectorManager.Collect()
  665. if customStatsErr != nil {
  666. if !cd.handler.Exists() {
  667. return customStats, nil
  668. }
  669. customStatsErr = fmt.Errorf("%v, continuing to push custom stats", customStatsErr)
  670. }
  671. return customStats, customStatsErr
  672. }
  673. func (cd *containerData) updateSubcontainers() error {
  674. var subcontainers info.ContainerReferenceSlice
  675. subcontainers, err := cd.handler.ListContainers(container.ListSelf)
  676. if err != nil {
  677. // Ignore errors if the container is dead.
  678. if !cd.handler.Exists() {
  679. return nil
  680. }
  681. return err
  682. }
  683. sort.Sort(subcontainers)
  684. cd.lock.Lock()
  685. defer cd.lock.Unlock()
  686. cd.info.Subcontainers = subcontainers
  687. return nil
  688. }