manager.go 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404
  1. // Copyright 2014 Google Inc. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // Manager of cAdvisor-monitored containers.
  15. package manager
import (
	"errors"
	"flag"
	"fmt"
	"net/http"
	"os"
	"path"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/google/cadvisor/accelerators"
	"github.com/google/cadvisor/cache/memory"
	"github.com/google/cadvisor/collector"
	"github.com/google/cadvisor/container"
	"github.com/google/cadvisor/container/raw"
	"github.com/google/cadvisor/events"
	"github.com/google/cadvisor/fs"
	info "github.com/google/cadvisor/info/v1"
	v2 "github.com/google/cadvisor/info/v2"
	"github.com/google/cadvisor/machine"
	"github.com/google/cadvisor/nvm"
	"github.com/google/cadvisor/perf"
	"github.com/google/cadvisor/resctrl"
	"github.com/google/cadvisor/stats"
	"github.com/google/cadvisor/utils/oomparser"
	"github.com/google/cadvisor/utils/sysfs"
	"github.com/google/cadvisor/version"
	"github.com/google/cadvisor/watcher"

	"github.com/opencontainers/runc/libcontainer/cgroups"
	"k8s.io/klog/v2"
	"k8s.io/utils/clock"
)
  49. var globalHousekeepingInterval = flag.Duration("global_housekeeping_interval", 1*time.Minute, "Interval between global housekeepings")
  50. var updateMachineInfoInterval = flag.Duration("update_machine_info_interval", 5*time.Minute, "Interval between machine info updates.")
  51. var logCadvisorUsage = flag.Bool("log_cadvisor_usage", false, "Whether to log the usage of the cAdvisor container")
  52. var eventStorageAgeLimit = flag.String("event_storage_age_limit", "default=24h", "Max length of time for which to store events (per type). Value is a comma separated list of key values, where the keys are event types (e.g.: creation, oom) or \"default\" and the value is a duration. Default is applied to all non-specified event types")
  53. var eventStorageEventLimit = flag.String("event_storage_event_limit", "default=100000", "Max number of events to store (per type). Value is a comma separated list of key values, where the keys are event types (e.g.: creation, oom) or \"default\" and the value is an integer. Default is applied to all non-specified event types")
  54. var applicationMetricsCountLimit = flag.Int("application_metrics_count_limit", 100, "Max number of application metrics to store (per container)")
// DockerNamespace is the namespace under which Docker container aliases are
// registered; alias names are unique within it.
const DockerNamespace = "docker"
  57. var HousekeepingConfigFlags = HouskeepingConfig{
  58. flag.Duration("max_housekeeping_interval", 60*time.Second, "Largest interval to allow between container housekeepings"),
  59. flag.Bool("allow_dynamic_housekeeping", true, "Whether to allow the housekeeping interval to be dynamic"),
  60. }
// The Manager interface defines operations for starting a manager and getting
// container and machine information.
type Manager interface {
	// Start the manager. Calling other manager methods before this returns
	// may produce undefined behavior.
	Start() error

	// Stops the manager.
	Stop() error

	// Get information about a container.
	GetContainerInfo(containerName string, query *info.ContainerInfoRequest) (*info.ContainerInfo, error)

	// Get V2 information about a container.
	// Recursive (subcontainer) requests are best-effort, and may return a partial result alongside an
	// error in the partial failure case.
	GetContainerInfoV2(containerName string, options v2.RequestOptions) (map[string]v2.ContainerInfo, error)

	// Get information about all subcontainers of the specified container (includes self).
	SubcontainersInfo(containerName string, query *info.ContainerInfoRequest) ([]*info.ContainerInfo, error)

	// Gets all the Docker containers. Return is a map from full container name to ContainerInfo.
	AllDockerContainers(query *info.ContainerInfoRequest) (map[string]info.ContainerInfo, error)

	// Gets information about a specific Docker container. The specified name is within the Docker namespace.
	DockerContainer(dockerName string, query *info.ContainerInfoRequest) (info.ContainerInfo, error)

	// Gets spec for all containers based on request options.
	GetContainerSpec(containerName string, options v2.RequestOptions) (map[string]v2.ContainerSpec, error)

	// Gets summary stats for all containers based on request options.
	GetDerivedStats(containerName string, options v2.RequestOptions) (map[string]v2.DerivedStats, error)

	// Get info for all requested containers based on the request options.
	GetRequestedContainersInfo(containerName string, options v2.RequestOptions) (map[string]*info.ContainerInfo, error)

	// Returns true if the named container exists.
	Exists(containerName string) bool

	// Get information about the machine.
	GetMachineInfo() (*info.MachineInfo, error)

	// Get version information about different components we depend on.
	GetVersionInfo() (*info.VersionInfo, error)

	// GetFsInfoByFsUUID returns the information of the device having the
	// specified filesystem uuid. If no such device with the UUID exists, this
	// function will return the fs.ErrNoSuchDevice error.
	GetFsInfoByFsUUID(uuid string) (v2.FsInfo, error)

	// Get filesystem information for the filesystem that contains the given directory.
	GetDirFsInfo(dir string) (v2.FsInfo, error)

	// Get filesystem information for a given label.
	// Returns information for all global filesystems if label is empty.
	GetFsInfo(label string) ([]v2.FsInfo, error)

	// Get ps output for a container.
	GetProcessList(containerName string, options v2.RequestOptions) ([]v2.ProcessInfo, error)

	// Get events streamed through passedChannel that fit the request.
	WatchForEvents(request *events.Request) (*events.EventChannel, error)

	// Get past events that have been detected and that fit the request.
	GetPastEvents(request *events.Request) ([]*info.Event, error)

	// CloseEventChannel releases the event channel registered under watchID.
	CloseEventChannel(watchID int)

	// Returns debugging information. Map of lines per category.
	DebugInfo() map[string][]string
}
// HouskeepingConfig is the housekeeping configuration for the manager.
// (The name carries a historical misspelling; it is kept as-is because the
// type alias is part of the public API.)
type HouskeepingConfig = struct {
	// Interval is the largest interval to allow between container housekeepings.
	Interval *time.Duration
	// AllowDynamic controls whether the housekeeping interval may adapt dynamically.
	AllowDynamic *bool
}
// New takes a memory storage and returns a new manager.
//
// The remaining arguments configure the manager: the sysfs accessor, the
// housekeeping intervals, the set of metrics to collect, the HTTP client used
// by application-metric collectors, whitelists for raw-container cgroup path
// prefixes and container env metadata, the perf events config file, and the
// resctrl collection interval. Initialization is strictly ordered: self-cgroup
// detection, fs context, manager construction, machine info, perf, resctrl,
// then the event handler.
func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig HouskeepingConfig, includedMetricsSet container.MetricSet, collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string, perfEventsFile string, resctrlInterval time.Duration) (Manager, error) {
	if memoryCache == nil {
		return nil, fmt.Errorf("manager requires memory storage")
	}
	// Detect the container we are running on.
	selfContainer := "/"
	var err error
	// Avoid using GetOwnCgroupPath on cgroup v2 as it is not supported by libcontainer
	if !cgroups.IsCgroup2UnifiedMode() {
		selfContainer, err = cgroups.GetOwnCgroup("cpu")
		if err != nil {
			return nil, err
		}
		klog.V(2).Infof("cAdvisor running in container: %q", selfContainer)
	}
	context := fs.Context{}
	if err := container.InitializeFSContext(&context); err != nil {
		return nil, err
	}
	fsInfo, err := fs.NewFsInfo(context)
	if err != nil {
		return nil, err
	}
	// If cAdvisor was started with host's rootfs mounted, assume that its running
	// in its own namespaces.
	inHostNamespace := false
	if _, err := os.Stat("/rootfs/proc"); os.IsNotExist(err) {
		inHostNamespace = true
	}
	// Register for new subcontainers.
	eventsChannel := make(chan watcher.ContainerEvent, 16)
	newManager := &manager{
		containers:                            make(map[namespacedContainerName]*containerData),
		quitChannels:                          make([]chan error, 0, 2),
		memoryCache:                           memoryCache,
		fsInfo:                                fsInfo,
		sysFs:                                 sysfs,
		cadvisorContainer:                     selfContainer,
		inHostNamespace:                       inHostNamespace,
		startupTime:                           time.Now(),
		maxHousekeepingInterval:               *houskeepingConfig.Interval,
		allowDynamicHousekeeping:              *houskeepingConfig.AllowDynamic,
		includedMetrics:                       includedMetricsSet,
		containerWatchers:                     []watcher.ContainerWatcher{},
		eventsChannel:                         eventsChannel,
		collectorHTTPClient:                   collectorHTTPClient,
		nvidiaManager:                         accelerators.NewNvidiaManager(includedMetricsSet),
		rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
		containerEnvMetadataWhiteList:         containerEnvMetadataWhiteList,
	}
	machineInfo, err := machine.Info(sysfs, fsInfo, inHostNamespace)
	if err != nil {
		return nil, err
	}
	newManager.machineInfo = *machineInfo
	klog.V(1).Infof("Machine: %+v", newManager.machineInfo)
	newManager.perfManager, err = perf.NewManager(perfEventsFile, machineInfo.Topology)
	if err != nil {
		return nil, err
	}
	// resctrl failure is non-fatal: metrics are simply unavailable on this host.
	newManager.resctrlManager, err = resctrl.NewManager(resctrlInterval, resctrl.Setup, machineInfo.CPUVendorID, inHostNamespace)
	if err != nil {
		klog.V(4).Infof("Cannot gather resctrl metrics: %v", err)
	}
	versionInfo, err := getVersionInfo()
	if err != nil {
		return nil, err
	}
	klog.V(1).Infof("Version: %+v", *versionInfo)
	newManager.eventHandler = events.NewEventManager(parseEventsStoragePolicy())
	return newManager, nil
}
// namespacedContainerName is the key type for the manager's container map:
// a container name qualified by the namespace it was registered under.
type namespacedContainerName struct {
	// The namespace of the container. Can be empty for the root namespace.
	Namespace string
	// The name of the container in this namespace.
	Name string
}
// manager is the concrete Manager implementation. It tracks all monitored
// containers, caches their stats in memory, and runs background housekeeping
// goroutines started by Start and joined by Stop.
type manager struct {
	// containers maps (namespace, name) to per-container tracking state.
	containers map[namespacedContainerName]*containerData
	// containersLock guards the containers map.
	containersLock sync.RWMutex
	memoryCache    *memory.InMemoryCache
	fsInfo         fs.FsInfo
	sysFs          sysfs.SysFs
	machineMu      sync.RWMutex // protects machineInfo
	machineInfo    info.MachineInfo
	// quitChannels signal and join the background goroutines on Stop.
	quitChannels []chan error
	// cadvisorContainer is the cgroup cAdvisor itself runs in ("/" on cgroup v2).
	cadvisorContainer string
	// inHostNamespace is true when /rootfs/proc does not exist (see New).
	inHostNamespace          bool
	eventHandler             events.EventManager
	startupTime              time.Time
	maxHousekeepingInterval  time.Duration
	allowDynamicHousekeeping bool
	includedMetrics          container.MetricSet
	containerWatchers        []watcher.ContainerWatcher
	eventsChannel            chan watcher.ContainerEvent
	collectorHTTPClient      *http.Client
	nvidiaManager            stats.Manager
	perfManager              stats.Manager
	resctrlManager           resctrl.Manager
	// List of raw container cgroup path prefix whitelist.
	rawContainerCgroupPathPrefixWhiteList []string
	// List of container env prefix whitelist, the matched container envs would be collected into metrics as extra labels.
	containerEnvMetadataWhiteList []string
}
// Start the container manager: register container factories and watchers,
// configure OOM detection, recover existing containers, and launch the
// background housekeeping goroutines. The steps are strictly ordered —
// factories must be registered before the root container is created, and the
// root container before subcontainer detection.
func (m *manager) Start() error {
	m.containerWatchers = container.InitializePlugins(m, m.fsInfo, m.includedMetrics)
	// Raw factory registration failure is logged but non-fatal.
	err := raw.Register(m, m.fsInfo, m.includedMetrics, m.rawContainerCgroupPathPrefixWhiteList)
	if err != nil {
		klog.Errorf("Registration of the raw container factory failed: %v", err)
	}
	rawWatcher, err := raw.NewRawContainerWatcher()
	if err != nil {
		return err
	}
	m.containerWatchers = append(m.containerWatchers, rawWatcher)
	// Watch for OOMs. Failure disables OOM events but does not abort startup.
	err = m.watchForNewOoms()
	if err != nil {
		klog.Warningf("Could not configure a source for OOM detection, disabling OOM events: %v", err)
	}
	// If there are no factories, don't start any housekeeping and serve the information we do have.
	if !container.HasFactories() {
		return nil
	}
	// Create root and then recover all containers.
	err = m.createContainer("/", watcher.Raw)
	if err != nil {
		return err
	}
	klog.V(2).Infof("Starting recovery of all containers")
	err = m.detectSubcontainers("/")
	if err != nil {
		return err
	}
	klog.V(2).Infof("Recovery completed")
	// Watch for new container.
	quitWatcher := make(chan error)
	err = m.watchForNewContainers(quitWatcher)
	if err != nil {
		return err
	}
	m.quitChannels = append(m.quitChannels, quitWatcher)
	// Look for new containers in the main housekeeping thread.
	quitGlobalHousekeeping := make(chan error)
	m.quitChannels = append(m.quitChannels, quitGlobalHousekeeping)
	go m.globalHousekeeping(quitGlobalHousekeeping)
	// Periodically refresh the cached machine info.
	quitUpdateMachineInfo := make(chan error)
	m.quitChannels = append(m.quitChannels, quitUpdateMachineInfo)
	go m.updateMachineInfo(quitUpdateMachineInfo)
	return nil
}
// Stop shuts down the manager: it signals every background goroutine via its
// quit channel and waits for the acknowledgement, then finalizes the nvm and
// perf subsystems. The deferred calls tear down the nvidia manager and the
// per-container collectors even on early error return.
func (m *manager) Stop() error {
	defer m.nvidiaManager.Destroy()
	defer m.destroyCollectors()
	// Stop and wait on all quit channels.
	for i, c := range m.quitChannels {
		// Send the exit signal and wait on the thread to exit (by closing the channel).
		c <- nil
		err := <-c
		if err != nil {
			// Remove the channels that quit successfully.
			// NOTE(review): this retains index i — the channel that just reported
			// the error — in the kept slice; confirm whether [i+1:] was intended.
			m.quitChannels = m.quitChannels[i:]
			return err
		}
	}
	m.quitChannels = make([]chan error, 0, 2)
	nvm.Finalize()
	perf.Finalize()
	return nil
}
  291. func (m *manager) destroyCollectors() {
  292. for _, container := range m.containers {
  293. container.perfCollector.Destroy()
  294. container.resctrlCollector.Destroy()
  295. }
  296. }
  297. func (m *manager) updateMachineInfo(quit chan error) {
  298. ticker := time.NewTicker(*updateMachineInfoInterval)
  299. for {
  300. select {
  301. case <-ticker.C:
  302. info, err := machine.Info(m.sysFs, m.fsInfo, m.inHostNamespace)
  303. if err != nil {
  304. klog.Errorf("Could not get machine info: %v", err)
  305. break
  306. }
  307. m.machineMu.Lock()
  308. m.machineInfo = *info
  309. m.machineMu.Unlock()
  310. klog.V(5).Infof("Update machine info: %+v", *info)
  311. case <-quit:
  312. ticker.Stop()
  313. quit <- nil
  314. return
  315. }
  316. }
  317. }
  318. func (m *manager) globalHousekeeping(quit chan error) {
  319. // Long housekeeping is either 100ms or half of the housekeeping interval.
  320. longHousekeeping := 100 * time.Millisecond
  321. if *globalHousekeepingInterval/2 < longHousekeeping {
  322. longHousekeeping = *globalHousekeepingInterval / 2
  323. }
  324. ticker := time.NewTicker(*globalHousekeepingInterval)
  325. for {
  326. select {
  327. case t := <-ticker.C:
  328. start := time.Now()
  329. // Check for new containers.
  330. err := m.detectSubcontainers("/")
  331. if err != nil {
  332. klog.Errorf("Failed to detect containers: %s", err)
  333. }
  334. // Log if housekeeping took too long.
  335. duration := time.Since(start)
  336. if duration >= longHousekeeping {
  337. klog.V(3).Infof("Global Housekeeping(%d) took %s", t.Unix(), duration)
  338. }
  339. case <-quit:
  340. // Quit if asked to do so.
  341. quit <- nil
  342. klog.Infof("Exiting global housekeeping thread")
  343. return
  344. }
  345. }
  346. }
  347. func (m *manager) getContainerData(containerName string) (*containerData, error) {
  348. var cont *containerData
  349. var ok bool
  350. func() {
  351. m.containersLock.RLock()
  352. defer m.containersLock.RUnlock()
  353. // Ensure we have the container.
  354. cont, ok = m.containers[namespacedContainerName{
  355. Name: containerName,
  356. }]
  357. }()
  358. if !ok {
  359. return nil, fmt.Errorf("unknown container %q", containerName)
  360. }
  361. return cont, nil
  362. }
  363. func (m *manager) GetDerivedStats(containerName string, options v2.RequestOptions) (map[string]v2.DerivedStats, error) {
  364. conts, err := m.getRequestedContainers(containerName, options)
  365. if err != nil {
  366. return nil, err
  367. }
  368. var errs partialFailure
  369. stats := make(map[string]v2.DerivedStats)
  370. for name, cont := range conts {
  371. d, err := cont.DerivedStats()
  372. if err != nil {
  373. errs.append(name, "DerivedStats", err)
  374. }
  375. stats[name] = d
  376. }
  377. return stats, errs.OrNil()
  378. }
  379. func (m *manager) GetContainerSpec(containerName string, options v2.RequestOptions) (map[string]v2.ContainerSpec, error) {
  380. conts, err := m.getRequestedContainers(containerName, options)
  381. if err != nil {
  382. return nil, err
  383. }
  384. var errs partialFailure
  385. specs := make(map[string]v2.ContainerSpec)
  386. for name, cont := range conts {
  387. cinfo, err := cont.GetInfo(false)
  388. if err != nil {
  389. errs.append(name, "GetInfo", err)
  390. }
  391. spec := m.getV2Spec(cinfo)
  392. specs[name] = spec
  393. }
  394. return specs, errs.OrNil()
  395. }
// Get V2 container spec from v1 container info.
func (m *manager) getV2Spec(cinfo *containerInfo) v2.ContainerSpec {
	// Apply machine-level defaults (e.g. memory limit) before converting;
	// the intermediate variable is needed to take its address below.
	spec := m.getAdjustedSpec(cinfo)
	return v2.ContainerSpecFromV1(&spec, cinfo.Aliases, cinfo.Namespace)
}
  401. func (m *manager) getAdjustedSpec(cinfo *containerInfo) info.ContainerSpec {
  402. spec := cinfo.Spec
  403. // Set default value to an actual value
  404. if spec.HasMemory {
  405. // Memory.Limit is 0 means there's no limit
  406. if spec.Memory.Limit == 0 {
  407. m.machineMu.RLock()
  408. spec.Memory.Limit = uint64(m.machineInfo.MemoryCapacity)
  409. m.machineMu.RUnlock()
  410. }
  411. }
  412. return spec
  413. }
  414. func (m *manager) GetContainerInfo(containerName string, query *info.ContainerInfoRequest) (*info.ContainerInfo, error) {
  415. cont, err := m.getContainerData(containerName)
  416. if err != nil {
  417. return nil, err
  418. }
  419. return m.containerDataToContainerInfo(cont, query)
  420. }
// GetContainerInfoV2 returns v2 info (spec plus recent stats) for every
// requested container. Per-container failures are recorded and the partially
// filled result for that container is still included, so callers get a
// best-effort map alongside the combined error.
func (m *manager) GetContainerInfoV2(containerName string, options v2.RequestOptions) (map[string]v2.ContainerInfo, error) {
	containers, err := m.getRequestedContainers(containerName, options)
	if err != nil {
		return nil, err
	}
	var errs partialFailure
	var nilTime time.Time // Ignored.
	infos := make(map[string]v2.ContainerInfo, len(containers))
	for name, container := range containers {
		result := v2.ContainerInfo{}
		cinfo, err := container.GetInfo(false)
		if err != nil {
			errs.append(name, "GetInfo", err)
			infos[name] = result
			continue
		}
		result.Spec = m.getV2Spec(cinfo)
		stats, err := m.memoryCache.RecentStats(name, nilTime, nilTime, options.Count)
		if err != nil {
			errs.append(name, "RecentStats", err)
			infos[name] = result
			continue
		}
		// NOTE(review): this passes the request-level containerName rather than
		// the per-container loop variable "name" — confirm that is intended.
		result.Stats = v2.ContainerStatsFromV1(containerName, &cinfo.Spec, stats)
		infos[name] = result
	}
	return infos, errs.OrNil()
}
  449. func (m *manager) containerDataToContainerInfo(cont *containerData, query *info.ContainerInfoRequest) (*info.ContainerInfo, error) {
  450. // Get the info from the container.
  451. cinfo, err := cont.GetInfo(true)
  452. if err != nil {
  453. return nil, err
  454. }
  455. stats, err := m.memoryCache.RecentStats(cinfo.Name, query.Start, query.End, query.NumStats)
  456. if err != nil {
  457. return nil, err
  458. }
  459. // Make a copy of the info for the user.
  460. ret := &info.ContainerInfo{
  461. ContainerReference: cinfo.ContainerReference,
  462. Subcontainers: cinfo.Subcontainers,
  463. Spec: m.getAdjustedSpec(cinfo),
  464. Stats: stats,
  465. }
  466. return ret, nil
  467. }
  468. func (m *manager) getContainer(containerName string) (*containerData, error) {
  469. m.containersLock.RLock()
  470. defer m.containersLock.RUnlock()
  471. cont, ok := m.containers[namespacedContainerName{Name: containerName}]
  472. if !ok {
  473. return nil, fmt.Errorf("unknown container %q", containerName)
  474. }
  475. return cont, nil
  476. }
  477. func (m *manager) getSubcontainers(containerName string) map[string]*containerData {
  478. m.containersLock.RLock()
  479. defer m.containersLock.RUnlock()
  480. containersMap := make(map[string]*containerData, len(m.containers))
  481. // Get all the unique subcontainers of the specified container
  482. matchedName := path.Join(containerName, "/")
  483. for i := range m.containers {
  484. if m.containers[i] == nil {
  485. continue
  486. }
  487. name := m.containers[i].info.Name
  488. if name == containerName || strings.HasPrefix(name, matchedName) {
  489. containersMap[m.containers[i].info.Name] = m.containers[i]
  490. }
  491. }
  492. return containersMap
  493. }
  494. func (m *manager) SubcontainersInfo(containerName string, query *info.ContainerInfoRequest) ([]*info.ContainerInfo, error) {
  495. containersMap := m.getSubcontainers(containerName)
  496. containers := make([]*containerData, 0, len(containersMap))
  497. for _, cont := range containersMap {
  498. containers = append(containers, cont)
  499. }
  500. return m.containerDataSliceToContainerInfoSlice(containers, query)
  501. }
  502. func (m *manager) getAllDockerContainers() map[string]*containerData {
  503. m.containersLock.RLock()
  504. defer m.containersLock.RUnlock()
  505. containers := make(map[string]*containerData, len(m.containers))
  506. // Get containers in the Docker namespace.
  507. for name, cont := range m.containers {
  508. if name.Namespace == DockerNamespace {
  509. containers[cont.info.Name] = cont
  510. }
  511. }
  512. return containers
  513. }
  514. func (m *manager) AllDockerContainers(query *info.ContainerInfoRequest) (map[string]info.ContainerInfo, error) {
  515. containers := m.getAllDockerContainers()
  516. output := make(map[string]info.ContainerInfo, len(containers))
  517. for name, cont := range containers {
  518. inf, err := m.containerDataToContainerInfo(cont, query)
  519. if err != nil {
  520. // Ignore the error because of race condition and return best-effort result.
  521. if err == memory.ErrDataNotFound {
  522. klog.Warningf("Error getting data for container %s because of race condition", name)
  523. continue
  524. }
  525. return nil, err
  526. }
  527. output[name] = *inf
  528. }
  529. return output, nil
  530. }
  531. func (m *manager) getDockerContainer(containerName string) (*containerData, error) {
  532. m.containersLock.RLock()
  533. defer m.containersLock.RUnlock()
  534. // Check for the container in the Docker container namespace.
  535. cont, ok := m.containers[namespacedContainerName{
  536. Namespace: DockerNamespace,
  537. Name: containerName,
  538. }]
  539. // Look for container by short prefix name if no exact match found.
  540. if !ok {
  541. for contName, c := range m.containers {
  542. if contName.Namespace == DockerNamespace && strings.HasPrefix(contName.Name, containerName) {
  543. if cont == nil {
  544. cont = c
  545. } else {
  546. return nil, fmt.Errorf("unable to find container. Container %q is not unique", containerName)
  547. }
  548. }
  549. }
  550. if cont == nil {
  551. return nil, fmt.Errorf("unable to find Docker container %q", containerName)
  552. }
  553. }
  554. return cont, nil
  555. }
  556. func (m *manager) DockerContainer(containerName string, query *info.ContainerInfoRequest) (info.ContainerInfo, error) {
  557. container, err := m.getDockerContainer(containerName)
  558. if err != nil {
  559. return info.ContainerInfo{}, err
  560. }
  561. inf, err := m.containerDataToContainerInfo(container, query)
  562. if err != nil {
  563. return info.ContainerInfo{}, err
  564. }
  565. return *inf, nil
  566. }
  567. func (m *manager) containerDataSliceToContainerInfoSlice(containers []*containerData, query *info.ContainerInfoRequest) ([]*info.ContainerInfo, error) {
  568. if len(containers) == 0 {
  569. return nil, fmt.Errorf("no containers found")
  570. }
  571. // Get the info for each container.
  572. output := make([]*info.ContainerInfo, 0, len(containers))
  573. for i := range containers {
  574. cinfo, err := m.containerDataToContainerInfo(containers[i], query)
  575. if err != nil {
  576. // Skip containers with errors, we try to degrade gracefully.
  577. klog.V(4).Infof("convert container data to container info failed with error %s", err.Error())
  578. continue
  579. }
  580. output = append(output, cinfo)
  581. }
  582. return output, nil
  583. }
  584. func (m *manager) GetRequestedContainersInfo(containerName string, options v2.RequestOptions) (map[string]*info.ContainerInfo, error) {
  585. containers, err := m.getRequestedContainers(containerName, options)
  586. if err != nil {
  587. return nil, err
  588. }
  589. var errs partialFailure
  590. containersMap := make(map[string]*info.ContainerInfo)
  591. query := info.ContainerInfoRequest{
  592. NumStats: options.Count,
  593. }
  594. for name, data := range containers {
  595. info, err := m.containerDataToContainerInfo(data, &query)
  596. if err != nil {
  597. if err == memory.ErrDataNotFound {
  598. klog.Warningf("Error getting data for container %s because of race condition", name)
  599. continue
  600. }
  601. errs.append(name, "containerDataToContainerInfo", err)
  602. }
  603. containersMap[name] = info
  604. }
  605. return containersMap, errs.OrNil()
  606. }
  607. func (m *manager) getRequestedContainers(containerName string, options v2.RequestOptions) (map[string]*containerData, error) {
  608. containersMap := make(map[string]*containerData)
  609. switch options.IdType {
  610. case v2.TypeName:
  611. if !options.Recursive {
  612. cont, err := m.getContainer(containerName)
  613. if err != nil {
  614. return containersMap, err
  615. }
  616. containersMap[cont.info.Name] = cont
  617. } else {
  618. containersMap = m.getSubcontainers(containerName)
  619. if len(containersMap) == 0 {
  620. return containersMap, fmt.Errorf("unknown container: %q", containerName)
  621. }
  622. }
  623. case v2.TypeDocker:
  624. if !options.Recursive {
  625. containerName = strings.TrimPrefix(containerName, "/")
  626. cont, err := m.getDockerContainer(containerName)
  627. if err != nil {
  628. return containersMap, err
  629. }
  630. containersMap[cont.info.Name] = cont
  631. } else {
  632. if containerName != "/" {
  633. return containersMap, fmt.Errorf("invalid request for docker container %q with subcontainers", containerName)
  634. }
  635. containersMap = m.getAllDockerContainers()
  636. }
  637. default:
  638. return containersMap, fmt.Errorf("invalid request type %q", options.IdType)
  639. }
  640. if options.MaxAge != nil {
  641. // update stats for all containers in containersMap
  642. var waitGroup sync.WaitGroup
  643. waitGroup.Add(len(containersMap))
  644. for _, container := range containersMap {
  645. go func(cont *containerData) {
  646. cont.OnDemandHousekeeping(*options.MaxAge)
  647. waitGroup.Done()
  648. }(container)
  649. }
  650. waitGroup.Wait()
  651. }
  652. return containersMap, nil
  653. }
// GetDirFsInfo returns filesystem information for the filesystem that backs
// the directory dir.
func (m *manager) GetDirFsInfo(dir string) (v2.FsInfo, error) {
    device, err := m.fsInfo.GetDirFsDevice(dir)
    if err != nil {
        return v2.FsInfo{}, fmt.Errorf("failed to get device for dir %q: %v", dir, err)
    }
    // Resolve the backing device to a full v2.FsInfo record.
    return m.getFsInfoByDeviceName(device.Device)
}
// GetFsInfoByFsUUID returns filesystem information for the filesystem with
// the given UUID.
func (m *manager) GetFsInfoByFsUUID(uuid string) (v2.FsInfo, error) {
    device, err := m.fsInfo.GetDeviceInfoByFsUUID(uuid)
    if err != nil {
        return v2.FsInfo{}, err
    }
    // Resolve the backing device to a full v2.FsInfo record.
    return m.getFsInfoByDeviceName(device.Device)
}
  668. func (m *manager) GetFsInfo(label string) ([]v2.FsInfo, error) {
  669. var empty time.Time
  670. // Get latest data from filesystems hanging off root container.
  671. stats, err := m.memoryCache.RecentStats("/", empty, empty, 1)
  672. if err != nil {
  673. return nil, err
  674. }
  675. dev := ""
  676. if len(label) != 0 {
  677. dev, err = m.fsInfo.GetDeviceForLabel(label)
  678. if err != nil {
  679. return nil, err
  680. }
  681. }
  682. fsInfo := []v2.FsInfo{}
  683. for i := range stats[0].Filesystem {
  684. fs := stats[0].Filesystem[i]
  685. if len(label) != 0 && fs.Device != dev {
  686. continue
  687. }
  688. mountpoint, err := m.fsInfo.GetMountpointForDevice(fs.Device)
  689. if err != nil {
  690. return nil, err
  691. }
  692. labels, err := m.fsInfo.GetLabelsForDevice(fs.Device)
  693. if err != nil {
  694. return nil, err
  695. }
  696. fi := v2.FsInfo{
  697. Timestamp: stats[0].Timestamp,
  698. Device: fs.Device,
  699. Mountpoint: mountpoint,
  700. Capacity: fs.Limit,
  701. Usage: fs.Usage,
  702. Available: fs.Available,
  703. Labels: labels,
  704. }
  705. if fs.HasInodes {
  706. fi.Inodes = &fs.Inodes
  707. fi.InodesFree = &fs.InodesFree
  708. }
  709. fsInfo = append(fsInfo, fi)
  710. }
  711. return fsInfo, nil
  712. }
// GetMachineInfo returns a copy of the cached machine description. A clone is
// returned so callers cannot mutate the shared state guarded by machineMu.
func (m *manager) GetMachineInfo() (*info.MachineInfo, error) {
    m.machineMu.RLock()
    defer m.machineMu.RUnlock()
    return m.machineInfo.Clone(), nil
}
// GetVersionInfo returns kernel, container-OS, and cAdvisor version details,
// computed fresh on every call.
func (m *manager) GetVersionInfo() (*info.VersionInfo, error) {
    // TODO: Consider caching this and periodically updating. The VersionInfo may change if
    // the docker daemon is started after the cAdvisor client is created. Caching the value
    // would be helpful so we would be able to return the last known docker version if
    // docker was down at the time of a query.
    return getVersionInfo()
}
  725. func (m *manager) Exists(containerName string) bool {
  726. m.containersLock.RLock()
  727. defer m.containersLock.RUnlock()
  728. namespacedName := namespacedContainerName{
  729. Name: containerName,
  730. }
  731. _, ok := m.containers[namespacedName]
  732. return ok
  733. }
  734. func (m *manager) GetProcessList(containerName string, options v2.RequestOptions) ([]v2.ProcessInfo, error) {
  735. // override recursive. Only support single container listing.
  736. options.Recursive = false
  737. // override MaxAge. ProcessList does not require updated stats.
  738. options.MaxAge = nil
  739. conts, err := m.getRequestedContainers(containerName, options)
  740. if err != nil {
  741. return nil, err
  742. }
  743. if len(conts) != 1 {
  744. return nil, fmt.Errorf("Expected the request to match only one container")
  745. }
  746. // TODO(rjnagal): handle count? Only if we can do count by type (eg. top 5 cpu users)
  747. ps := []v2.ProcessInfo{}
  748. for _, cont := range conts {
  749. ps, err = cont.GetProcessList(m.cadvisorContainer, m.inHostNamespace)
  750. if err != nil {
  751. return nil, err
  752. }
  753. }
  754. return ps, nil
  755. }
  756. func (m *manager) registerCollectors(collectorConfigs map[string]string, cont *containerData) error {
  757. for k, v := range collectorConfigs {
  758. configFile, err := cont.ReadFile(v, m.inHostNamespace)
  759. if err != nil {
  760. return fmt.Errorf("failed to read config file %q for config %q, container %q: %v", k, v, cont.info.Name, err)
  761. }
  762. klog.V(4).Infof("Got config from %q: %q", v, configFile)
  763. if strings.HasPrefix(k, "prometheus") || strings.HasPrefix(k, "Prometheus") {
  764. newCollector, err := collector.NewPrometheusCollector(k, configFile, *applicationMetricsCountLimit, cont.handler, m.collectorHTTPClient)
  765. if err != nil {
  766. return fmt.Errorf("failed to create collector for container %q, config %q: %v", cont.info.Name, k, err)
  767. }
  768. err = cont.collectorManager.RegisterCollector(newCollector)
  769. if err != nil {
  770. return fmt.Errorf("failed to register collector for container %q, config %q: %v", cont.info.Name, k, err)
  771. }
  772. } else {
  773. newCollector, err := collector.NewCollector(k, configFile, *applicationMetricsCountLimit, cont.handler, m.collectorHTTPClient)
  774. if err != nil {
  775. return fmt.Errorf("failed to create collector for container %q, config %q: %v", cont.info.Name, k, err)
  776. }
  777. err = cont.collectorManager.RegisterCollector(newCollector)
  778. if err != nil {
  779. return fmt.Errorf("failed to register collector for container %q, config %q: %v", cont.info.Name, k, err)
  780. }
  781. }
  782. }
  783. return nil
  784. }
// Create a container.
//
// createContainer is the locking wrapper around createContainerLocked; it is
// safe to call from any goroutine.
func (m *manager) createContainer(containerName string, watchSource watcher.ContainerWatchSource) error {
    m.containersLock.Lock()
    defer m.containersLock.Unlock()

    return m.createContainerLocked(containerName, watchSource)
}
// createContainerLocked begins tracking containerName: it builds a container
// handler, attaches optional collectors (GPU, perf, resctrl, application
// metrics), registers the container and all of its aliases, emits a creation
// event, and starts the container's housekeeping loop. It is a no-op if the
// container is already tracked or the handler declines it. Callers must hold
// m.containersLock.
func (m *manager) createContainerLocked(containerName string, watchSource watcher.ContainerWatchSource) error {
    namespacedName := namespacedContainerName{
        Name: containerName,
    }

    // Check that the container didn't already exist.
    if _, ok := m.containers[namespacedName]; ok {
        return nil
    }

    handler, accept, err := container.NewContainerHandler(containerName, watchSource, m.containerEnvMetadataWhiteList, m.inHostNamespace)
    if err != nil {
        return err
    }
    if !accept {
        // ignoring this container.
        klog.V(4).Infof("ignoring container %q", containerName)
        return nil
    }
    collectorManager, err := collector.NewCollectorManager()
    if err != nil {
        return err
    }

    // Per-container usage logging is only enabled for cAdvisor's own container.
    logUsage := *logCadvisorUsage && containerName == m.cadvisorContainer
    cont, err := newContainerData(containerName, m.memoryCache, handler, logUsage, collectorManager, m.maxHousekeepingInterval, m.allowDynamicHousekeeping, clock.RealClock{})
    if err != nil {
        return err
    }

    // GPU metrics depend on the v1 "devices" cgroup, which does not exist in
    // cgroup v2 unified mode. Failures here are logged, not fatal.
    if !cgroups.IsCgroup2UnifiedMode() {
        devicesCgroupPath, err := handler.GetCgroupPath("devices")
        if err != nil {
            klog.Warningf("Error getting devices cgroup path: %v", err)
        } else {
            cont.nvidiaCollector, err = m.nvidiaManager.GetCollector(devicesCgroupPath)
            if err != nil {
                klog.V(4).Infof("GPU metrics may be unavailable/incomplete for container %s: %s", cont.info.Name, err)
            }
        }
    }
    // Perf event metrics are likewise best-effort.
    if m.includedMetrics.Has(container.PerfMetrics) {
        perfCgroupPath, err := handler.GetCgroupPath("perf_event")
        if err != nil {
            klog.Warningf("Error getting perf_event cgroup path: %q", err)
        } else {
            cont.perfCollector, err = m.perfManager.GetCollector(perfCgroupPath)
            if err != nil {
                klog.Errorf("Perf event metrics will not be available for container %q: %v", containerName, err)
            }
        }
    }
    if m.includedMetrics.Has(container.ResctrlMetrics) {
        // NOTE(review): reads m.machineInfo.Topology without taking machineMu
        // — confirm this is safe during container creation.
        cont.resctrlCollector, err = m.resctrlManager.GetCollector(containerName, func() ([]string, error) {
            return cont.getContainerPids(m.inHostNamespace)
        }, len(m.machineInfo.Topology))
        if err != nil {
            klog.V(4).Infof("resctrl metrics will not be available for container %s: %s", cont.info.Name, err)
        }
    }

    // Add collectors
    labels := handler.GetContainerLabels()
    collectorConfigs := collector.GetCollectorConfigs(labels)
    err = m.registerCollectors(collectorConfigs, cont)
    if err != nil {
        // Collector registration failures are non-fatal: the container is
        // still tracked, just without application metrics.
        klog.Warningf("Failed to register collectors for %q: %v", containerName, err)
    }

    // Add the container name and all its aliases. The aliases must be within the namespace of the factory.
    m.containers[namespacedName] = cont
    for _, alias := range cont.info.Aliases {
        m.containers[namespacedContainerName{
            Namespace: cont.info.Namespace,
            Name:      alias,
        }] = cont
    }

    klog.V(3).Infof("Added container: %q (aliases: %v, namespace: %q)", containerName, cont.info.Aliases, cont.info.Namespace)

    contSpec, err := cont.handler.GetSpec()
    if err != nil {
        return err
    }

    contRef, err := cont.handler.ContainerReference()
    if err != nil {
        return err
    }

    // Surface a creation event stamped with the container's creation time.
    newEvent := &info.Event{
        ContainerName: contRef.Name,
        Timestamp:     contSpec.CreationTime,
        EventType:     info.EventContainerCreation,
    }
    err = m.eventHandler.AddEvent(newEvent)
    if err != nil {
        return err
    }

    // Start the container's housekeeping.
    return cont.Start()
}
// destroyContainer is the locking wrapper around destroyContainerLocked; it
// is safe to call from any goroutine.
func (m *manager) destroyContainer(containerName string) error {
    m.containersLock.Lock()
    defer m.containersLock.Unlock()

    return m.destroyContainerLocked(containerName)
}
  888. func (m *manager) destroyContainerLocked(containerName string) error {
  889. namespacedName := namespacedContainerName{
  890. Name: containerName,
  891. }
  892. cont, ok := m.containers[namespacedName]
  893. if !ok {
  894. // Already destroyed, done.
  895. return nil
  896. }
  897. // Tell the container to stop.
  898. err := cont.Stop()
  899. if err != nil {
  900. return err
  901. }
  902. // Remove the container from our records (and all its aliases).
  903. delete(m.containers, namespacedName)
  904. for _, alias := range cont.info.Aliases {
  905. delete(m.containers, namespacedContainerName{
  906. Namespace: cont.info.Namespace,
  907. Name: alias,
  908. })
  909. }
  910. klog.V(3).Infof("Destroyed container: %q (aliases: %v, namespace: %q)", containerName, cont.info.Aliases, cont.info.Namespace)
  911. contRef, err := cont.handler.ContainerReference()
  912. if err != nil {
  913. return err
  914. }
  915. newEvent := &info.Event{
  916. ContainerName: contRef.Name,
  917. Timestamp: time.Now(),
  918. EventType: info.EventContainerDeletion,
  919. }
  920. err = m.eventHandler.AddEvent(newEvent)
  921. if err != nil {
  922. return err
  923. }
  924. return nil
  925. }
// Detect all containers that have been added or deleted from the specified container.
//
// getContainersDiff compares the live container listing from the handler
// against the manager's registry. The lock is released while the handler is
// queried (it may touch the filesystem/runtime), then re-acquired to compute
// the diff, so the result can be slightly stale by the time it is used.
func (m *manager) getContainersDiff(containerName string) (added []info.ContainerReference, removed []info.ContainerReference, err error) {
    // Get all subcontainers recursively.
    m.containersLock.RLock()
    cont, ok := m.containers[namespacedContainerName{
        Name: containerName,
    }]
    m.containersLock.RUnlock()
    if !ok {
        return nil, nil, fmt.Errorf("failed to find container %q while checking for new containers", containerName)
    }
    allContainers, err := cont.handler.ListContainers(container.ListRecursive)
    if err != nil {
        return nil, nil, err
    }
    // Include the root of the listing itself so it is never reported removed.
    allContainers = append(allContainers, info.ContainerReference{Name: containerName})

    m.containersLock.RLock()
    defer m.containersLock.RUnlock()

    // Determine which were added and which were removed.
    allContainersSet := make(map[string]*containerData)
    for name, d := range m.containers {
        // Only add the canonical name.
        if d.info.Name == name.Name {
            allContainersSet[name.Name] = d
        }
    }

    // Added containers
    for _, c := range allContainers {
        delete(allContainersSet, c.Name)
        _, ok := m.containers[namespacedContainerName{
            Name: c.Name,
        }]
        if !ok {
            added = append(added, c)
        }
    }

    // Removed ones are no longer in the container listing.
    for _, d := range allContainersSet {
        removed = append(removed, d.info.ContainerReference)
    }
    return
}
  968. // Detect the existing subcontainers and reflect the setup here.
  969. func (m *manager) detectSubcontainers(containerName string) error {
  970. added, removed, err := m.getContainersDiff(containerName)
  971. if err != nil {
  972. return err
  973. }
  974. // Add the new containers.
  975. for _, cont := range added {
  976. err = m.createContainer(cont.Name, watcher.Raw)
  977. if err != nil {
  978. klog.Errorf("Failed to create existing container: %s: %s", cont.Name, err)
  979. }
  980. }
  981. // Remove the old containers.
  982. for _, cont := range removed {
  983. err = m.destroyContainer(cont.Name)
  984. if err != nil {
  985. klog.Errorf("Failed to destroy existing container: %s: %s", cont.Name, err)
  986. }
  987. }
  988. return nil
  989. }
  990. // Watches for new containers started in the system. Runs forever unless there is a setup error.
  991. func (m *manager) watchForNewContainers(quit chan error) error {
  992. watched := make([]watcher.ContainerWatcher, 0)
  993. for _, watcher := range m.containerWatchers {
  994. err := watcher.Start(m.eventsChannel)
  995. if err != nil {
  996. for _, w := range watched {
  997. stopErr := w.Stop()
  998. if stopErr != nil {
  999. klog.Warningf("Failed to stop wacher %v with error: %v", w, stopErr)
  1000. }
  1001. }
  1002. return err
  1003. }
  1004. watched = append(watched, watcher)
  1005. }
  1006. // There is a race between starting the watch and new container creation so we do a detection before we read new containers.
  1007. err := m.detectSubcontainers("/")
  1008. if err != nil {
  1009. return err
  1010. }
  1011. // Listen to events from the container handler.
  1012. go func() {
  1013. for {
  1014. select {
  1015. case event := <-m.eventsChannel:
  1016. switch {
  1017. case event.EventType == watcher.ContainerAdd:
  1018. switch event.WatchSource {
  1019. default:
  1020. err = m.createContainer(event.Name, event.WatchSource)
  1021. }
  1022. case event.EventType == watcher.ContainerDelete:
  1023. err = m.destroyContainer(event.Name)
  1024. }
  1025. if err != nil {
  1026. klog.Warningf("Failed to process watch event %+v: %v", event, err)
  1027. }
  1028. case <-quit:
  1029. var errs partialFailure
  1030. // Stop processing events if asked to quit.
  1031. for i, watcher := range m.containerWatchers {
  1032. err := watcher.Stop()
  1033. if err != nil {
  1034. errs.append(fmt.Sprintf("watcher %d", i), "Stop", err)
  1035. }
  1036. }
  1037. if len(errs) > 0 {
  1038. quit <- errs
  1039. } else {
  1040. quit <- nil
  1041. klog.Infof("Exiting thread watching subcontainers")
  1042. return
  1043. }
  1044. }
  1045. }
  1046. }()
  1047. return nil
  1048. }
// watchForNewOoms starts streaming kernel OOM records and converts each one
// into cAdvisor events. It returns an error only if the OOM parser cannot be
// created; all processing happens in background goroutines.
func (m *manager) watchForNewOoms() error {
    klog.V(2).Infof("Started watching for new ooms in manager")
    outStream := make(chan *oomparser.OomInstance, 10)
    oomLog, err := oomparser.New()
    if err != nil {
        return err
    }
    // The parser writes each parsed OOM instance to outStream.
    go oomLog.StreamOoms(outStream)

    go func() {
        for oomInstance := range outStream {
            // Surface OOM and OOM kill events.
            newEvent := &info.Event{
                ContainerName: oomInstance.ContainerName,
                Timestamp:     oomInstance.TimeOfDeath,
                EventType:     info.EventOom,
            }
            err := m.eventHandler.AddEvent(newEvent)
            if err != nil {
                klog.Errorf("failed to add OOM event for %q: %v", oomInstance.ContainerName, err)
            }
            klog.V(3).Infof("Created an OOM event in container %q at %v", oomInstance.ContainerName, oomInstance.TimeOfDeath)

            // A second event records the victim container/process that was
            // actually killed.
            newEvent = &info.Event{
                ContainerName: oomInstance.VictimContainerName,
                Timestamp:     oomInstance.TimeOfDeath,
                EventType:     info.EventOomKill,
                EventData: info.EventData{
                    OomKill: &info.OomKillEventData{
                        Pid:         oomInstance.Pid,
                        ProcessName: oomInstance.ProcessName,
                    },
                },
            }
            err = m.eventHandler.AddEvent(newEvent)
            if err != nil {
                klog.Errorf("failed to add OOM kill event for %q: %v", oomInstance.ContainerName, err)
            }

            // Count OOM events for later collection by prometheus
            request := v2.RequestOptions{
                IdType: v2.TypeName,
                Count:  1,
            }
            conts, err := m.getRequestedContainers(oomInstance.ContainerName, request)
            if err != nil {
                klog.V(2).Infof("failed getting container info for %q: %v", oomInstance.ContainerName, err)
                continue
            }
            if len(conts) != 1 {
                klog.V(2).Info("Expected the request to match only one container")
                continue
            }
            for _, cont := range conts {
                // oomEvents is read concurrently by the metrics endpoint, so
                // the increment must be atomic.
                atomic.AddUint64(&cont.oomEvents, 1)
            }
        }
    }()
    return nil
}
// can be called by the api which will take events returned on the channel
//
// WatchForEvents registers a watch matching request and returns the channel
// on which matching events will be delivered.
func (m *manager) WatchForEvents(request *events.Request) (*events.EventChannel, error) {
    return m.eventHandler.WatchEvents(request)
}
// can be called by the api which will return all events satisfying the request
//
// GetPastEvents returns previously stored events matching request.
func (m *manager) GetPastEvents(request *events.Request) ([]*info.Event, error) {
    return m.eventHandler.GetEvents(request)
}
// called by the api when a client is no longer listening to the channel
//
// CloseEventChannel tears down the watch identified by watchID.
func (m *manager) CloseEventChannel(watchID int) {
    m.eventHandler.StopWatch(watchID)
}
  1118. // Parses the events StoragePolicy from the flags.
  1119. func parseEventsStoragePolicy() events.StoragePolicy {
  1120. policy := events.DefaultStoragePolicy()
  1121. // Parse max age.
  1122. parts := strings.Split(*eventStorageAgeLimit, ",")
  1123. for _, part := range parts {
  1124. items := strings.Split(part, "=")
  1125. if len(items) != 2 {
  1126. klog.Warningf("Unknown event storage policy %q when parsing max age", part)
  1127. continue
  1128. }
  1129. dur, err := time.ParseDuration(items[1])
  1130. if err != nil {
  1131. klog.Warningf("Unable to parse event max age duration %q: %v", items[1], err)
  1132. continue
  1133. }
  1134. if items[0] == "default" {
  1135. policy.DefaultMaxAge = dur
  1136. continue
  1137. }
  1138. policy.PerTypeMaxAge[info.EventType(items[0])] = dur
  1139. }
  1140. // Parse max number.
  1141. parts = strings.Split(*eventStorageEventLimit, ",")
  1142. for _, part := range parts {
  1143. items := strings.Split(part, "=")
  1144. if len(items) != 2 {
  1145. klog.Warningf("Unknown event storage policy %q when parsing max event limit", part)
  1146. continue
  1147. }
  1148. val, err := strconv.Atoi(items[1])
  1149. if err != nil {
  1150. klog.Warningf("Unable to parse integer from %q: %v", items[1], err)
  1151. continue
  1152. }
  1153. if items[0] == "default" {
  1154. policy.DefaultMaxNumEvents = val
  1155. continue
  1156. }
  1157. policy.PerTypeMaxNumEvents[info.EventType(items[0])] = val
  1158. }
  1159. return policy
  1160. }
  1161. func (m *manager) DebugInfo() map[string][]string {
  1162. debugInfo := container.DebugInfo()
  1163. // Get unique containers.
  1164. var conts map[*containerData]struct{}
  1165. func() {
  1166. m.containersLock.RLock()
  1167. defer m.containersLock.RUnlock()
  1168. conts = make(map[*containerData]struct{}, len(m.containers))
  1169. for _, c := range m.containers {
  1170. conts[c] = struct{}{}
  1171. }
  1172. }()
  1173. // List containers.
  1174. lines := make([]string, 0, len(conts))
  1175. for cont := range conts {
  1176. lines = append(lines, cont.info.Name)
  1177. if cont.info.Namespace != "" {
  1178. lines = append(lines, fmt.Sprintf("\tNamespace: %s", cont.info.Namespace))
  1179. }
  1180. if len(cont.info.Aliases) != 0 {
  1181. lines = append(lines, "\tAliases:")
  1182. for _, alias := range cont.info.Aliases {
  1183. lines = append(lines, fmt.Sprintf("\t\t%s", alias))
  1184. }
  1185. }
  1186. }
  1187. debugInfo["Managed containers"] = lines
  1188. return debugInfo
  1189. }
  1190. func (m *manager) getFsInfoByDeviceName(deviceName string) (v2.FsInfo, error) {
  1191. mountPoint, err := m.fsInfo.GetMountpointForDevice(deviceName)
  1192. if err != nil {
  1193. return v2.FsInfo{}, fmt.Errorf("failed to get mount point for device %q: %v", deviceName, err)
  1194. }
  1195. infos, err := m.GetFsInfo("")
  1196. if err != nil {
  1197. return v2.FsInfo{}, err
  1198. }
  1199. for _, info := range infos {
  1200. if info.Mountpoint == mountPoint {
  1201. return info, nil
  1202. }
  1203. }
  1204. return v2.FsInfo{}, fmt.Errorf("cannot find filesystem info for device %q", deviceName)
  1205. }
  1206. func getVersionInfo() (*info.VersionInfo, error) {
  1207. kernelVersion := machine.KernelVersion()
  1208. osVersion := machine.ContainerOsVersion()
  1209. return &info.VersionInfo{
  1210. KernelVersion: kernelVersion,
  1211. ContainerOsVersion: osVersion,
  1212. CadvisorVersion: version.Info["version"],
  1213. CadvisorRevision: version.Info["revision"],
  1214. }, nil
  1215. }
  1216. // Helper for accumulating partial failures.
  1217. type partialFailure []string
  1218. func (f *partialFailure) append(id, operation string, err error) {
  1219. *f = append(*f, fmt.Sprintf("[%q: %s: %s]", id, operation, err))
  1220. }
  1221. func (f partialFailure) Error() string {
  1222. return fmt.Sprintf("partial failures: %s", strings.Join(f, ", "))
  1223. }
  1224. func (f partialFailure) OrNil() error {
  1225. if len(f) == 0 {
  1226. return nil
  1227. }
  1228. return f
  1229. }