container_metrics.go 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997
  // Copyright 2019 Yunion
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //     http://www.apache.org/licenses/LICENSE-2.0
  //
  // Unless required by applicable law or agreed to in writing, software
  // distributed under the License is distributed on an "AS IS" BASIS,
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  // See the License for the specific language governing permissions and
  // limitations under the License.
  14. package hostmetrics
  15. import (
  16. "fmt"
  17. "sort"
  18. "strconv"
  19. "strings"
  20. "time"
  21. "yunion.io/x/log"
  22. apis "yunion.io/x/onecloud/pkg/apis/compute"
  23. "yunion.io/x/onecloud/pkg/hostman/guestman"
  24. "yunion.io/x/onecloud/pkg/hostman/options"
  25. "yunion.io/x/onecloud/pkg/util/pod/stats"
  26. )
  // Field keys of the CPU measurements.
  const (
  	// CPU_USAGE_SECONDS_TOTAL is the cumulative CPU time consumed by the
  	// container, in core-seconds.
  	CPU_USAGE_SECONDS_TOTAL = "usage_seconds_total"
  	// CPU_USAGE_RATE is the CPU usage rate.
  	CPU_USAGE_RATE = "usage_rate"
  )

  // Field keys of the memory measurements.
  const (
  	// MEMORY_WORKING_SET_BYTES is the current working set size in bytes.
  	MEMORY_WORKING_SET_BYTES = "working_set_bytes"
  	MEMORY_USAGE_BYTES = "usage_bytes"
  	// MEMORY_USAGE_RATE is the memory usage rate.
  	MEMORY_USAGE_RATE = "usage_rate"
  	MEMORY_WORKING_SET_RATE = "working_set_rate"
  )

  // Field keys of the volume measurements.
  const (
  	VOLUME_TOTAL = "total"
  	VOLUME_FREE = "free"
  	VOLUME_USED = "used"
  	VOLUME_USED_PERCENT = "used_percent"
  	VOLUME_INODES_TOTAL = "inodes_total"
  	VOLUME_INODES_FREE = "inodes_free"
  	VOLUME_INODES_USED = "inodes_used"
  	VOLUME_INODES_USED_PERCENT = "inodes_used_percent"
  )

  // Field keys of the process measurements.
  const (
  	PROCESS_COUNT = "process_count"
  	FD_COUNT = "fd_count"
  	SOCKET_COUNT = "socket_count"
  	THREADS_CURRENT = "threads_current"
  	THREADS_MAX = "threads_max"
  )

  // Field keys of the NVIDIA GPU measurements.
  const (
  	NVIDIA_GPU_MEMORY_TOTAL = "memory_total"
  	NVIDIA_GPU_INDEX = "index"
  	NVIDIA_GPU_PHYSICAL_INDEX = "physical_index"
  	NVIDIA_GPU_FRAME_BUFFER = "frame_buffer"
  	NVIDIA_GPU_CCPM = "ccpm"
  	NVIDIA_GPU_SM = "sm"
  	NVIDIA_GPU_MEM_UTIL = "mem_util"
  	NVIDIA_GPU_ENC = "enc"
  	NVIDIA_GPU_DEC = "dec"
  	NVIDIA_GPU_JPG = "jpg"
  	NVIDIA_GPU_OFA = "ofa"
  )

  // Field keys of the Vastaitech GPU measurements.
  const (
  	VASTAITECH_GPU_DEV_ID = "dev_id"
  	VASTAITECH_GPU_ENC = "enc"
  	VASTAITECH_GPU_DEC = "dec"
  	VASTAITECH_GPU_GFX = "gfx"
  	VASTAITECH_GPU_MEM = "mem"
  	VASTAITECH_GPU_MEM_UTIL = "mem_util"
  )

  // Field keys of the CPH AMD GPU measurements.
  const (
  	CPH_AMD_GPU_DEV_ID = "dev_id"
  	CPH_AMD_GPU_MEM = "mem"
  	CPH_AMD_GPU_MEM_UTIL = "mem_util"
  )
  72. type CadvisorProcessMetric struct {
  73. // Number of processes
  74. ProcessCount uint64 `json:"process_count"`
  75. // Number of open file descriptors
  76. FdCount uint64 `json:"fd_count,omitempty"`
  77. // Number of sockets
  78. SocketCount uint64 `json:"socket_count"`
  79. // Number of threads currently in container
  80. ThreadsCurrent uint64 `json:"threads_current,omitempty"`
  81. // Maximum number of threads allowed in container
  82. ThreadsMax uint64 `json:"threads_max,omitempty"`
  83. }
  84. func (m CadvisorProcessMetric) ToMap() map[string]interface{} {
  85. return map[string]interface{}{
  86. PROCESS_COUNT: m.ProcessCount,
  87. FD_COUNT: m.FdCount,
  88. SOCKET_COUNT: m.SocketCount,
  89. THREADS_CURRENT: m.ThreadsCurrent,
  90. THREADS_MAX: m.ThreadsMax,
  91. }
  92. }
  93. type PodMetrics struct {
  94. PodCpu *PodCpuMetric `json:"pod_cpu"`
  95. PodMemory *PodMemoryMetric `json:"pod_memory"`
  96. PodProcess *PodProcessMetric `json:"pod_process"`
  97. PodVolumes []*PodVolumeMetric `json:"pod_volume"`
  98. PodDiskIos PodDiskIoMetrics `json:"pod_disk_ios"`
  99. PodNvidiaGpu []*PodNvidiaGpuMetrics `json:"pod_nvidia_gpu"`
  100. PodVastaitechGpu []*PodVastaitechGpuMetrics `json:"pod_vastaitech_gpu"`
  101. PodCphAmdGpu []*PodCphAmdGpuMetrics `json:"pod_cph_amd_gpu"`
  102. Containers []*ContainerMetrics `json:"containers"`
  103. }
  // PodMetricMeta carries the sample timestamp embedded in pod metrics.
  type PodMetricMeta struct {
  	// Time is the timestamp attached to the sample.
  	Time time.Time
  }

  // NewPodMetricMeta builds a PodMetricMeta stamped with the given time.
  func NewPodMetricMeta(time time.Time) PodMetricMeta {
  	var meta PodMetricMeta
  	meta.Time = time
  	return meta
  }

  // GetTag returns nil: the bare pod meta contributes no tags of its own.
  func (m PodMetricMeta) GetTag() map[string]string {
  	var noTags map[string]string
  	return noTags
  }
  113. type PodCphAmdGpuMetrics struct {
  114. PodMetricMeta
  115. DevId string
  116. Mem float64 // MB
  117. MemUtil float64
  118. }
  119. func (m PodCphAmdGpuMetrics) GetName() string {
  120. return "pod_cph_amd_gpu"
  121. }
  122. func (m PodCphAmdGpuMetrics) GetUniformName() string {
  123. return "pod_gpu"
  124. }
  125. func (m PodCphAmdGpuMetrics) GetTag() map[string]string {
  126. return map[string]string{
  127. "dev_id": m.DevId,
  128. "dev_type": apis.CONTAINER_DEV_CPH_AMD_GPU,
  129. }
  130. }
  131. func (m PodCphAmdGpuMetrics) ToMap() map[string]interface{} {
  132. ret := map[string]interface{}{
  133. CPH_AMD_GPU_DEV_ID: m.DevId,
  134. CPH_AMD_GPU_MEM: m.Mem,
  135. CPH_AMD_GPU_MEM_UTIL: m.MemUtil,
  136. }
  137. return ret
  138. }
  139. type PodVastaitechGpuMetrics struct {
  140. PodMetricMeta
  141. PciAddr string
  142. DevId string
  143. Mem float64 // MB
  144. MemUtil float64
  145. Gfx float64
  146. DecUtil float64
  147. EncUtil float64
  148. }
  149. func (m PodVastaitechGpuMetrics) GetName() string {
  150. return "pod_vastaitech_gpu"
  151. }
  152. func (m PodVastaitechGpuMetrics) GetUniformName() string {
  153. return "pod_gpu"
  154. }
  155. func (m PodVastaitechGpuMetrics) GetTag() map[string]string {
  156. return map[string]string{
  157. "dev_id": m.DevId,
  158. "dev_type": apis.CONTAINER_DEV_VASTAITECH_GPU,
  159. }
  160. }
  161. func (m PodVastaitechGpuMetrics) ToMap() map[string]interface{} {
  162. ret := map[string]interface{}{
  163. VASTAITECH_GPU_DEC: m.DecUtil,
  164. VASTAITECH_GPU_DEV_ID: m.DevId,
  165. VASTAITECH_GPU_ENC: m.EncUtil,
  166. VASTAITECH_GPU_GFX: m.Gfx,
  167. VASTAITECH_GPU_MEM: m.Mem,
  168. VASTAITECH_GPU_MEM_UTIL: m.MemUtil,
  169. }
  170. return ret
  171. }
  172. type PodNvidiaGpuMetrics struct {
  173. PodMetricMeta
  174. Index int
  175. PhysicalIndex int
  176. MemTotal int
  177. Framebuffer int // Framebuffer Memory Usage
  178. Ccpm int // Current CUDA Contexts Per Measurement
  179. SmUtil float64 // Streaming Multiprocessor Utilization
  180. Mem int // Mem Usage
  181. MemUtil float64 // Memory Utilization
  182. EncUtil float64 // Encoder Utilization
  183. DecUtil float64 // Decoder Utilization
  184. JpgUtil float64 // JPEG Decoder Utilization
  185. OfaUtil float64 // Other Feature Utilization
  186. }
  187. func (m PodNvidiaGpuMetrics) GetName() string {
  188. return "pod_nvidia_gpu"
  189. }
  190. func (m PodNvidiaGpuMetrics) GetUniformName() string {
  191. return "pod_gpu"
  192. }
  193. func (m PodNvidiaGpuMetrics) GetTag() map[string]string {
  194. devType := apis.CONTAINER_DEV_NVIDIA_GPU
  195. if options.HostOptions.EnableCudaMPS {
  196. devType = apis.CONTAINER_DEV_NVIDIA_MPS
  197. }
  198. return map[string]string{
  199. "index": strconv.Itoa(m.Index),
  200. "physical_index": strconv.Itoa(m.PhysicalIndex),
  201. "dev_type": devType,
  202. }
  203. }
  204. func (m PodNvidiaGpuMetrics) ToMap() map[string]interface{} {
  205. ret := map[string]interface{}{
  206. NVIDIA_GPU_MEMORY_TOTAL: m.MemTotal,
  207. NVIDIA_GPU_INDEX: m.Index,
  208. NVIDIA_GPU_PHYSICAL_INDEX: m.PhysicalIndex,
  209. NVIDIA_GPU_FRAME_BUFFER: m.Framebuffer,
  210. NVIDIA_GPU_CCPM: m.Ccpm,
  211. NVIDIA_GPU_SM: m.SmUtil,
  212. NVIDIA_GPU_MEM_UTIL: m.MemUtil,
  213. NVIDIA_GPU_ENC: m.EncUtil,
  214. NVIDIA_GPU_DEC: m.DecUtil,
  215. NVIDIA_GPU_JPG: m.JpgUtil,
  216. NVIDIA_GPU_OFA: m.OfaUtil,
  217. }
  218. return ret
  219. }
  220. type PodCpuMetric struct {
  221. PodMetricMeta
  222. CpuUsageSecondsTotal float64 `json:"cpu_usage_seconds_total"`
  223. CpuUsageRate *float64 `json:"cpu_usage_rate"`
  224. }
  225. func (m PodCpuMetric) GetName() string {
  226. return "pod_cpu"
  227. }
  228. func (m PodCpuMetric) ToMap() map[string]interface{} {
  229. ret := map[string]interface{}{
  230. CPU_USAGE_SECONDS_TOTAL: m.CpuUsageSecondsTotal,
  231. }
  232. if m.CpuUsageRate != nil {
  233. ret[CPU_USAGE_RATE] = *m.CpuUsageRate
  234. }
  235. return ret
  236. }
  237. type PodMemoryMetric struct {
  238. PodMetricMeta
  239. MemoryWorkingSetBytes float64 `json:"memory_working_set_bytes"`
  240. MemoryWorkingSetRate float64 `json:"memory_working_set_rate"`
  241. MemoryUsageBytes float64 `json:"memory_usage_bytes"`
  242. MemoryUsageRate float64 `json:"memory_usage_rate"`
  243. }
  244. func (m PodMemoryMetric) GetName() string {
  245. return "pod_mem"
  246. }
  247. func (m PodMemoryMetric) ToMap() map[string]interface{} {
  248. return map[string]interface{}{
  249. MEMORY_WORKING_SET_BYTES: m.MemoryWorkingSetBytes,
  250. MEMORY_USAGE_BYTES: m.MemoryUsageBytes,
  251. MEMORY_USAGE_RATE: m.MemoryUsageRate,
  252. MEMORY_WORKING_SET_RATE: m.MemoryWorkingSetRate,
  253. }
  254. }
  255. type PodProcessMetric struct {
  256. PodMetricMeta
  257. *CadvisorProcessMetric
  258. }
  259. func (m PodProcessMetric) GetName() string {
  260. return "pod_process"
  261. }
  262. type PodVolumeMetric struct {
  263. ContainerMetricMeta
  264. // 容器内挂载路径
  265. MountPath string `json:"mount_path"`
  266. // 宿主机路径
  267. HostPath string `json:"host_path"`
  268. Type string `json:"type"`
  269. Fstype string `json:"fstype"`
  270. Total uint64 `json:"total"`
  271. Free uint64 `json:"free"`
  272. Used uint64 `json:"used"`
  273. UsedPercent float64 `json:"used_percent"`
  274. InodesTotal uint64 `json:"inodes_total"`
  275. InodesUsed uint64 `json:"inodes_used"`
  276. InodesFree uint64 `json:"inodes_free"`
  277. InodesUsedPercent float64 `json:"inodes_used_percent"`
  278. Tags map[string]string `json:"tags"`
  279. }
  // CadvisorDiskIoMetric is the disk IO sample of one device: cumulative
  // byte and operation counters plus the derived per-second rates.
  type CadvisorDiskIoMetric struct {
  	Device string `json:"device"`

  	// Cumulative counters.
  	//AsyncBytes uint64 `json:"async_bytes"`
  	//DiscardBytes uint64 `json:"discard_bytes"`
  	ReadBytes uint64 `json:"read_bytes"`
  	WriteBytes uint64 `json:"write_bytes"`
  	//TotalBytes uint64 `json:"total_bytes"`
  	//AsyncCount uint64 `json:"async_count"`
  	//DiscardCount uint64 `json:"discard_count"`
  	ReadCount uint64 `json:"read_count"`
  	WriteCount uint64 `json:"write_count"`
  	//TotalCount uint64 `json:"total_count"`

  	// Derived rates.
  	ReadIOPS float64 `json:"read_iops"`
  	WriteIOPS float64 `json:"write_iops"`
  	ReadBPS float64 `json:"read_Bps"`
  	WriteBPS float64 `json:"write_Bps"`
  }

  // GetTag returns the device name as the only tag.
  func (m CadvisorDiskIoMetric) GetTag() map[string]string {
  	tags := make(map[string]string, 1)
  	tags["device"] = m.Device
  	return tags
  }

  // ToMap returns the counters and rates keyed by their field names.
  func (m CadvisorDiskIoMetric) ToMap() map[string]interface{} {
  	fields := make(map[string]interface{}, 8)
  	fields["read_bytes"] = m.ReadBytes
  	fields["write_bytes"] = m.WriteBytes
  	fields["read_count"] = m.ReadCount
  	fields["write_count"] = m.WriteCount
  	fields["read_Bps"] = m.ReadBPS
  	fields["write_Bps"] = m.WriteBPS
  	fields["read_iops"] = m.ReadIOPS
  	fields["write_iops"] = m.WriteIOPS
  	return fields
  }
  314. type PodDiskIoMetrics map[string]*PodDiskIoMetric
  315. func newPodDiskIoMetrics(metrics map[string]CadvisorDiskIoMetric, meta PodMetricMeta) PodDiskIoMetrics {
  316. ret := make(map[string]*PodDiskIoMetric)
  317. for k, v := range metrics {
  318. ret[k] = &PodDiskIoMetric{
  319. PodMetricMeta: meta,
  320. CadvisorDiskIoMetric: v,
  321. }
  322. }
  323. return ret
  324. }
  325. func (m PodDiskIoMetrics) ToCadvisorDiskIoMetrics() map[string]CadvisorDiskIoMetric {
  326. ret := make(map[string]CadvisorDiskIoMetric)
  327. for k, v := range m {
  328. ret[k] = v.CadvisorDiskIoMetric
  329. }
  330. return ret
  331. }
  332. func (m PodDiskIoMetrics) GetTime() time.Time {
  333. for _, v := range m {
  334. return v.Time
  335. }
  336. return time.Time{}
  337. }
  338. type PodDiskIoMetric struct {
  339. PodMetricMeta
  340. CadvisorDiskIoMetric
  341. }
  342. func (m PodDiskIoMetric) GetName() string {
  343. return "pod_diskio"
  344. }
  345. func (m PodDiskIoMetric) GetTag() map[string]string {
  346. return m.CadvisorDiskIoMetric.GetTag()
  347. }
  348. func (m PodVolumeMetric) GetName() string {
  349. return "pod_volume"
  350. }
  351. func (m PodVolumeMetric) ToMap() map[string]interface{} {
  352. r := map[string]interface{}{
  353. VOLUME_TOTAL: m.Total,
  354. VOLUME_FREE: m.Free,
  355. VOLUME_USED: m.Used,
  356. VOLUME_USED_PERCENT: m.UsedPercent,
  357. VOLUME_INODES_TOTAL: m.InodesTotal,
  358. VOLUME_INODES_FREE: m.InodesFree,
  359. VOLUME_INODES_USED: m.InodesUsed,
  360. VOLUME_INODES_USED_PERCENT: m.InodesUsedPercent,
  361. }
  362. return r
  363. }
  364. func (m PodVolumeMetric) GetTag() map[string]string {
  365. baseTags := m.ContainerMetricMeta.GetTag()
  366. curTags := map[string]string{
  367. "mount_path": m.MountPath,
  368. "host_path": m.HostPath,
  369. "type": m.Type,
  370. }
  371. for k, v := range curTags {
  372. baseTags[k] = v
  373. }
  374. for k, v := range m.Tags {
  375. baseTags[k] = v
  376. }
  377. return baseTags
  378. }
  379. type ContainerDiskIoMetrics map[string]*ContainerDiskIoMetric
  380. func newContainerDiskioMetrics(metrics map[string]CadvisorDiskIoMetric, meta ContainerMetricMeta) ContainerDiskIoMetrics {
  381. ret := make(map[string]*ContainerDiskIoMetric)
  382. for k, v := range metrics {
  383. ret[k] = &ContainerDiskIoMetric{
  384. ContainerMetricMeta: meta,
  385. CadvisorDiskIoMetric: v,
  386. }
  387. }
  388. return ret
  389. }
  390. func (m ContainerDiskIoMetrics) ToCadvisorDiskIoMetrics() map[string]CadvisorDiskIoMetric {
  391. ret := make(map[string]CadvisorDiskIoMetric)
  392. for k, v := range m {
  393. ret[k] = v.CadvisorDiskIoMetric
  394. }
  395. return ret
  396. }
  397. type ContainerMetrics struct {
  398. ContainerCpu *ContainerCpuMetric `json:"container_cpu"`
  399. ContainerMemory *ContainerMemoryMetric `json:"container_memory"`
  400. ContainerProcess *ContainerProcessMetric `json:"container_process"`
  401. ContainerDiskIos ContainerDiskIoMetrics `json:"container_diskios"`
  402. }
  403. type ContainerMetricMeta struct {
  404. PodMetricMeta
  405. ContainerId string `json:"container_id"`
  406. ContainerName string `json:"container_name"`
  407. PodId string `json:"pod_id"`
  408. }
  409. func (m ContainerMetricMeta) GetTag() map[string]string {
  410. ret := map[string]string{
  411. "pod_id": m.PodId,
  412. "container_name": strings.ReplaceAll(m.ContainerName, " ", "+"),
  413. }
  414. if m.ContainerId != "" {
  415. ret["container_id"] = m.ContainerId
  416. }
  417. return ret
  418. }
  419. type ContainerMemoryMetric struct {
  420. ContainerMetricMeta
  421. MemoryWorkingSetBytes float64 `json:"memory_working_set_bytes"`
  422. MemoryWorkingSetRate float64 `json:"memory_working_set_rate"`
  423. MemoryUsageBytes float64 `json:"memory_usage_bytes"`
  424. MemoryUsageRate float64 `json:"memory_usage_rate"`
  425. }
  426. func (m ContainerMemoryMetric) GetName() string {
  427. return "container_mem"
  428. }
  429. func (m *ContainerMemoryMetric) ToMap() map[string]interface{} {
  430. return map[string]interface{}{
  431. MEMORY_WORKING_SET_BYTES: m.MemoryWorkingSetBytes,
  432. MEMORY_USAGE_BYTES: m.MemoryUsageBytes,
  433. MEMORY_USAGE_RATE: m.MemoryUsageRate,
  434. MEMORY_WORKING_SET_RATE: m.MemoryWorkingSetRate,
  435. }
  436. }
  437. type ContainerCpuMetric struct {
  438. ContainerMetricMeta
  439. CpuUsageSecondsTotal float64 `json:"cpu_usage_seconds_total"`
  440. CpuUsageRate *float64 `json:"cpu_usage_rate"`
  441. }
  442. func (m ContainerCpuMetric) GetName() string {
  443. return "container_cpu"
  444. }
  445. func (m *ContainerCpuMetric) ToMap() map[string]interface{} {
  446. ret := map[string]interface{}{
  447. CPU_USAGE_SECONDS_TOTAL: m.CpuUsageSecondsTotal,
  448. }
  449. if m.CpuUsageRate != nil {
  450. ret[CPU_USAGE_RATE] = *m.CpuUsageRate
  451. }
  452. return ret
  453. }
  454. type ContainerProcessMetric struct {
  455. ContainerMetricMeta
  456. *CadvisorProcessMetric
  457. }
  458. func (m ContainerProcessMetric) GetName() string {
  459. return "container_process"
  460. }
  461. type ContainerDiskIoMetric struct {
  462. ContainerMetricMeta
  463. CadvisorDiskIoMetric
  464. }
  465. func (m ContainerDiskIoMetric) GetName() string {
  466. return "container_diskio"
  467. }
  468. func (m *ContainerDiskIoMetric) GetTag() map[string]string {
  469. baseTags := m.ContainerMetricMeta.GetTag()
  470. for k, v := range m.CadvisorDiskIoMetric.GetTag() {
  471. baseTags[k] = v
  472. }
  473. return baseTags
  474. }
  475. func GetPodStatsById(ss []stats.PodStats, gpuPodProcs map[string]map[string]struct{}, podId string) (*stats.PodStats, map[string]struct{}) {
  476. var podStat *stats.PodStats
  477. for i := range ss {
  478. if ss[i].PodRef.UID == podId {
  479. podStat = &ss[i]
  480. break
  481. }
  482. }
  483. podProcs, _ := gpuPodProcs[podId]
  484. return podStat, podProcs
  485. }
  486. func GetPodNvidiaGpuMetrics(metrics []NvidiaGpuProcessMetrics, podProcs map[string]struct{}) []NvidiaGpuProcessMetrics {
  487. podMetrics := make([]NvidiaGpuProcessMetrics, 0)
  488. for i := range metrics {
  489. pid := metrics[i].Pid
  490. if _, ok := podProcs[pid]; ok {
  491. podMetrics = append(podMetrics, metrics[i])
  492. }
  493. }
  494. return podMetrics
  495. }
  496. func GetPodVastaitechGpuMetrics(metrics []VastaitechGpuProcessMetrics, podProcs map[string]struct{}) []VastaitechGpuProcessMetrics {
  497. podMetrics := make([]VastaitechGpuProcessMetrics, 0)
  498. for i := range metrics {
  499. pid := metrics[i].Pid
  500. if _, ok := podProcs[pid]; ok {
  501. podMetrics = append(podMetrics, metrics[i])
  502. }
  503. }
  504. return podMetrics
  505. }
  506. func GetPodCphAmdGpuMetrics(metrics []CphAmdGpuProcessMetrics, podProcs map[string]struct{}) []CphAmdGpuProcessMetrics {
  507. podMetrics := make([]CphAmdGpuProcessMetrics, 0)
  508. for i := range metrics {
  509. pid := metrics[i].Pid
  510. if _, ok := podProcs[pid]; ok {
  511. podMetrics = append(podMetrics, metrics[i])
  512. }
  513. }
  514. return podMetrics
  515. }
  516. func (s *SGuestMonitorCollector) collectPodMetrics(gm *SGuestMonitor, prevUsage *GuestMetrics) *GuestMetrics {
  517. gmData := new(GuestMetrics)
  518. gmData.PodMetrics = gm.PodMetrics(prevUsage)
  519. // netio
  520. gmData.VmNetio = gm.Netio()
  521. netio1 := gmData.VmNetio
  522. netio2 := prevUsage.VmNetio
  523. s.addNetio(netio1, netio2)
  524. return gmData
  525. }
  526. func NewContainerMetricMeta(serverId string, containerId string, containerName string, time time.Time) ContainerMetricMeta {
  527. return ContainerMetricMeta{
  528. PodMetricMeta: NewPodMetricMeta(time),
  529. ContainerId: containerId,
  530. PodId: serverId,
  531. ContainerName: containerName,
  532. }
  533. }
  534. func (m *SGuestMonitor) HasPodMetrics() bool {
  535. return m.podStat != nil
  536. }
  537. func (m *SGuestMonitor) getVolumeMetrics() []*PodVolumeMetric {
  538. pi := m.instance.(guestman.PodInstance)
  539. if !pi.IsRunning() {
  540. return nil
  541. }
  542. vus, err := pi.GetVolumeMountUsages()
  543. if err != nil {
  544. log.Warningf("get volume mount usages: %v", err)
  545. }
  546. result := make([]*PodVolumeMetric, 0)
  547. for i := range vus {
  548. vu := vus[i]
  549. ctr := pi.GetContainerById(vu.Id)
  550. if ctr == nil {
  551. log.Warningf("not found container by %s", vu.Id)
  552. continue
  553. }
  554. meta := NewContainerMetricMeta(pi.GetId(), vu.Id, ctr.Name, time.Now())
  555. result = append(result, &PodVolumeMetric{
  556. ContainerMetricMeta: meta,
  557. MountPath: vu.MountPath,
  558. HostPath: vu.HostPath,
  559. Type: vu.VolumeType,
  560. Fstype: vu.Usage.Fstype,
  561. Total: vu.Usage.Total,
  562. Free: vu.Usage.Free,
  563. Used: vu.Usage.Used,
  564. UsedPercent: vu.Usage.UsedPercent,
  565. InodesTotal: vu.Usage.InodesTotal,
  566. InodesUsed: vu.Usage.InodesUsed,
  567. InodesFree: vu.Usage.InodesFree,
  568. InodesUsedPercent: vu.Usage.InodesUsedPercent,
  569. Tags: vu.Tags,
  570. })
  571. }
  572. return result
  573. }
  574. func (m *SGuestMonitor) getCadvisorProcessMetric(stat *stats.ProcessStats) *CadvisorProcessMetric {
  575. if stat == nil {
  576. return nil
  577. }
  578. return &CadvisorProcessMetric{
  579. ProcessCount: stat.ProcessCount,
  580. FdCount: stat.FdCount,
  581. SocketCount: stat.SocketCount,
  582. ThreadsCurrent: stat.ThreadsCurrent,
  583. ThreadsMax: stat.ThreadsMax,
  584. }
  585. }
  586. func (m *SGuestMonitor) getCadvisorDiskIoMetrics(cur stats.DiskIoStats, prev map[string]CadvisorDiskIoMetric, curTime, prevTime time.Time) map[string]CadvisorDiskIoMetric {
  587. ret := make(map[string]CadvisorDiskIoMetric)
  588. for devName, stat := range cur {
  589. devR := CadvisorDiskIoMetric{
  590. Device: stat.DeviceName,
  591. ReadCount: stat.ReadCount,
  592. WriteCount: stat.WriteCount,
  593. ReadBytes: stat.ReadBytes,
  594. WriteBytes: stat.WriteBytes,
  595. }
  596. diffTime := float64(curTime.Sub(prevTime) / time.Second)
  597. if diffTime > 0 && prev != nil {
  598. prevStat, ok := prev[devName]
  599. if ok {
  600. // 检查计数器是否回退(可能是容器重启导致)
  601. if stat.ReadCount < prevStat.ReadCount || stat.WriteCount < prevStat.WriteCount ||
  602. stat.ReadBytes < prevStat.ReadBytes || stat.WriteBytes < prevStat.WriteBytes {
  603. log.Warningf("Disk IO counters decreased: guest=%s(%s), device=%s, ReadCount %d -> %d, WriteCount %d -> %d, ReadBytes %d -> %d, WriteBytes %d -> %d. Possible container restart, skipping rate calculation.",
  604. m.Name, m.Id, devName,
  605. prevStat.ReadCount, stat.ReadCount,
  606. prevStat.WriteCount, stat.WriteCount,
  607. prevStat.ReadBytes, stat.ReadBytes,
  608. prevStat.WriteBytes, stat.WriteBytes)
  609. // 跳过计算,避免产生负值
  610. } else {
  611. devR.ReadBPS = float64(stat.ReadBytes-prevStat.ReadBytes) / diffTime
  612. devR.WriteBPS = float64(stat.WriteBytes-prevStat.WriteBytes) / diffTime
  613. devR.ReadIOPS = float64(stat.ReadCount-prevStat.ReadCount) / diffTime
  614. devR.WriteIOPS = float64(stat.WriteCount-prevStat.WriteCount) / diffTime
  615. }
  616. }
  617. }
  618. ret[devName] = devR
  619. }
  620. return ret
  621. }
  622. func isPodContainerStopped(prevUsage *GuestMetrics, stat *stats.PodStats) bool {
  623. hasPrevUsage := prevUsage != nil && prevUsage.PodMetrics != nil
  624. if !hasPrevUsage {
  625. return false
  626. }
  627. curTime := stat.CPU.Time.Time
  628. podCpu := &PodCpuMetric{
  629. PodMetricMeta: NewPodMetricMeta(curTime),
  630. CpuUsageSecondsTotal: float64(*stat.CPU.UsageCoreNanoSeconds) / float64(time.Second),
  631. }
  632. pmPodCpu := prevUsage.PodMetrics.PodCpu
  633. if podCpu.CpuUsageSecondsTotal < pmPodCpu.CpuUsageSecondsTotal {
  634. return true
  635. }
  636. return false
  637. }
  638. func (m *SGuestMonitor) PodMetrics(prevUsage *GuestMetrics) *PodMetrics {
  639. stat := m.podStat
  640. if stat == nil || stat.CPU == nil || stat.CPU.UsageCoreNanoSeconds == nil || stat.Memory == nil || stat.Memory.WorkingSetBytes == nil || stat.Memory.UsageBytes == nil {
  641. log.Warningf("skip pod metrics for %s(%s): incomplete pod stats", m.Name, m.Id)
  642. return nil
  643. }
  644. curTime := stat.CPU.Time.Time
  645. podCpu := &PodCpuMetric{
  646. PodMetricMeta: NewPodMetricMeta(curTime),
  647. CpuUsageSecondsTotal: float64(*stat.CPU.UsageCoreNanoSeconds) / float64(time.Second),
  648. }
  649. hasPrevUsage := prevUsage != nil && prevUsage.PodMetrics != nil
  650. if hasPrevUsage {
  651. pmPodCpu := prevUsage.PodMetrics.PodCpu
  652. val := (podCpu.CpuUsageSecondsTotal - pmPodCpu.CpuUsageSecondsTotal) / podCpu.Time.Sub(pmPodCpu.Time).Seconds() * 100
  653. podCpu.CpuUsageRate = &val
  654. }
  655. podMemory := &PodMemoryMetric{
  656. MemoryWorkingSetBytes: float64(*stat.Memory.WorkingSetBytes),
  657. MemoryWorkingSetRate: (float64(*stat.Memory.WorkingSetBytes) / float64(m.MemMB*1024*1024)) * 100,
  658. MemoryUsageBytes: float64(*stat.Memory.UsageBytes),
  659. MemoryUsageRate: (float64(*stat.Memory.UsageBytes) / float64(m.MemMB*1024*1024)) * 100,
  660. }
  661. containers := make([]*ContainerMetrics, 0)
  662. for _, ctr := range stat.Containers {
  663. if ctr.CPU == nil || ctr.CPU.UsageCoreNanoSeconds == nil || ctr.Memory == nil || ctr.Memory.WorkingSetBytes == nil || ctr.Memory.UsageBytes == nil {
  664. log.Warningf("skip incomplete container stats in pod %s(%s), container=%s", m.Name, m.Id, ctr.Name)
  665. continue
  666. }
  667. ctrMeta := NewContainerMetricMeta(m.Id, "", ctr.Name, ctr.CPU.Time.Time)
  668. cm := &ContainerMetrics{
  669. ContainerCpu: &ContainerCpuMetric{
  670. ContainerMetricMeta: ctrMeta,
  671. CpuUsageSecondsTotal: float64(*ctr.CPU.UsageCoreNanoSeconds) / float64(time.Second),
  672. },
  673. ContainerMemory: &ContainerMemoryMetric{
  674. ContainerMetricMeta: ctrMeta,
  675. MemoryWorkingSetBytes: float64(*ctr.Memory.WorkingSetBytes),
  676. MemoryWorkingSetRate: (float64(*ctr.Memory.WorkingSetBytes) / float64(m.MemMB*1024*1024)) * 100,
  677. MemoryUsageBytes: float64(*ctr.Memory.UsageBytes),
  678. MemoryUsageRate: (float64(*ctr.Memory.UsageBytes) / float64(m.MemMB*1024*1024)) * 100,
  679. },
  680. }
  681. var prevCtrM *ContainerMetrics
  682. if hasPrevUsage {
  683. for _, prevCtr := range prevUsage.PodMetrics.Containers {
  684. if prevCtr.ContainerCpu.ContainerName == ctr.Name {
  685. prevCtrM = prevCtr
  686. val := (cm.ContainerCpu.CpuUsageSecondsTotal - prevCtr.ContainerCpu.CpuUsageSecondsTotal) / cm.ContainerCpu.Time.Sub(prevCtr.ContainerCpu.Time).Seconds() * 100
  687. cm.ContainerCpu.CpuUsageRate = &val
  688. break
  689. }
  690. }
  691. }
  692. if ctr.ProcessStats != nil {
  693. cm.ContainerProcess = &ContainerProcessMetric{
  694. ContainerMetricMeta: ctrMeta,
  695. CadvisorProcessMetric: m.getCadvisorProcessMetric(ctr.ProcessStats),
  696. }
  697. }
  698. if ctr.DiskIo != nil {
  699. var prevStat map[string]CadvisorDiskIoMetric
  700. var prevTime = ctrMeta.Time
  701. if prevCtrM != nil {
  702. if prevCtrM.ContainerDiskIos != nil {
  703. prevStat = prevCtrM.ContainerDiskIos.ToCadvisorDiskIoMetrics()
  704. prevTime = prevCtrM.ContainerCpu.Time
  705. }
  706. }
  707. cm.ContainerDiskIos = newContainerDiskioMetrics(m.getCadvisorDiskIoMetrics(ctr.DiskIo, prevStat, ctrMeta.Time, prevTime), ctrMeta)
  708. }
  709. containers = append(containers, cm)
  710. }
  711. var podProcess *PodProcessMetric
  712. if stat.ProcessStats != nil {
  713. podProcess = &PodProcessMetric{
  714. CadvisorProcessMetric: m.getCadvisorProcessMetric(stat.ProcessStats),
  715. }
  716. }
  717. pm := &PodMetrics{
  718. PodCpu: podCpu,
  719. PodMemory: podMemory,
  720. PodProcess: podProcess,
  721. PodVolumes: m.getVolumeMetrics(),
  722. PodNvidiaGpu: m.getPodNvidiaGpuMetrics(),
  723. PodVastaitechGpu: m.getPodVastaitechGpuMetrics(),
  724. PodCphAmdGpu: m.getPodCphAmdGpuMetrics(),
  725. Containers: containers,
  726. }
  727. if stat.DiskIo != nil {
  728. var prevStat map[string]CadvisorDiskIoMetric
  729. var prevTime = curTime
  730. if hasPrevUsage {
  731. pd := prevUsage.PodMetrics.PodDiskIos
  732. if pd != nil && len(pd) != 0 {
  733. prevStat = pd.ToCadvisorDiskIoMetrics()
  734. prevTime = pd.GetTime()
  735. }
  736. }
  737. podMeta := NewPodMetricMeta(curTime)
  738. pm.PodDiskIos = newPodDiskIoMetrics(m.getCadvisorDiskIoMetrics(stat.DiskIo, prevStat, curTime, prevTime), podMeta)
  739. }
  740. return pm
  741. }
  742. func (m *SGuestMonitor) getPodCphAmdGpuMetrics() []*PodCphAmdGpuMetrics {
  743. if len(m.cphAmdGpuMetrics) == 0 {
  744. return nil
  745. }
  746. addrGpuMap := map[string]*PodCphAmdGpuMetrics{}
  747. for i := range m.cphAmdGpuMetrics {
  748. devId := m.cphAmdGpuMetrics[i].DevId
  749. gms, ok := addrGpuMap[devId]
  750. if !ok {
  751. gms = new(PodCphAmdGpuMetrics)
  752. gms.DevId = devId
  753. }
  754. gms.Mem += m.cphAmdGpuMetrics[i].Mem
  755. gms.MemUtil += m.cphAmdGpuMetrics[i].MemUtil
  756. addrGpuMap[devId] = gms
  757. }
  758. res := make([]*PodCphAmdGpuMetrics, 0)
  759. for _, gms := range addrGpuMap {
  760. res = append(res, gms)
  761. }
  762. return res
  763. }
  764. func (m *SGuestMonitor) getPodVastaitechGpuMetrics() []*PodVastaitechGpuMetrics {
  765. if len(m.vastaitechGpuMetrics) == 0 {
  766. return nil
  767. }
  768. addrGpuMap := map[string]*PodVastaitechGpuMetrics{}
  769. for i := range m.vastaitechGpuMetrics {
  770. pciAddr := m.vastaitechGpuMetrics[i].PciAddr
  771. gms, ok := addrGpuMap[pciAddr]
  772. if !ok {
  773. gms = new(PodVastaitechGpuMetrics)
  774. gms.DevId = m.vastaitechGpuMetrics[i].DevId
  775. gms.PciAddr = m.vastaitechGpuMetrics[i].PciAddr
  776. }
  777. gms.Mem += m.vastaitechGpuMetrics[i].GfxMem
  778. gms.MemUtil += m.vastaitechGpuMetrics[i].GfxMemUsage
  779. gms.Gfx += m.vastaitechGpuMetrics[i].Gfx
  780. gms.DecUtil += m.vastaitechGpuMetrics[i].Dec
  781. gms.EncUtil += m.vastaitechGpuMetrics[i].Enc
  782. addrGpuMap[pciAddr] = gms
  783. }
  784. res := make([]*PodVastaitechGpuMetrics, 0)
  785. for _, gms := range addrGpuMap {
  786. res = append(res, gms)
  787. }
  788. return res
  789. }
  790. func (m *SGuestMonitor) getPodNvidiaGpuMetrics() []*PodNvidiaGpuMetrics {
  791. if len(m.nvidiaGpuMetrics) == 0 {
  792. return nil
  793. }
  794. indexGpuMap := map[int]*PodNvidiaGpuMetrics{}
  795. for i := range m.nvidiaGpuMetrics {
  796. index := m.nvidiaGpuMetrics[i].Index
  797. gms, ok := indexGpuMap[index]
  798. if !ok {
  799. gms = new(PodNvidiaGpuMetrics)
  800. }
  801. gms.Framebuffer += m.nvidiaGpuMetrics[i].FB
  802. gms.Ccpm += m.nvidiaGpuMetrics[i].Ccpm
  803. gms.SmUtil += m.nvidiaGpuMetrics[i].Sm
  804. gms.EncUtil += m.nvidiaGpuMetrics[i].Enc
  805. gms.DecUtil += m.nvidiaGpuMetrics[i].Dec
  806. gms.JpgUtil += m.nvidiaGpuMetrics[i].Jpg
  807. gms.OfaUtil += m.nvidiaGpuMetrics[i].Ofa
  808. indexGpuMap[index] = gms
  809. }
  810. indexs := make([]int, 0)
  811. for index, gms := range indexGpuMap {
  812. indexs = append(indexs, index)
  813. indexStr := strconv.Itoa(index)
  814. memSizeTotal, ok := m.nvidiaGpuIndexMemoryMap[indexStr]
  815. if !ok {
  816. continue
  817. }
  818. gms.MemTotal = memSizeTotal
  819. gms.Mem = gms.Framebuffer
  820. gms.MemUtil = float64(gms.Framebuffer) / float64(gms.MemTotal)
  821. }
  822. sort.Ints(indexs)
  823. res := make([]*PodNvidiaGpuMetrics, len(indexs))
  824. for i := range indexs {
  825. gms := indexGpuMap[indexs[i]]
  826. gms.PhysicalIndex = gms.Index
  827. gms.Index = i
  828. res[i] = gms
  829. }
  830. return res
  831. }
  // iPodMetric is implemented by every metric that can be rendered as a
  // line of telegraf data: a measurement name, a tag set and a field map.
  type iPodMetric interface {
  	// GetName returns the measurement name.
  	GetName() string
  	// GetTag returns the tags of the sample; it may return nil.
  	GetTag() map[string]string
  	// ToMap returns the fields of the sample.
  	ToMap() map[string]interface{}
  }

  // iPodUniformName is implemented by metrics that are additionally
  // reported under a second, shared measurement name.
  type iPodUniformName interface {
  	// GetUniformName returns the shared measurement name.
  	GetUniformName() string
  }
  840. func (d *GuestMetrics) toPodTelegrafData(tagStr string) []string {
  841. m := d.PodMetrics
  842. ims := []iPodMetric{m.PodCpu, m.PodMemory}
  843. for i := range m.PodVolumes {
  844. ims = append(ims, m.PodVolumes[i])
  845. }
  846. if m.PodProcess != nil {
  847. ims = append(ims, m.PodProcess)
  848. }
  849. if m.PodDiskIos != nil {
  850. for _, d := range m.PodDiskIos {
  851. ims = append(ims, d)
  852. }
  853. }
  854. for i := range m.PodNvidiaGpu {
  855. ims = append(ims, m.PodNvidiaGpu[i])
  856. }
  857. for i := range m.PodVastaitechGpu {
  858. ims = append(ims, m.PodVastaitechGpu[i])
  859. }
  860. for i := range m.PodCphAmdGpu {
  861. ims = append(ims, m.PodCphAmdGpu[i])
  862. }
  863. for _, c := range m.Containers {
  864. ims = append(ims, c.ContainerCpu)
  865. ims = append(ims, c.ContainerMemory)
  866. if c.ContainerProcess != nil {
  867. ims = append(ims, c.ContainerProcess)
  868. }
  869. for _, cd := range c.ContainerDiskIos {
  870. ims = append(ims, cd)
  871. }
  872. }
  873. res := []string{}
  874. for _, im := range ims {
  875. tagMap := im.GetTag()
  876. newTagStr := tagStr
  877. if len(tagMap) != 0 {
  878. var newTagArr []string
  879. for k, v := range tagMap {
  880. newTagArr = append(newTagArr, fmt.Sprintf("%s=%s", k, v))
  881. }
  882. newTagStr = strings.Join([]string{tagStr, strings.Join(newTagArr, ",")}, ",")
  883. }
  884. res = append(res, fmt.Sprintf("%s,%s %s", im.GetName(), newTagStr, d.mapToStatStr(im.ToMap())))
  885. if imu, ok := im.(iPodUniformName); ok {
  886. if un := imu.GetUniformName(); un != "" {
  887. res = append(res, fmt.Sprintf("%s,%s %s", un, newTagStr, d.mapToStatStr(im.ToMap())))
  888. }
  889. }
  890. }
  891. res = append(res, d.netioToTelegrafData("pod_netio", tagStr)...)
  892. return res
  893. }