container_cph_amd_gpu_metrics.go 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. // Copyright 2019 Yunion
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // you may not use this file except in compliance with the License.
  15. // You may obtain a copy of the License at
  16. //
  17. // http://www.apache.org/licenses/LICENSE-2.0
  18. //
  19. // Unless required by applicable law or agreed to in writing, software
  20. // distributed under the License is distributed on an "AS IS" BASIS,
  21. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22. // See the License for the specific language governing permissions and
  23. // limitations under the License.
  24. package hostmetrics
  25. import (
  26. "os"
  27. "path"
  28. "regexp"
  29. "strconv"
  30. "strings"
  31. "yunion.io/x/log"
  32. "yunion.io/x/pkg/errors"
  33. "yunion.io/x/onecloud/pkg/util/fileutils2"
  34. "yunion.io/x/onecloud/pkg/util/procutils"
  35. )
  36. type CphAmdGpuProcessMetrics struct {
  37. Pid string // Process ID
  38. DevId string
  39. Mem float64 // Memory Utilization
  40. MemUtil float64
  41. }
/*
Sample contents of /sys/kernel/debug/dri/<N>/amdgpu_gem_info:

pid 2088269 command allocator@2.0-s:
	0x00000001: 4096 byte GTT CPU_ACCESS_REQUIRED
	0x00000002: 2097152 byte GTT CPU_ACCESS_REQUIRED
	0x00000003: 2097152 byte VRAM VRAM_CLEARED
	0x00000004: 2097152 byte VRAM NO_CPU_ACCESS VRAM_CLEARED
	0x00000006: 2097152 byte GTT CPU_ACCESS_REQUIRED VRAM_CLEARED
	0x00000007: 2097152 byte GTT CPU_ACCESS_REQUIRED VRAM_CLEARED
*/
  51. func GetCphAmdGpuProcessMetrics() ([]CphAmdGpuProcessMetrics, error) {
  52. debugDriDir := "/sys/kernel/debug/dri"
  53. entrys, err := os.ReadDir(debugDriDir)
  54. if err != nil {
  55. return nil, errors.Wrap(err, "os.ReadDir")
  56. }
  57. res := make([]CphAmdGpuProcessMetrics, 0)
  58. for i := range entrys {
  59. if entrys[i].IsDir() {
  60. fpath := path.Join(debugDriDir, entrys[i].Name(), "amdgpu_gem_info")
  61. if fileutils2.Exists(fpath) {
  62. content, err := fileutils2.FileGetContents(fpath)
  63. if err != nil {
  64. log.Errorf("failed FileGetContents %s: %s", fpath, err)
  65. continue
  66. }
  67. vramInfoPath := path.Join(debugDriDir, entrys[i].Name(), "amdgpu_vram_mm")
  68. memTotalSize, err := getVramTotalSizeMb(vramInfoPath)
  69. if err != nil {
  70. log.Errorf("failed getVramTotalSizeMb %s", err)
  71. }
  72. metrics := parseCphAmdGpuGemInfo(content, entrys[i].Name(), memTotalSize)
  73. if len(metrics) > 0 {
  74. res = append(res, metrics...)
  75. }
  76. }
  77. }
  78. }
  79. return res, nil
  80. }
  81. var pagesRe = regexp.MustCompile(`man size:(\d+) pages`)
  82. // man size:8384512 pages, ram usage:3745MB, vis usage:241MB
  83. func getVramTotalSizeMb(vramInfoPath string) (int, error) {
  84. if !fileutils2.Exists(vramInfoPath) {
  85. return 0, nil
  86. }
  87. out, err := procutils.NewCommand("tail", "-n", "1", vramInfoPath).Output()
  88. if err != nil {
  89. return 0, errors.Wrapf(err, "tail -n 1 %s", vramInfoPath)
  90. }
  91. str := strings.TrimSpace(string(out))
  92. matches := pagesRe.FindStringSubmatch(str)
  93. if len(matches) > 1 {
  94. pages, err := strconv.Atoi(matches[1])
  95. if err != nil {
  96. return 0, errors.Wrapf(err, " failed parse pages count %s", matches[0])
  97. }
  98. return pages * 4 * 1024 / 1024 / 1024, nil
  99. }
  100. return 0, errors.Errorf("failed parse pages count: %s", str)
  101. }
  102. func parseCphAmdGpuGemInfo(content string, devId string, memTotalSizeMB int) []CphAmdGpuProcessMetrics {
  103. res := make([]CphAmdGpuProcessMetrics, 0)
  104. lines := strings.Split(content, "\n")
  105. var i, length = 0, len(lines)
  106. for i < length {
  107. line := strings.TrimSpace(lines[i])
  108. segs := strings.Fields(line)
  109. if len(segs) < 2 {
  110. i++
  111. continue
  112. }
  113. if segs[0] != "pid" {
  114. i++
  115. continue
  116. }
  117. pid := segs[1]
  118. var vramTotal int64 = 0
  119. j := i + 1
  120. for j < length {
  121. line := strings.TrimSpace(lines[j])
  122. if len(line) == 0 {
  123. break
  124. }
  125. segs := strings.Fields(line)
  126. if len(segs) < 4 {
  127. log.Errorf("unknown output line %s", line)
  128. break
  129. }
  130. if segs[0] == "pid" {
  131. break
  132. }
  133. memUsedStr, memType := segs[1], segs[3]
  134. if memType == "VRAM" {
  135. memUsed, err := strconv.ParseInt(memUsedStr, 10, 64)
  136. if err != nil {
  137. log.Errorf("failed parse memused %s %s: %s", line, memUsedStr, err)
  138. break
  139. }
  140. vramTotal += memUsed
  141. }
  142. j++
  143. }
  144. memSize := float64(vramTotal) / 1024.0 / 1024.0
  145. res = append(res, CphAmdGpuProcessMetrics{
  146. Pid: pid,
  147. DevId: devId,
  148. Mem: memSize,
  149. MemUtil: memSize / float64(memTotalSizeMB) * 100.0,
  150. })
  151. i = j
  152. }
  153. return res
  154. }