container_nvidia_gpu_metrics.go 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. // Copyright 2019 Yunion
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package hostmetrics
  15. import (
  16. "fmt"
  17. "os"
  18. "path"
  19. "strconv"
  20. "strings"
  21. "yunion.io/x/log"
  22. "yunion.io/x/pkg/errors"
  23. "yunion.io/x/pkg/utils"
  24. "yunion.io/x/onecloud/pkg/apis/compute"
  25. "yunion.io/x/onecloud/pkg/hostman/guestman"
  26. "yunion.io/x/onecloud/pkg/util/cgrouputils"
  27. "yunion.io/x/onecloud/pkg/util/procutils"
  28. )
  29. type NvidiaGpuProcessMetrics struct {
  30. Index int // Gpu Index
  31. Pid string // Process ID
  32. Type string // Process Type C/G, Compute or Graphics
  33. FB int // Framebuffer Memory Usage
  34. Ccpm int // Current CUDA Contexts Per Measurement
  35. Sm float64 // Streaming Multiprocessor Utilization
  36. Mem float64 // Memory Utilization
  37. Enc float64 // Encoder Utilization
  38. Dec float64 // Decoder Utilization
  39. Jpg float64 // JPEG Decoder Utilization
  40. Ofa float64 // Other Feature Utilization
  41. Command string // Process Command Name
  42. }
  43. func GetNvidiaGpuProcessMetrics() ([]NvidiaGpuProcessMetrics, error) {
  44. cmd := "nvidia-smi pmon -s mu -c 1"
  45. output, err := procutils.NewRemoteCommandAsFarAsPossible("bash", "-c", cmd).Output()
  46. if err != nil {
  47. return nil, errors.Wrapf(err, "Execute %s failed: %s", cmd, output)
  48. }
  49. return parseNvidiaGpuProcessMetrics(string(output)), nil
  50. }
  51. /*
  52. # gpu pid type fb ccpm sm mem enc dec jpg ofa command
  53. # Idx # C/G MB MB % % % % % % name
  54. */
  55. func parseNvidiaGpuProcessMetrics(gpuMetricsStr string) []NvidiaGpuProcessMetrics {
  56. gpuProcessMetrics := make([]NvidiaGpuProcessMetrics, 0)
  57. lines := strings.Split(gpuMetricsStr, "\n")
  58. for _, line := range lines {
  59. // Skip comments and blank lines
  60. if strings.HasPrefix(line, "#") || len(strings.TrimSpace(line)) == 0 {
  61. continue
  62. }
  63. var processMetrics NvidiaGpuProcessMetrics
  64. var fb, ccpm, sm, mem, enc, dec, jpg, ofa string
  65. _, err := fmt.Sscanf(line, "%d %s %s %s %s %s %s %s %s %s %s %s",
  66. &processMetrics.Index, &processMetrics.Pid, &processMetrics.Type, &fb, &ccpm,
  67. &sm, &mem, &enc, &dec, &jpg, &ofa, &processMetrics.Command)
  68. if err != nil {
  69. log.Errorf("failed parse nvidia gpu metrics %s: %s", line, err)
  70. continue
  71. }
  72. if processMetrics.Command == "nvidia-cuda-mps" || processMetrics.Command == "-" {
  73. continue
  74. }
  75. if fb != "-" {
  76. val, err := strconv.Atoi(fb)
  77. if err != nil {
  78. log.Errorf("failed parse sm %s: %s", sm, err)
  79. }
  80. processMetrics.FB = val
  81. }
  82. if ccpm != "-" {
  83. val, err := strconv.Atoi(ccpm)
  84. if err != nil {
  85. log.Errorf("failed parse sm %s: %s", sm, err)
  86. }
  87. processMetrics.Ccpm = val
  88. }
  89. if sm != "-" {
  90. val, err := strconv.ParseFloat(sm, 64)
  91. if err != nil {
  92. log.Errorf("failed parse sm %s: %s", sm, err)
  93. }
  94. processMetrics.Sm = val
  95. }
  96. if mem != "-" {
  97. val, err := strconv.ParseFloat(mem, 64)
  98. if err != nil {
  99. log.Errorf("failed parse mem %s: %s", mem, err)
  100. }
  101. processMetrics.Mem = val
  102. }
  103. if enc != "-" {
  104. val, err := strconv.ParseFloat(enc, 64)
  105. if err != nil {
  106. log.Errorf("failed parse enc %s: %s", enc, err)
  107. }
  108. processMetrics.Enc = val
  109. }
  110. if dec != "-" {
  111. val, err := strconv.ParseFloat(dec, 64)
  112. if err != nil {
  113. log.Errorf("failed parse dec %s: %s", dec, err)
  114. }
  115. processMetrics.Dec = val
  116. }
  117. if jpg != "-" {
  118. val, err := strconv.ParseFloat(jpg, 64)
  119. if err != nil {
  120. log.Errorf("failed parse jpg %s: %s", jpg, err)
  121. }
  122. processMetrics.Jpg = val
  123. }
  124. if ofa != "-" {
  125. val, err := strconv.ParseFloat(ofa, 64)
  126. if err != nil {
  127. log.Errorf("failed parse ofa %s: %s", ofa, err)
  128. }
  129. processMetrics.Ofa = val
  130. }
  131. gpuProcessMetrics = append(gpuProcessMetrics, processMetrics)
  132. }
  133. return gpuProcessMetrics
  134. }
  135. func (s *SGuestMonitorCollector) collectGpuPodsProcesses() map[string]map[string]struct{} {
  136. podProcIds := map[string]map[string]struct{}{}
  137. guestmanager := guestman.GetGuestManager()
  138. cgroupRoot := path.Join(cgrouputils.GetSubModulePath("cpuset"), guestman.PodCgroupParent())
  139. guestmanager.Servers.Range(func(k, v interface{}) bool {
  140. pod, ok := v.(guestman.PodInstance)
  141. if !ok {
  142. return true
  143. }
  144. if !pod.IsRunning() {
  145. return true
  146. }
  147. podDesc := pod.GetDesc()
  148. hasGpu := false
  149. for i := range podDesc.IsolatedDevices {
  150. if utils.IsInStringArray(podDesc.IsolatedDevices[i].DevType, compute.CONTAINER_GPU_TYPES) {
  151. hasGpu = true
  152. break
  153. }
  154. }
  155. if !hasGpu {
  156. return true
  157. }
  158. criIds := pod.GetPodContainerCriIds()
  159. procs := map[string]struct{}{}
  160. for i := range criIds {
  161. cgroupPath := path.Join(cgroupRoot, criIds[i], "cgroup.procs")
  162. pids, err := ReadProccessFromCgroupProcs(cgroupPath)
  163. if err != nil {
  164. log.Errorf("collectGpuPodsProcesses: %s", err)
  165. continue
  166. }
  167. for _, pid := range pids {
  168. procs[pid] = struct{}{}
  169. }
  170. }
  171. if len(procs) > 0 {
  172. podProcIds[pod.GetId()] = procs
  173. }
  174. return true
  175. })
  176. return podProcIds
  177. }
  178. func ReadProccessFromCgroupProcs(procFilePath string) ([]string, error) {
  179. out, err := os.ReadFile(procFilePath)
  180. if err != nil {
  181. return nil, errors.Wrap(err, "os.ReadFile")
  182. }
  183. pids := strings.Split(string(out), "\n")
  184. return pids, nil
  185. }