nvidia.go 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. // Copyright 2017 Google Inc. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package accelerators
  15. import (
  16. "bufio"
  17. "fmt"
  18. "io/ioutil"
  19. "os"
  20. "path/filepath"
  21. "strconv"
  22. "strings"
  23. "sync"
  24. "time"
  25. "github.com/google/cadvisor/container"
  26. info "github.com/google/cadvisor/info/v1"
  27. "github.com/google/cadvisor/stats"
  28. "github.com/mindprince/gonvml"
  29. "k8s.io/klog/v2"
  30. )
// nvidiaManager lazily initializes NVML and hands out per-container
// collectors for NVIDIA GPU metrics. The embedded mutex guards the
// lazy initialization performed in GetCollector.
type nvidiaManager struct {
	sync.Mutex
	// true if there are NVIDIA devices present on the node
	devicesPresent bool
	// true if the NVML library (libnvidia-ml.so.1) was loaded successfully
	nvmlInitialized bool
	// nvidiaDevices is a map from device minor number to a handle that can be used to get metrics about the device
	nvidiaDevices map[int]gonvml.Device
}
// sysFsPCIDevicesPath is where PCI devices are enumerated in sysfs.
// A variable (not a const) so tests can point it at a fixture directory.
var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"

// nvidiaVendorID is the PCI vendor ID assigned to NVIDIA Corporation.
const nvidiaVendorID = "0x10de"
  42. func NewNvidiaManager(includedMetrics container.MetricSet) stats.Manager {
  43. if !includedMetrics.Has(container.AcceleratorUsageMetrics) {
  44. klog.V(2).Info("NVIDIA GPU metrics disabled")
  45. return &stats.NoopManager{}
  46. }
  47. manager := &nvidiaManager{}
  48. err := manager.setup()
  49. if err != nil {
  50. klog.V(2).Infof("NVIDIA setup failed: %s", err)
  51. }
  52. return manager
  53. }
  54. // setup initializes NVML if NVIDIA devices are present on the node.
  55. func (nm *nvidiaManager) setup() error {
  56. if !detectDevices(nvidiaVendorID) {
  57. return fmt.Errorf("no NVIDIA devices found")
  58. }
  59. nm.devicesPresent = true
  60. return initializeNVML(nm)
  61. }
  62. // detectDevices returns true if a device with given pci id is present on the node.
  63. func detectDevices(vendorID string) bool {
  64. devices, err := ioutil.ReadDir(sysFsPCIDevicesPath)
  65. if err != nil {
  66. klog.Warningf("Error reading %q: %v", sysFsPCIDevicesPath, err)
  67. return false
  68. }
  69. for _, device := range devices {
  70. vendorPath := filepath.Join(sysFsPCIDevicesPath, device.Name(), "vendor")
  71. content, err := ioutil.ReadFile(vendorPath)
  72. if err != nil {
  73. klog.V(4).Infof("Error while reading %q: %v", vendorPath, err)
  74. continue
  75. }
  76. if strings.EqualFold(strings.TrimSpace(string(content)), vendorID) {
  77. klog.V(3).Infof("Found device with vendorID %q", vendorID)
  78. return true
  79. }
  80. }
  81. return false
  82. }
  83. // initializeNVML initializes the NVML library and sets up the nvmlDevices map.
  84. // This is defined as a variable to help in testing.
  85. var initializeNVML = func(nm *nvidiaManager) error {
  86. if err := gonvml.Initialize(); err != nil {
  87. // This is under a logging level because otherwise we may cause
  88. // log spam if the drivers/nvml is not installed on the system.
  89. return fmt.Errorf("Could not initialize NVML: %v", err)
  90. }
  91. nm.nvmlInitialized = true
  92. numDevices, err := gonvml.DeviceCount()
  93. if err != nil {
  94. return fmt.Errorf("GPU metrics would not be available. Failed to get the number of NVIDIA devices: %v", err)
  95. }
  96. if numDevices == 0 {
  97. return nil
  98. }
  99. klog.V(1).Infof("NVML initialized. Number of NVIDIA devices: %v", numDevices)
  100. nm.nvidiaDevices = make(map[int]gonvml.Device, numDevices)
  101. for i := 0; i < int(numDevices); i++ {
  102. device, err := gonvml.DeviceHandleByIndex(uint(i))
  103. if err != nil {
  104. return fmt.Errorf("Failed to get NVIDIA device handle %d: %v", i, err)
  105. }
  106. minorNumber, err := device.MinorNumber()
  107. if err != nil {
  108. return fmt.Errorf("Failed to get NVIDIA device minor number: %v", err)
  109. }
  110. nm.nvidiaDevices[int(minorNumber)] = device
  111. }
  112. return nil
  113. }
  114. // Destroy shuts down NVML.
  115. func (nm *nvidiaManager) Destroy() {
  116. if nm.nvmlInitialized {
  117. err := gonvml.Shutdown()
  118. if err != nil {
  119. klog.Warningf("nvml library shutdown failed: %s", err)
  120. }
  121. }
  122. }
  123. // GetCollector returns a collector that can fetch NVIDIA gpu metrics for NVIDIA devices
  124. // present in the devices.list file in the given devicesCgroupPath.
  125. func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector, error) {
  126. nc := &nvidiaCollector{}
  127. if !nm.devicesPresent {
  128. return &stats.NoopCollector{}, nil
  129. }
  130. // Makes sure that we don't call initializeNVML() concurrently and
  131. // that we only call initializeNVML() when it's not initialized.
  132. nm.Lock()
  133. if !nm.nvmlInitialized {
  134. err := initializeNVML(nm)
  135. if err != nil {
  136. nm.Unlock()
  137. return &stats.NoopCollector{}, err
  138. }
  139. }
  140. nm.Unlock()
  141. if len(nm.nvidiaDevices) == 0 {
  142. return &stats.NoopCollector{}, nil
  143. }
  144. nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
  145. if err != nil {
  146. return &stats.NoopCollector{}, err
  147. }
  148. for _, minor := range nvidiaMinorNumbers {
  149. device, ok := nm.nvidiaDevices[minor]
  150. if !ok {
  151. return &stats.NoopCollector{}, fmt.Errorf("NVIDIA device minor number %d not found in cached devices", minor)
  152. }
  153. nc.devices = append(nc.devices, device)
  154. }
  155. return nc, nil
  156. }
  157. // parseDevicesCgroup parses the devices cgroup devices.list file for the container
  158. // and returns a list of minor numbers corresponding to NVIDIA GPU devices that the
  159. // container is allowed to access. In cases where the container has access to all
  160. // devices or all NVIDIA devices but the devices are not enumerated separately in
  161. // the devices.list file, we return an empty list.
  162. // This is defined as a variable to help in testing.
  163. var parseDevicesCgroup = func(devicesCgroupPath string) ([]int, error) {
  164. // Always return a non-nil slice
  165. nvidiaMinorNumbers := []int{}
  166. devicesList := filepath.Join(devicesCgroupPath, "devices.list")
  167. f, err := os.Open(devicesList)
  168. if err != nil {
  169. return nvidiaMinorNumbers, fmt.Errorf("error while opening devices cgroup file %q: %v", devicesList, err)
  170. }
  171. defer f.Close()
  172. s := bufio.NewScanner(f)
  173. // See https://www.kernel.org/doc/Documentation/cgroup-v1/devices.txt for the file format
  174. for s.Scan() {
  175. text := s.Text()
  176. fields := strings.Fields(text)
  177. if len(fields) != 3 {
  178. return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: must contain three whitespace-separated fields", text)
  179. }
  180. // Split the second field to find out major:minor numbers
  181. majorMinor := strings.Split(fields[1], ":")
  182. if len(majorMinor) != 2 {
  183. return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: second field should have one colon", text)
  184. }
  185. // NVIDIA graphics devices are character devices with major number 195.
  186. // https://github.com/torvalds/linux/blob/v4.13/Documentation/admin-guide/devices.txt#L2583
  187. if fields[0] == "c" && majorMinor[0] == "195" {
  188. minorNumber, err := strconv.Atoi(majorMinor[1])
  189. if err != nil {
  190. return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: minor number is not integer", text)
  191. }
  192. // We don't want devices like nvidiactl (195:255) and nvidia-modeset (195:254)
  193. if minorNumber < 128 {
  194. nvidiaMinorNumbers = append(nvidiaMinorNumbers, minorNumber)
  195. }
  196. // We are ignoring the "195:*" case
  197. // where the container has access to all NVIDIA devices on the machine.
  198. }
  199. // We are ignoring the "*:*" case
  200. // where the container has access to all devices on the machine.
  201. }
  202. return nvidiaMinorNumbers, nil
  203. }
// nvidiaCollector fetches GPU stats for a fixed set of NVIDIA device handles
// assigned to a single container.
type nvidiaCollector struct {
	// Exposed for testing
	devices []gonvml.Device
	stats.NoopDestroy
}
  209. func NewNvidiaCollector(devices []gonvml.Device) stats.Collector {
  210. return &nvidiaCollector{devices: devices}
  211. }
  212. // UpdateStats updates the stats for NVIDIA GPUs (if any) attached to the container.
  213. func (nc *nvidiaCollector) UpdateStats(stats *info.ContainerStats) error {
  214. for _, device := range nc.devices {
  215. model, err := device.Name()
  216. if err != nil {
  217. return fmt.Errorf("error while getting gpu name: %v", err)
  218. }
  219. uuid, err := device.UUID()
  220. if err != nil {
  221. return fmt.Errorf("error while getting gpu uuid: %v", err)
  222. }
  223. memoryTotal, memoryUsed, err := device.MemoryInfo()
  224. if err != nil {
  225. return fmt.Errorf("error while getting gpu memory info: %v", err)
  226. }
  227. //TODO: Use housekeepingInterval
  228. utilizationGPU, err := device.AverageGPUUtilization(10 * time.Second)
  229. if err != nil {
  230. return fmt.Errorf("error while getting gpu utilization: %v", err)
  231. }
  232. stats.Accelerators = append(stats.Accelerators, info.AcceleratorStats{
  233. Make: "nvidia",
  234. Model: model,
  235. ID: uuid,
  236. MemoryTotal: memoryTotal,
  237. MemoryUsed: memoryUsed,
  238. DutyCycle: uint64(utilizationGPU),
  239. })
  240. }
  241. return nil
  242. }