nvidia_gpu.go 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
  1. // Copyright 2019 Yunion
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package container_device
  15. import (
  16. "fmt"
  17. "strconv"
  18. "strings"
  19. runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
  20. "yunion.io/x/log"
  21. "yunion.io/x/pkg/errors"
  22. "yunion.io/x/pkg/util/sets"
  23. hostapi "yunion.io/x/onecloud/pkg/apis/host"
  24. "yunion.io/x/onecloud/pkg/hostman/isolated_device"
  25. "yunion.io/x/onecloud/pkg/util/procutils"
  26. )
  27. func init() {
  28. isolated_device.RegisterContainerDeviceManager(newNvidiaGPUManager())
  29. }
  // nvidiaGPUManager is the container device manager for NVIDIA GPUs.
  // It carries no state.
  type nvidiaGPUManager struct{}

  // newNvidiaGPUManager returns a fresh, ready-to-use nvidiaGPUManager.
  func newNvidiaGPUManager() *nvidiaGPUManager {
  	return new(nvidiaGPUManager)
  }
  34. func (m *nvidiaGPUManager) GetType() isolated_device.ContainerDeviceType {
  35. return isolated_device.ContainerDeviceTypeNvidiaGpu
  36. }
  37. func (m *nvidiaGPUManager) ProbeDevices() ([]isolated_device.IDevice, error) {
  38. return probeNvidiaGpus()
  39. }
  40. func (m *nvidiaGPUManager) NewDevices(dev *isolated_device.ContainerDevice) ([]isolated_device.IDevice, error) {
  41. return nil, nil
  42. }
  43. func (m *nvidiaGPUManager) NewContainerDevices(input *hostapi.ContainerCreateInput, dev *hostapi.ContainerDevice) ([]*runtimeapi.Device, []*runtimeapi.Device, error) {
  44. return nil, nil, nil
  45. }
  46. func (m *nvidiaGPUManager) GetContainerExtraConfigures(devs []*hostapi.ContainerDevice) ([]*runtimeapi.KeyValue, []*runtimeapi.Mount) {
  47. gpuIds := []string{}
  48. for _, dev := range devs {
  49. if dev.IsolatedDevice == nil {
  50. continue
  51. }
  52. types := sets.NewString(
  53. string(isolated_device.ContainerDeviceTypeNvidiaGpu),
  54. string(isolated_device.ContainerDeviceTypeNvidiaGpuShare),
  55. )
  56. if !types.Has(dev.IsolatedDevice.DeviceType) {
  57. continue
  58. }
  59. gpuIds = append(gpuIds, dev.IsolatedDevice.Path)
  60. }
  61. if len(gpuIds) == 0 {
  62. return nil, nil
  63. }
  64. retEnvs := []*runtimeapi.KeyValue{}
  65. if len(gpuIds) > 0 {
  66. retEnvs = append(retEnvs, []*runtimeapi.KeyValue{
  67. {
  68. Key: "NVIDIA_VISIBLE_DEVICES",
  69. Value: strings.Join(gpuIds, ","),
  70. },
  71. {
  72. Key: "NVIDIA_DRIVER_CAPABILITIES",
  73. Value: "all",
  74. },
  75. }...)
  76. }
  77. return retEnvs, nil
  78. }
  79. type nvidiaGPU struct {
  80. *BaseDevice
  81. memSize int
  82. gpuIndex int
  83. deviceMinor int
  84. }
  85. func (dev *nvidiaGPU) GetNvidiaDevMemSize() int {
  86. return dev.memSize
  87. }
  88. func (dev *nvidiaGPU) GetNvidiaDevIndex() string {
  89. return fmt.Sprintf("%d", dev.gpuIndex)
  90. }
  91. func (dev *nvidiaGPU) GetIndex() int {
  92. return dev.gpuIndex
  93. }
  94. func (dev *nvidiaGPU) GetDeviceMinor() int {
  95. return dev.deviceMinor
  96. }
  97. func probeNvidiaGpus() ([]isolated_device.IDevice, error) {
  98. if nvidiaGpuUsages != nil {
  99. res := make([]isolated_device.IDevice, 0)
  100. for pciAddr, dev := range nvidiaGpuUsages {
  101. if dev.Used {
  102. continue
  103. }
  104. res = append(res, nvidiaGpuUsages[pciAddr].nvidiaGPU)
  105. }
  106. nvidiaGpuUsages = nil
  107. return res, nil
  108. }
  109. devs, err := getNvidiaGPUs()
  110. if err != nil {
  111. return nil, err
  112. }
  113. res := make([]isolated_device.IDevice, 0)
  114. for i := range devs {
  115. res = append(res, devs[i])
  116. }
  117. return res, nil
  118. }
  119. func getNvidiaGPUs() ([]*nvidiaGPU, error) {
  120. devs := make([]*nvidiaGPU, 0)
  121. // nvidia-smi --query-gpu=gpu_uuid,gpu_name,gpu_bus_id --format=csv
  122. // uuid, name, pci.bus_id
  123. // GPU-bc1a3bb9-55cb-8c52-c374-4f8b4f388a20, NVIDIA A800-SXM4-80GB, 00000000:10:00.0
  124. // nvidia-smi --query-gpu=gpu_uuid,gpu_name,gpu_bus_id,memory.total,compute_mode --format=csv
  125. out, err := procutils.NewRemoteCommandAsFarAsPossible("nvidia-smi", "--query-gpu=gpu_uuid,gpu_name,gpu_bus_id,compute_mode,memory.total,index", "--format=csv").Output()
  126. if err != nil {
  127. return nil, errors.Wrap(err, "nvidia-smi")
  128. }
  129. lines := strings.Split(string(out), "\n")
  130. for _, line := range lines {
  131. if strings.HasPrefix(line, "uuid") {
  132. continue
  133. }
  134. segs := strings.Split(line, ",")
  135. if len(segs) != 6 {
  136. log.Errorf("unknown nvidia-smi out line %s", line)
  137. continue
  138. }
  139. gpuId, gpuName, gpuPciAddr, computeMode, memTotal, index := strings.TrimSpace(segs[0]), strings.TrimSpace(segs[1]), strings.TrimSpace(segs[2]), strings.TrimSpace(segs[3]), strings.TrimSpace(segs[4]), strings.TrimSpace(segs[5])
  140. if computeMode != "Default" {
  141. log.Warningf("gpu device %s compute mode %s, skip.", gpuId, computeMode)
  142. continue
  143. }
  144. indexInt, err := parseInt(index)
  145. if err != nil {
  146. return nil, errors.Wrapf(err, "failed parse index %s", index)
  147. }
  148. memSize, err := parseMemSize(memTotal)
  149. if err != nil {
  150. return nil, errors.Wrapf(err, "failed parse memSize %s", memTotal)
  151. }
  152. pciOutput, err := isolated_device.GetPCIStrByAddr(gpuPciAddr)
  153. if err != nil {
  154. return nil, errors.Wrapf(err, "GetPCIStrByAddr %s", gpuPciAddr)
  155. }
  156. dev := isolated_device.NewPCIDevice2(pciOutput[0])
  157. driverInfoPath := fmt.Sprintf("/proc/driver/nvidia/gpus/0000:%s/information", dev.Addr)
  158. driverContent, err := procutils.NewRemoteCommandAsFarAsPossible("cat", driverInfoPath).Output()
  159. if err != nil {
  160. return nil, errors.Wrapf(err, "failed get driver content from: %s", driverInfoPath)
  161. }
  162. driverInfo, err := parseNvidiaGPUDriverInformation(string(driverContent))
  163. if err != nil {
  164. return nil, errors.Wrapf(err, "failed parse driver content from: %s", driverInfoPath)
  165. }
  166. gpuDev := &nvidiaGPU{
  167. BaseDevice: NewBaseDevice(dev, isolated_device.ContainerDeviceTypeNvidiaGpu, gpuId),
  168. memSize: memSize,
  169. gpuIndex: indexInt,
  170. deviceMinor: driverInfo.DeviceMinor,
  171. }
  172. gpuDev.SetModelName(gpuName)
  173. devs = append(devs, gpuDev)
  174. }
  175. if len(devs) == 0 {
  176. return nil, nil
  177. }
  178. return devs, nil
  179. }
  // NvidiaGPUDriverInformation holds the entries of a
  // /proc/driver/nvidia/gpus/<pci address>/information file as parsed by
  // parseNvidiaGPUDriverInformation. Each field corresponds to one "key: value"
  // line of that file.
  type NvidiaGPUDriverInformation struct {
  	// Model is the "Model" entry.
  	Model string
  	// IRQ is the "IRQ" entry.
  	IRQ int
  	// UUID, VideoBIOS, BusType, DMASize, DMAMask and BusLocation are the
  	// "GPU UUID", "Video BIOS", "Bus Type", "DMA Size", "DMA Mask" and
  	// "Bus Location" entries, stored verbatim.
  	UUID, VideoBIOS, BusType, DMASize, DMAMask, BusLocation string
  	// DeviceMinor is the "Device Minor" entry.
  	DeviceMinor int
  	// Firmware is the "GPU Firmware" entry.
  	Firmware string
  	// Excluded is true when the "GPU Excluded" entry is anything but "No".
  	Excluded bool
  }
  193. // parseNvidiaGPUDriverInformation 解析下面文件的内容
  194. // cat /proc/driver/nvidia/gpus/0000\:61\:00.0/information
  195. // Model: NVIDIA GeForce RTX 4060
  196. // IRQ: 483
  197. // GPU UUID: GPU-2e1ab7a2-fda6-8b93-eba2-fa59e6135199
  198. // Video BIOS: 95.07.36.00.04
  199. // Bus Type: PCIe
  200. // DMA Size: 47 bits
  201. // DMA Mask: 0x7fffffffffff
  202. // Bus Location: 0000:61:00.0
  203. // Device Minor: 0
  204. // GPU Firmware: 570.133.07
  205. // GPU Excluded: No
  206. func parseNvidiaGPUDriverInformation(content string) (*NvidiaGPUDriverInformation, error) {
  207. info := &NvidiaGPUDriverInformation{}
  208. lines := strings.Split(content, "\n")
  209. for _, line := range lines {
  210. line = strings.TrimSpace(line)
  211. if line == "" {
  212. continue
  213. }
  214. // 解析 key: value 格式
  215. parts := strings.SplitN(line, ":", 2)
  216. if len(parts) != 2 {
  217. continue
  218. }
  219. key := strings.TrimSpace(parts[0])
  220. value := strings.TrimSpace(parts[1])
  221. switch key {
  222. case "Model":
  223. info.Model = value
  224. case "IRQ":
  225. if irq, err := parseInt(value); err != nil {
  226. return nil, errors.Wrapf(err, "failed parse IRQ %s", value)
  227. } else {
  228. info.IRQ = irq
  229. }
  230. case "GPU UUID":
  231. info.UUID = value
  232. case "Video BIOS":
  233. info.VideoBIOS = value
  234. case "Bus Type":
  235. info.BusType = value
  236. case "DMA Size":
  237. info.DMASize = value
  238. case "DMA Mask":
  239. info.DMAMask = value
  240. case "Bus Location":
  241. info.BusLocation = value
  242. case "Device Minor":
  243. if minor, err := parseInt(value); err != nil {
  244. return nil, errors.Wrapf(err, "failed parse Device Minor %s", value)
  245. } else {
  246. info.DeviceMinor = minor
  247. }
  248. case "GPU Firmware":
  249. info.Firmware = value
  250. case "GPU Excluded":
  251. info.Excluded = (value != "No")
  252. }
  253. }
  254. return info, nil
  255. }
  // parseInt converts s to an int after trimming surrounding whitespace.
  // It returns the strconv.Atoi error when the trimmed text is not a valid
  // base-10 integer.
  func parseInt(s string) (int, error) {
  	return strconv.Atoi(strings.TrimSpace(s))
  }