nvidia_mps.go 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. // Copyright 2019 Yunion
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package container_device
  15. import (
  16. "fmt"
  17. "strconv"
  18. "strings"
  19. runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
  20. "yunion.io/x/log"
  21. "yunion.io/x/pkg/errors"
  22. hostapi "yunion.io/x/onecloud/pkg/apis/host"
  23. "yunion.io/x/onecloud/pkg/hostman/isolated_device"
  24. "yunion.io/x/onecloud/pkg/hostman/options"
  25. "yunion.io/x/onecloud/pkg/util/procutils"
  26. )
  // shmPath is the shared-memory directory that is mounted at the same path
  // inside containers using MPS devices.
  // The MPS /dev/shm is needed to allow MPS daemon health-checking.
  var shmPath = "/dev/shm"
  29. func init() {
  30. isolated_device.RegisterContainerDeviceManager(newNvidiaMPSManager())
  31. }
  // nvidiaMPSManager is the container device manager for NVIDIA MPS devices.
  // It has no fields.
  type nvidiaMPSManager struct{}

  // newNvidiaMPSManager allocates and returns an nvidiaMPSManager.
  func newNvidiaMPSManager() *nvidiaMPSManager {
  	return new(nvidiaMPSManager)
  }
  // GetType reports the container device type served by this manager, which is
  // always isolated_device.ContainerDeviceTypeNvidiaMps.
  func (m *nvidiaMPSManager) GetType() isolated_device.ContainerDeviceType {
  	return isolated_device.ContainerDeviceTypeNvidiaMps
  }
  // ProbeDevices returns the devices discovered by getNvidiaMPSGpus, together
  // with any error it reports.
  func (m *nvidiaMPSManager) ProbeDevices() ([]isolated_device.IDevice, error) {
  	return getNvidiaMPSGpus()
  }
  // NewDevices is a no-op for this manager: it ignores dev and always returns
  // nil, nil.
  // NOTE(review): presumably MPS devices are only created through
  // ProbeDevices — confirm against the interface contract.
  func (m *nvidiaMPSManager) NewDevices(dev *isolated_device.ContainerDevice) ([]isolated_device.IDevice, error) {
  	return nil, nil
  }
  // NewContainerDevices is a no-op for this manager: it ignores both arguments
  // and always returns nil, nil, nil.
  // NOTE(review): presumably containers get MPS access only through the
  // environment and mounts from GetContainerExtraConfigures — confirm.
  func (m *nvidiaMPSManager) NewContainerDevices(input *hostapi.ContainerCreateInput, dev *hostapi.ContainerDevice) ([]*runtimeapi.Device, []*runtimeapi.Device, error) {
  	return nil, nil, nil
  }
  // getMPSPipeDirectory returns the CudaMPSPipeDirectory value from the host
  // options.
  func (m *nvidiaMPSManager) getMPSPipeDirectory() string {
  	return options.HostOptions.CudaMPSPipeDirectory
  }
  // getSHMPath returns the package-level shmPath.
  func (m *nvidiaMPSManager) getSHMPath() string {
  	return shmPath
  }
  54. func (m *nvidiaMPSManager) GetContainerExtraConfigures(devs []*hostapi.ContainerDevice) ([]*runtimeapi.KeyValue, []*runtimeapi.Mount) {
  55. gpuIds := []string{}
  56. for _, dev := range devs {
  57. if dev.IsolatedDevice == nil {
  58. continue
  59. }
  60. if isolated_device.ContainerDeviceType(dev.IsolatedDevice.DeviceType) != isolated_device.ContainerDeviceTypeNvidiaMps {
  61. continue
  62. }
  63. gpuIds = append(gpuIds, dev.IsolatedDevice.Path)
  64. }
  65. if len(gpuIds) == 0 {
  66. return nil, nil
  67. }
  68. return []*runtimeapi.KeyValue{
  69. {
  70. Key: "CUDA_MPS_PIPE_DIRECTORY",
  71. Value: m.getMPSPipeDirectory(),
  72. },
  73. {
  74. Key: "NVIDIA_VISIBLE_DEVICES",
  75. Value: strings.Join(gpuIds, ","),
  76. },
  77. {
  78. Key: "NVIDIA_DRIVER_CAPABILITIES",
  79. Value: "all",
  80. },
  81. }, []*runtimeapi.Mount{
  82. {
  83. ContainerPath: m.getSHMPath(),
  84. HostPath: m.getSHMPath(),
  85. },
  86. {
  87. ContainerPath: m.getMPSPipeDirectory(),
  88. HostPath: m.getMPSPipeDirectory(),
  89. },
  90. }
  91. }
  // nvidiaMPS is one MPS device carved out of a physical NVIDIA GPU.
  // getNvidiaMPSGpus creates CudaMPSReplicas of these per usable GPU.
  type nvidiaMPS struct {
  	*BaseDevice
  	// MemSizeMB is the memory assigned to this device: the GPU's total memory
  	// divided by CudaMPSReplicas (integer division).
  	MemSizeMB int
  	// MemTotalMB is the GPU's total memory as parsed from the nvidia-smi
  	// "<n> MiB" value.
  	MemTotalMB int
  	// ThreadPercentage is 100 divided by CudaMPSReplicas (integer division).
  	ThreadPercentage int
  	// gpuIndex is the "index" column reported by nvidia-smi for the GPU.
  	gpuIndex string
  }
  // GetNvidiaDevMemSize returns the device's MemSizeMB.
  func (dev *nvidiaMPS) GetNvidiaDevMemSize() int {
  	return dev.MemSizeMB
  }
  // GetNvidiaDevIndex returns the GPU index string stored when the device was
  // probed.
  func (dev *nvidiaMPS) GetNvidiaDevIndex() string {
  	return dev.gpuIndex
  }
  105. func (c *nvidiaMPS) GetNvidiaMpsMemoryLimit() int {
  106. return c.MemSizeMB
  107. }
  108. func (c *nvidiaMPS) GetNvidiaMpsMemoryTotal() int {
  109. return c.MemTotalMB
  110. }
  111. func (c *nvidiaMPS) GetNvidiaMpsThreadPercentage() int {
  112. return c.ThreadPercentage
  113. }
  // parseMemSize converts a memory size string in the "<n> MiB" form, such as
  // "23040 MiB", into the integer n.
  //
  // Leading and trailing whitespace around the whole string is ignored. On
  // failure it returns -1 and an error naming the offending input: either the
  // " MiB" suffix is missing or the numeric part is not a valid integer.
  func parseMemSize(memTotalStr string) (int, error) {
  	const suffix = " MiB"

  	// Trim first so a trailing space or carriage return does not hide the suffix.
  	trimmed := strings.TrimSpace(memTotalStr)
  	if !strings.HasSuffix(trimmed, suffix) {
  		return -1, fmt.Errorf("unknown mem string suffix: %q", memTotalStr)
  	}

  	numStr := strings.TrimSpace(strings.TrimSuffix(trimmed, suffix))
  	size, err := strconv.Atoi(numStr)
  	if err != nil {
  		return -1, fmt.Errorf("parse mem size %q: %w", memTotalStr, err)
  	}
  	return size, nil
  }
  121. func getNvidiaMPSGpus() ([]isolated_device.IDevice, error) {
  122. devs := make([]isolated_device.IDevice, 0)
  123. // nvidia-smi --query-gpu=gpu_uuid,gpu_name,gpu_bus_id,memory.total,compute_mode --format=csv
  124. // GPU-76aef7ff-372d-2432-b4b4-beca4d8d3400, Tesla P40, 00000000:00:08.0, 23040 MiB, Exclusive_Process
  125. out, err := procutils.NewRemoteCommandAsFarAsPossible("nvidia-smi", "--query-gpu=gpu_uuid,gpu_name,gpu_bus_id,memory.total,compute_mode,index", "--format=csv").Output()
  126. if err != nil {
  127. return nil, errors.Wrap(err, "nvidia-smi")
  128. }
  129. lines := strings.Split(string(out), "\n")
  130. for _, line := range lines {
  131. if strings.HasPrefix(line, "uuid") {
  132. continue
  133. }
  134. segs := strings.Split(line, ",")
  135. if len(segs) != 6 {
  136. log.Errorf("unknown nvidia-smi out line %s", line)
  137. continue
  138. }
  139. gpuId, gpuName, gpuPciAddr, memTotal, computeMode, index := strings.TrimSpace(segs[0]), strings.TrimSpace(segs[1]), strings.TrimSpace(segs[2]), strings.TrimSpace(segs[3]), strings.TrimSpace(segs[4]), strings.TrimSpace(segs[5])
  140. if computeMode != "Exclusive_Process" {
  141. log.Warningf("gpu device %s compute mode %s, skip.", gpuId, computeMode)
  142. continue
  143. }
  144. memSize, err := parseMemSize(memTotal)
  145. if err != nil {
  146. return nil, errors.Wrapf(err, "failed parse memSize %s", memTotal)
  147. }
  148. pciOutput, err := isolated_device.GetPCIStrByAddr(gpuPciAddr)
  149. if err != nil {
  150. return nil, errors.Wrapf(err, "GetPCIStrByAddr %s", gpuPciAddr)
  151. }
  152. for i := 0; i < options.HostOptions.CudaMPSReplicas; i++ {
  153. dev := isolated_device.NewPCIDevice2(pciOutput[0])
  154. gpuDev := &nvidiaMPS{
  155. BaseDevice: NewBaseDevice(dev, isolated_device.ContainerDeviceTypeNvidiaMps, gpuId),
  156. MemSizeMB: memSize / options.HostOptions.CudaMPSReplicas,
  157. MemTotalMB: memSize,
  158. ThreadPercentage: 100 / options.HostOptions.CudaMPSReplicas,
  159. gpuIndex: index,
  160. }
  161. gpuDev.SetModelName(gpuName)
  162. devAddr := gpuDev.GetAddr()
  163. gpuDev.SetAddr(fmt.Sprintf("%s-%d", devAddr, i), devAddr)
  164. devs = append(devs, gpuDev)
  165. }
  166. }
  167. if len(devs) == 0 {
  168. return nil, nil
  169. }
  170. return devs, nil
  171. }