// Copyright 2019 Yunion
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
  14. package container_device
  15. import (
  16. "fmt"
  17. "os"
  18. "path"
  19. "strconv"
  20. "strings"
  21. runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
  22. "yunion.io/x/pkg/errors"
  23. hostapi "yunion.io/x/onecloud/pkg/apis/host"
  24. "yunion.io/x/onecloud/pkg/hostman/isolated_device"
  25. fileutils "yunion.io/x/onecloud/pkg/util/fileutils2"
  26. )
  27. func init() {
  28. isolated_device.RegisterContainerDeviceManager(newVastaitechGPUManager())
  29. }
  30. type vastaitechGPUManager struct{}
  31. func newVastaitechGPUManager() isolated_device.IContainerDeviceManager {
  32. return &vastaitechGPUManager{}
  33. }
  34. func (v vastaitechGPUManager) GetType() isolated_device.ContainerDeviceType {
  35. return isolated_device.ContainerDeviceTypeVastaitechGpu
  36. }
  37. const (
  38. VASTAITECH_VA_CTL = "va_ctl"
  39. VASTAITECH_VA_VIDEO = "va_video"
  40. VASTAITECH_VACC = "vacc"
  41. )
  42. var vastaitechRelatedDevices = map[string]string{
  43. VASTAITECH_VA_CTL: "/dev/va%d_ctl",
  44. VASTAITECH_VA_VIDEO: "/dev/va_video%d",
  45. VASTAITECH_VACC: "/dev/vacc%d",
  46. }
  47. func (v vastaitechGPUManager) getRelatedDevices(index int) map[string]string {
  48. devs := make(map[string]string)
  49. for key, devFmt := range vastaitechRelatedDevices {
  50. devs[key] = fmt.Sprintf(devFmt, index)
  51. }
  52. return devs
  53. }
  54. func (v vastaitechGPUManager) getDriRenderPrefix() string {
  55. return "/dev/dri/renderD"
  56. }
  57. // getVastaitechDriStartIndexFromByPath 扫描 /dev/dri/by-path/,找到名称含 va_card 的 -render 链接对应的最小 renderD 编号
  58. func (v vastaitechGPUManager) getVastaitechDriStartIndexFromByPath() (int, error) {
  59. const byPathDir = "/dev/dri/by-path"
  60. entries, err := os.ReadDir(byPathDir)
  61. if err != nil {
  62. return 0, errors.Wrapf(err, "read %s", byPathDir)
  63. }
  64. const renderDSuffix = "renderD"
  65. var minIdx *int
  66. for _, entry := range entries {
  67. entryName := entry.Name()
  68. if !strings.Contains(entryName, "va_card") || !strings.HasSuffix(entryName, "-render") {
  69. continue
  70. }
  71. fp := path.Join(byPathDir, entryName)
  72. linkPath, err := os.Readlink(fp)
  73. if err != nil {
  74. return 0, errors.Wrapf(err, "readlink %s", fp)
  75. }
  76. // linkPath is e.g. "../renderD129"
  77. base := path.Base(linkPath)
  78. if !strings.HasPrefix(base, renderDSuffix) {
  79. continue
  80. }
  81. idxStr := strings.TrimPrefix(base, renderDSuffix)
  82. driIdx, err := strconv.Atoi(idxStr)
  83. if err != nil {
  84. return 0, errors.Wrapf(err, "parse render index from %s", linkPath)
  85. }
  86. if minIdx == nil || driIdx < *minIdx {
  87. minIdx = &driIdx
  88. }
  89. }
  90. if minIdx == nil {
  91. return 0, errors.Errorf("no va_card render device found in %s", byPathDir)
  92. }
  93. return *minIdx, nil
  94. }
  95. func (v vastaitechGPUManager) getDriStartIndex() (int, error) {
  96. return v.getVastaitechDriStartIndexFromByPath()
  97. }
  98. func (v vastaitechGPUManager) getRelatedDeviceStartIndex(driPath string) (int, error) {
  99. prefix := v.getDriRenderPrefix()
  100. if !strings.HasPrefix(driPath, prefix) {
  101. return -1, errors.Errorf("device path %q doesn't start with /dev/dri/renderD", driPath)
  102. }
  103. idxStr := strings.ReplaceAll(driPath, prefix, "")
  104. driIdx, err := strconv.Atoi(idxStr)
  105. if err != nil {
  106. return -1, errors.Wrapf(err, "convert %s to int", idxStr)
  107. }
  108. startIndex, err := v.getDriStartIndex()
  109. if err != nil {
  110. return -1, err
  111. }
  112. idx := driIdx - startIndex
  113. if idx < 0 {
  114. return -1, errors.Errorf("%s index is less than %d", driPath, startIndex)
  115. }
  116. return idx, nil
  117. }
  118. func (v vastaitechGPUManager) NewDevices(dev *isolated_device.ContainerDevice) ([]isolated_device.IDevice, error) {
  119. idx, err := v.getRelatedDeviceStartIndex(dev.Path)
  120. if err != nil {
  121. return nil, errors.Wrap(err, "get related device start index")
  122. }
  123. // check related devices
  124. for _, devPath := range v.getRelatedDevices(idx) {
  125. if !fileutils.Exists(devPath) {
  126. return nil, errors.Wrapf(errors.ErrNotFound, "related device %s not found of %s", devPath, dev.Path)
  127. }
  128. }
  129. if err := CheckVirtualNumber(dev); err != nil {
  130. return nil, err
  131. }
  132. gpuDevs := make([]isolated_device.IDevice, 0)
  133. for i := 0; i < dev.VirtualNumber; i++ {
  134. gpuDev, err := newVastaitechGPU(dev.Path, i)
  135. if err != nil {
  136. return nil, errors.Wrapf(err, "new CPH AMD GPU with index %d", i)
  137. }
  138. gpuDevs = append(gpuDevs, gpuDev)
  139. }
  140. return gpuDevs, nil
  141. }
  142. func (v vastaitechGPUManager) getCommonDevices() []*runtimeapi.Device {
  143. vatools := "/dev/vatools"
  144. vaSync := "/dev/va_sync"
  145. devs := []*runtimeapi.Device{}
  146. for _, devPath := range []string{vatools, vaSync} {
  147. devs = append(devs, &runtimeapi.Device{
  148. ContainerPath: devPath,
  149. HostPath: devPath,
  150. Permissions: "rwm",
  151. })
  152. }
  153. return devs
  154. }
  155. func (v vastaitechGPUManager) NewContainerDevices(input *hostapi.ContainerCreateInput, dev *hostapi.ContainerDevice) ([]*runtimeapi.Device, []*runtimeapi.Device, error) {
  156. driHostPath := dev.IsolatedDevice.Path
  157. idx, err := v.getRelatedDeviceStartIndex(driHostPath)
  158. if err != nil {
  159. return nil, nil, errors.Wrapf(err, "get related device start index by %s", driHostPath)
  160. }
  161. perms := "rwm"
  162. devs := []*runtimeapi.Device{
  163. {
  164. HostPath: driHostPath,
  165. ContainerPath: driHostPath,
  166. Permissions: perms,
  167. },
  168. }
  169. for _, devPath := range v.getRelatedDevices(idx) {
  170. devs = append(devs, &runtimeapi.Device{
  171. HostPath: devPath,
  172. ContainerPath: devPath,
  173. Permissions: perms,
  174. })
  175. }
  176. return devs, v.getCommonDevices(), nil
  177. }
  178. func (v vastaitechGPUManager) ProbeDevices() ([]isolated_device.IDevice, error) {
  179. return nil, nil
  180. }
  181. func (v vastaitechGPUManager) GetContainerExtraConfigures(devs []*hostapi.ContainerDevice) ([]*runtimeapi.KeyValue, []*runtimeapi.Mount) {
  182. return nil, nil
  183. }
  184. type vastaitechGPU struct {
  185. *BaseDevice
  186. }
  187. func newVastaitechGPU(devPath string, index int) (*vastaitechGPU, error) {
  188. dev, err := newPCIGPURenderBaseDevice(devPath, index, isolated_device.ContainerDeviceTypeVastaitechGpu)
  189. if err != nil {
  190. return nil, errors.Wrap(err, "new PCIGPURenderBaseDevice")
  191. }
  192. return &vastaitechGPU{BaseDevice: dev}, nil
  193. }