ascend_npu.go 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. // Copyright 2019 Yunion
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package container_device
  15. import (
  16. "fmt"
  17. "strconv"
  18. "strings"
  19. runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
  20. "yunion.io/x/log"
  21. "yunion.io/x/pkg/errors"
  22. hostapi "yunion.io/x/onecloud/pkg/apis/host"
  23. "yunion.io/x/onecloud/pkg/hostman/isolated_device"
  24. "yunion.io/x/onecloud/pkg/util/procutils"
  25. )
  26. func init() {
  27. isolated_device.RegisterContainerDeviceManager(newAscendNPUManager())
  28. }
  29. type ascendNPUManager struct{}
  30. func (m *ascendNPUManager) GetContainerExtraConfigures(devs []*hostapi.ContainerDevice) ([]*runtimeapi.KeyValue, []*runtimeapi.Mount) {
  31. npus := []string{}
  32. for _, dev := range devs {
  33. if dev.IsolatedDevice == nil {
  34. continue
  35. }
  36. if isolated_device.ContainerDeviceType(dev.IsolatedDevice.DeviceType) != isolated_device.ContainerDeviceTypeAscendNpu {
  37. continue
  38. }
  39. npus = append(npus, dev.IsolatedDevice.Path)
  40. }
  41. if len(npus) == 0 {
  42. return nil, nil
  43. }
  44. var (
  45. ASCEND_TOOLKIT_HOME = "/usr/local/Ascend/ascend-toolkit/latest"
  46. LD_LIBRARY_PATH = fmt.Sprintf("/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:"+
  47. "%s/lib64:%s/lib64/plugin/opskernel:%s/lib64/plugin/nnengine", ASCEND_TOOLKIT_HOME, ASCEND_TOOLKIT_HOME, ASCEND_TOOLKIT_HOME)
  48. ASCEND_AICPU_PATH = ASCEND_TOOLKIT_HOME
  49. ASCEND_OPP_PATH = fmt.Sprintf("%s/opp", ASCEND_TOOLKIT_HOME)
  50. TOOLCHAIN_HOME = fmt.Sprintf("%s/toolkit", ASCEND_TOOLKIT_HOME)
  51. ASCEND_HOME_PATH = ASCEND_AICPU_PATH
  52. )
  53. return []*runtimeapi.KeyValue{
  54. {
  55. Key: "ASCEND_TOOLKIT_HOME",
  56. Value: ASCEND_TOOLKIT_HOME,
  57. }, {
  58. Key: "LD_LIBRARY_PATH",
  59. Value: LD_LIBRARY_PATH,
  60. }, {
  61. Key: "ASCEND_AICPU_PATH",
  62. Value: ASCEND_AICPU_PATH,
  63. }, {
  64. Key: "ASCEND_OPP_PATH",
  65. Value: ASCEND_OPP_PATH,
  66. }, {
  67. Key: "TOOLCHAIN_HOME",
  68. Value: TOOLCHAIN_HOME,
  69. }, {
  70. Key: "ASCEND_HOME_PATH",
  71. Value: ASCEND_HOME_PATH,
  72. },
  73. }, []*runtimeapi.Mount{
  74. {
  75. ContainerPath: "/usr/local/Ascend",
  76. HostPath: "/usr/local/Ascend",
  77. Readonly: true,
  78. },
  79. {
  80. ContainerPath: "/usr/local/dcmi",
  81. HostPath: "/usr/local/dcmi",
  82. Readonly: true,
  83. },
  84. {
  85. ContainerPath: "/usr/local/bin/npu-smi",
  86. HostPath: "/usr/local/bin/npu-smi",
  87. Readonly: true,
  88. },
  89. }
  90. }
  91. func newAscendNPUManager() *ascendNPUManager {
  92. return &ascendNPUManager{}
  93. }
  94. func (m *ascendNPUManager) ProbeDevices() ([]isolated_device.IDevice, error) {
  95. return getAscendNpus()
  96. }
  97. func (m *ascendNPUManager) NewDevices(dev *isolated_device.ContainerDevice) ([]isolated_device.IDevice, error) {
  98. return nil, nil
  99. }
  100. func (m *ascendNPUManager) NewContainerDevices(input *hostapi.ContainerCreateInput, dev *hostapi.ContainerDevice) ([]*runtimeapi.Device, []*runtimeapi.Device, error) {
  101. return []*runtimeapi.Device{
  102. &runtimeapi.Device{
  103. ContainerPath: dev.IsolatedDevice.Path,
  104. HostPath: dev.IsolatedDevice.Path,
  105. Permissions: "rwm",
  106. },
  107. }, []*runtimeapi.Device{
  108. &runtimeapi.Device{
  109. ContainerPath: "/dev/davinci_manager",
  110. HostPath: "/dev/davinci_manager",
  111. Permissions: "rwm",
  112. },
  113. &runtimeapi.Device{
  114. ContainerPath: "/dev/devmm_svm",
  115. HostPath: "/dev/devmm_svm",
  116. Permissions: "rwm",
  117. },
  118. &runtimeapi.Device{
  119. ContainerPath: "/dev/hisi_hdc",
  120. HostPath: "/dev/hisi_hdc",
  121. Permissions: "rwm",
  122. },
  123. }, nil
  124. }
  125. func (m *ascendNPUManager) GetType() isolated_device.ContainerDeviceType {
  126. return isolated_device.ContainerDeviceTypeAscendNpu
  127. }
  128. type ascnedNPU struct {
  129. *BaseDevice
  130. }
  131. func getAscendNpus() ([]isolated_device.IDevice, error) {
  132. devs := make([]isolated_device.IDevice, 0)
  133. // Show all device's topology information
  134. out, err := procutils.NewRemoteCommandAsFarAsPossible("npu-smi", "info").Output()
  135. if err != nil {
  136. return nil, errors.Wrap(err, "npu-smi")
  137. }
  138. lines := strings.Split(string(out), "\n")
  139. for i := 6; i < len(lines); i += 3 {
  140. if !strings.HasPrefix(lines[i], "|") {
  141. continue
  142. }
  143. if len(lines) <= (i + 1) {
  144. return nil, errors.Errorf("failed parse npu-smi unknown chip line")
  145. }
  146. fields := strings.Fields(lines[i])
  147. if len(fields) < 3 {
  148. return nil, errors.Errorf("failed parse npu-smi unknown npu line")
  149. }
  150. log.Debugf("fields %v", fields)
  151. strNpuID := fields[1]
  152. npuId, err := strconv.Atoi(strNpuID)
  153. if err != nil {
  154. log.Warningf("failed parse npuid %s: %s. break", strNpuID, err)
  155. break
  156. }
  157. npuName := fields[2]
  158. devPath := fmt.Sprintf("/dev/davinci%d", npuId)
  159. fileds2 := strings.Fields(lines[i+1])
  160. if len(fileds2) < 4 {
  161. return nil, errors.Errorf("failed parse npu-smi unknonw chip line get busid")
  162. }
  163. log.Debugf("fileds2 %v", fileds2)
  164. busID := fileds2[3]
  165. pciOutput, err := isolated_device.GetPCIStrByAddr(busID)
  166. if err != nil {
  167. return nil, errors.Wrapf(err, "GetPCIStrByAddr %s", busID)
  168. }
  169. dev := isolated_device.NewPCIDevice2(pciOutput[0])
  170. npuDev := &ascnedNPU{
  171. BaseDevice: NewBaseDevice(dev, isolated_device.ContainerDeviceTypeAscendNpu, devPath),
  172. }
  173. npuDev.SetModelName(npuName)
  174. devs = append(devs, npuDev)
  175. }
  176. if len(devs) == 0 {
  177. return nil, nil
  178. }
  179. return devs, nil
  180. }