| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293 |
- // Copyright 2019 Yunion
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- package container_device
- import (
- "fmt"
- "strconv"
- "strings"
- runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
- "yunion.io/x/log"
- "yunion.io/x/pkg/errors"
- "yunion.io/x/pkg/util/sets"
- hostapi "yunion.io/x/onecloud/pkg/apis/host"
- "yunion.io/x/onecloud/pkg/hostman/isolated_device"
- "yunion.io/x/onecloud/pkg/util/procutils"
- )
- func init() {
- isolated_device.RegisterContainerDeviceManager(newNvidiaGPUManager())
- }
// nvidiaGPUManager is the container device manager registered for the NVIDIA
// GPU device type. It holds no state of its own.
type nvidiaGPUManager struct{}

// newNvidiaGPUManager returns a pointer to a fresh, empty nvidiaGPUManager.
func newNvidiaGPUManager() *nvidiaGPUManager {
	return new(nvidiaGPUManager)
}
- func (m *nvidiaGPUManager) GetType() isolated_device.ContainerDeviceType {
- return isolated_device.ContainerDeviceTypeNvidiaGpu
- }
- func (m *nvidiaGPUManager) ProbeDevices() ([]isolated_device.IDevice, error) {
- return probeNvidiaGpus()
- }
- func (m *nvidiaGPUManager) NewDevices(dev *isolated_device.ContainerDevice) ([]isolated_device.IDevice, error) {
- return nil, nil
- }
- func (m *nvidiaGPUManager) NewContainerDevices(input *hostapi.ContainerCreateInput, dev *hostapi.ContainerDevice) ([]*runtimeapi.Device, []*runtimeapi.Device, error) {
- return nil, nil, nil
- }
- func (m *nvidiaGPUManager) GetContainerExtraConfigures(devs []*hostapi.ContainerDevice) ([]*runtimeapi.KeyValue, []*runtimeapi.Mount) {
- gpuIds := []string{}
- for _, dev := range devs {
- if dev.IsolatedDevice == nil {
- continue
- }
- types := sets.NewString(
- string(isolated_device.ContainerDeviceTypeNvidiaGpu),
- string(isolated_device.ContainerDeviceTypeNvidiaGpuShare),
- )
- if !types.Has(dev.IsolatedDevice.DeviceType) {
- continue
- }
- gpuIds = append(gpuIds, dev.IsolatedDevice.Path)
- }
- if len(gpuIds) == 0 {
- return nil, nil
- }
- retEnvs := []*runtimeapi.KeyValue{}
- if len(gpuIds) > 0 {
- retEnvs = append(retEnvs, []*runtimeapi.KeyValue{
- {
- Key: "NVIDIA_VISIBLE_DEVICES",
- Value: strings.Join(gpuIds, ","),
- },
- {
- Key: "NVIDIA_DRIVER_CAPABILITIES",
- Value: "all",
- },
- }...)
- }
- return retEnvs, nil
- }
- type nvidiaGPU struct {
- *BaseDevice
- memSize int
- gpuIndex int
- deviceMinor int
- }
- func (dev *nvidiaGPU) GetNvidiaDevMemSize() int {
- return dev.memSize
- }
- func (dev *nvidiaGPU) GetNvidiaDevIndex() string {
- return fmt.Sprintf("%d", dev.gpuIndex)
- }
- func (dev *nvidiaGPU) GetIndex() int {
- return dev.gpuIndex
- }
- func (dev *nvidiaGPU) GetDeviceMinor() int {
- return dev.deviceMinor
- }
- func probeNvidiaGpus() ([]isolated_device.IDevice, error) {
- if nvidiaGpuUsages != nil {
- res := make([]isolated_device.IDevice, 0)
- for pciAddr, dev := range nvidiaGpuUsages {
- if dev.Used {
- continue
- }
- res = append(res, nvidiaGpuUsages[pciAddr].nvidiaGPU)
- }
- nvidiaGpuUsages = nil
- return res, nil
- }
- devs, err := getNvidiaGPUs()
- if err != nil {
- return nil, err
- }
- res := make([]isolated_device.IDevice, 0)
- for i := range devs {
- res = append(res, devs[i])
- }
- return res, nil
- }
- func getNvidiaGPUs() ([]*nvidiaGPU, error) {
- devs := make([]*nvidiaGPU, 0)
- // nvidia-smi --query-gpu=gpu_uuid,gpu_name,gpu_bus_id --format=csv
- // uuid, name, pci.bus_id
- // GPU-bc1a3bb9-55cb-8c52-c374-4f8b4f388a20, NVIDIA A800-SXM4-80GB, 00000000:10:00.0
- // nvidia-smi --query-gpu=gpu_uuid,gpu_name,gpu_bus_id,memory.total,compute_mode --format=csv
- out, err := procutils.NewRemoteCommandAsFarAsPossible("nvidia-smi", "--query-gpu=gpu_uuid,gpu_name,gpu_bus_id,compute_mode,memory.total,index", "--format=csv").Output()
- if err != nil {
- return nil, errors.Wrap(err, "nvidia-smi")
- }
- lines := strings.Split(string(out), "\n")
- for _, line := range lines {
- if strings.HasPrefix(line, "uuid") {
- continue
- }
- segs := strings.Split(line, ",")
- if len(segs) != 6 {
- log.Errorf("unknown nvidia-smi out line %s", line)
- continue
- }
- gpuId, gpuName, gpuPciAddr, computeMode, memTotal, index := strings.TrimSpace(segs[0]), strings.TrimSpace(segs[1]), strings.TrimSpace(segs[2]), strings.TrimSpace(segs[3]), strings.TrimSpace(segs[4]), strings.TrimSpace(segs[5])
- if computeMode != "Default" {
- log.Warningf("gpu device %s compute mode %s, skip.", gpuId, computeMode)
- continue
- }
- indexInt, err := parseInt(index)
- if err != nil {
- return nil, errors.Wrapf(err, "failed parse index %s", index)
- }
- memSize, err := parseMemSize(memTotal)
- if err != nil {
- return nil, errors.Wrapf(err, "failed parse memSize %s", memTotal)
- }
- pciOutput, err := isolated_device.GetPCIStrByAddr(gpuPciAddr)
- if err != nil {
- return nil, errors.Wrapf(err, "GetPCIStrByAddr %s", gpuPciAddr)
- }
- dev := isolated_device.NewPCIDevice2(pciOutput[0])
- driverInfoPath := fmt.Sprintf("/proc/driver/nvidia/gpus/0000:%s/information", dev.Addr)
- driverContent, err := procutils.NewRemoteCommandAsFarAsPossible("cat", driverInfoPath).Output()
- if err != nil {
- return nil, errors.Wrapf(err, "failed get driver content from: %s", driverInfoPath)
- }
- driverInfo, err := parseNvidiaGPUDriverInformation(string(driverContent))
- if err != nil {
- return nil, errors.Wrapf(err, "failed parse driver content from: %s", driverInfoPath)
- }
- gpuDev := &nvidiaGPU{
- BaseDevice: NewBaseDevice(dev, isolated_device.ContainerDeviceTypeNvidiaGpu, gpuId),
- memSize: memSize,
- gpuIndex: indexInt,
- deviceMinor: driverInfo.DeviceMinor,
- }
- gpuDev.SetModelName(gpuName)
- devs = append(devs, gpuDev)
- }
- if len(devs) == 0 {
- return nil, nil
- }
- return devs, nil
- }
// NvidiaGPUDriverInformation holds the entries of a per-GPU driver information
// file (/proc/driver/nvidia/gpus/<address>/information). Each field is filled
// from the "key: value" line named in its comment.
type NvidiaGPUDriverInformation struct {
	// Model is the value of the "Model" entry.
	Model string
	// IRQ is the integer value of the "IRQ" entry.
	IRQ int
	// UUID is the value of the "GPU UUID" entry.
	UUID string
	// VideoBIOS is the value of the "Video BIOS" entry.
	VideoBIOS string
	// BusType is the value of the "Bus Type" entry.
	BusType string
	// DMASize is the raw text of the "DMA Size" entry.
	DMASize string
	// DMAMask is the raw text of the "DMA Mask" entry.
	DMAMask string
	// BusLocation is the value of the "Bus Location" entry.
	BusLocation string
	// DeviceMinor is the integer value of the "Device Minor" entry.
	DeviceMinor int
	// Firmware is the value of the "GPU Firmware" entry.
	Firmware string
	// Excluded is true when the "GPU Excluded" entry is anything but "No".
	Excluded bool
}
- // parseNvidiaGPUDriverInformation 解析下面文件的内容
- // cat /proc/driver/nvidia/gpus/0000\:61\:00.0/information
- // Model: NVIDIA GeForce RTX 4060
- // IRQ: 483
- // GPU UUID: GPU-2e1ab7a2-fda6-8b93-eba2-fa59e6135199
- // Video BIOS: 95.07.36.00.04
- // Bus Type: PCIe
- // DMA Size: 47 bits
- // DMA Mask: 0x7fffffffffff
- // Bus Location: 0000:61:00.0
- // Device Minor: 0
- // GPU Firmware: 570.133.07
- // GPU Excluded: No
- func parseNvidiaGPUDriverInformation(content string) (*NvidiaGPUDriverInformation, error) {
- info := &NvidiaGPUDriverInformation{}
- lines := strings.Split(content, "\n")
- for _, line := range lines {
- line = strings.TrimSpace(line)
- if line == "" {
- continue
- }
- // 解析 key: value 格式
- parts := strings.SplitN(line, ":", 2)
- if len(parts) != 2 {
- continue
- }
- key := strings.TrimSpace(parts[0])
- value := strings.TrimSpace(parts[1])
- switch key {
- case "Model":
- info.Model = value
- case "IRQ":
- if irq, err := parseInt(value); err != nil {
- return nil, errors.Wrapf(err, "failed parse IRQ %s", value)
- } else {
- info.IRQ = irq
- }
- case "GPU UUID":
- info.UUID = value
- case "Video BIOS":
- info.VideoBIOS = value
- case "Bus Type":
- info.BusType = value
- case "DMA Size":
- info.DMASize = value
- case "DMA Mask":
- info.DMAMask = value
- case "Bus Location":
- info.BusLocation = value
- case "Device Minor":
- if minor, err := parseInt(value); err != nil {
- return nil, errors.Wrapf(err, "failed parse Device Minor %s", value)
- } else {
- info.DeviceMinor = minor
- }
- case "GPU Firmware":
- info.Firmware = value
- case "GPU Excluded":
- info.Excluded = (value != "No")
- }
- }
- return info, nil
- }
// parseInt converts s to an int after stripping leading and trailing
// whitespace. It returns the error from strconv.Atoi when the remaining text
// is not a valid base-10 integer.
func parseInt(s string) (int, error) {
	return strconv.Atoi(strings.TrimSpace(s))
}
|