check-gpu-and-driver.yml 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142
  # Pre-deployment check: verify that the target host has an NVIDIA GPU, and
  # that the driver/CUDA are already installed or an installer path was provided.
  # Probe the PCI device list for an NVIDIA entry. The pipeline's exit status
  # is grep's, so rc is 0 only when at least one line matched.
  # This is a read-only probe: it never reports "changed" and never fails on
  # its own; the outcome is only recorded in nvidia_gpu_check.
  # check_mode: false makes the probe run under --check as well, so the
  # registered rc/stdout are real values instead of a skipped placeholder.
  - name: Check if NVIDIA GPU is present on target host
    shell: lspci | grep -i nvidia
    register: nvidia_gpu_check
    changed_when: false
    failed_when: false
    check_mode: false
  # Abort unless the registered GPU probe (nvidia_gpu_check) both exited with
  # status 0 and printed at least one non-blank character.
  - name: Fail when no NVIDIA GPU detected
    when: >-
      not (nvidia_gpu_check.rc == 0
      and (nvidia_gpu_check.stdout | default('') | trim | length > 0))
    fail:
      msg: "本机未检测到 NVIDIA GPU,无法部署 AI 环境。请使用带 NVIDIA GPU 的机器或提供正确的目标主机。"
  # Run nvidia-smi and record its result in nvidia_driver_check; the exit
  # status is what the later fact derives "driver installed" from.
  # No shell features (pipes, redirects, globbing) are needed, so the command
  # module is used instead of shell.
  # Read-only probe: never "changed", never fails by itself (a missing binary
  # just yields a non-zero rc). check_mode: false keeps it running under
  # --check so the registered rc reflects the real host state.
  - name: Check if NVIDIA driver is already installed
    command: nvidia-smi
    register: nvidia_driver_check
    changed_when: false
    failed_when: false
    check_mode: false
  # Turn the nvidia-smi probe result into the nvidia_driver_installed fact:
  # true only when the probe exited with status 0.
  # default(1) treats a result without an rc attribute (e.g. a skipped probe)
  # as "not installed" instead of raising an undefined-variable error.
  - name: Set NVIDIA driver installation flag
    set_fact:
      nvidia_driver_installed: "{{ (nvidia_driver_check.rc | default(1)) == 0 }}"
  # Run nvcc --version from the fixed path /usr/local/cuda/bin and record the
  # result in cuda_check. Only this path is probed, so a CUDA toolkit
  # installed elsewhere is reported as a non-zero rc.
  # No shell features are needed, so the command module is used instead of
  # shell.
  # Read-only probe: never "changed", never fails by itself. check_mode: false
  # keeps it running under --check so the registered rc is a real value.
  - name: Check if CUDA is already installed
    command: /usr/local/cuda/bin/nvcc --version
    register: cuda_check
    changed_when: false
    failed_when: false
    check_mode: false
  # Turn the nvcc probe result into the cuda_installed fact: true only when
  # the probe exited with status 0.
  # default(1) treats a result without an rc attribute (e.g. a skipped probe)
  # as "not installed" instead of raising an undefined-variable error.
  - name: Set CUDA installation flag
    set_fact:
      cuda_installed: "{{ (cuda_check.rc | default(1)) == 0 }}"
  # Abort when the driver is missing AND no usable installer path was given.
  # The list form of "when" is an explicit AND of its items.
  #  - "| bool" coerces the flag so a string such as "False" is not treated
  #    as truthy by "not".
  #  - default('', true) maps an undefined, null or empty path to '', so a
  #    path that is defined but blank counts as "not provided" too.
  - name: Fail when driver not installed and no installer path provided
    fail:
      msg: "未检测到已安装的 NVIDIA 驱动,且未提供 --nvidia-driver-installer-path。请先安装驱动或通过参数指定安装包路径。"
    when:
      - not (nvidia_driver_installed | default(false) | bool)
      - nvidia_driver_installer_path | default('', true) | string | trim | length == 0
  # Abort when CUDA is missing AND no usable installer path was given.
  # The list form of "when" is an explicit AND of its items.
  #  - "| bool" coerces the flag so a string such as "False" is not treated
  #    as truthy by "not".
  #  - default('', true) maps an undefined, null or empty path to '', so a
  #    path that is defined but blank counts as "not provided" too.
  - name: Fail when CUDA not installed and no installer path provided
    fail:
      msg: "未检测到已安装的 CUDA,且未提供 --cuda-installer-path。请先安装 CUDA 或通过参数指定安装包路径。"
    when:
      - not (cuda_installed | default(false) | bool)
      - cuda_installer_path | default('', true) | string | trim | length == 0