# install-nvidia-driver.yml (4.9 KB)
  # Record the running kernel release (output of `uname -r`) in
  # current_kernel_version. The task is read-only, so it never reports a change.
  # `command` is used instead of `shell` because no shell features are needed.
  - name: Get current running kernel version
    ansible.builtin.command: uname -r
    register: current_kernel_version
    changed_when: false
  # Hand off to the task file whose name is built from the ansible_os_family
  # fact (install-nvidia-driver-<os family>.yml).
  - name: Install kernel headers, development packages and build tools
    ansible.builtin.include_tasks:
      file: "install-nvidia-driver-{{ ansible_os_family }}.yml"
  # Disable the Nouveau driver.
  # Writes a modprobe configuration file that blacklists nouveau and turns off
  # its modesetting; the result is registered as nouveau_blacklisted.
  - name: Blacklist nouveau driver
    become: true
    ansible.builtin.copy:
      dest: /etc/modprobe.d/blacklist-nouveau.conf
      mode: '0644'
      content: |
        blacklist nouveau
        options nouveau modeset=0
    register: nouveau_blacklisted
  # Rebuild the initramfs only when the nouveau blacklist file was actually
  # written or modified. Exactly one of the two tasks applies, chosen by the
  # ansible_os_family fact. Both are single commands without pipes, globs or
  # redirection, so `command` is used rather than `shell` with a bash override.
  - name: Regenerate initramfs after blacklisting nouveau (RedHat)
    ansible.builtin.command: dracut --force
    become: true
    when:
      - nouveau_blacklisted.changed
      - ansible_os_family == "RedHat"

  - name: Regenerate initramfs after blacklisting nouveau (Debian)
    ansible.builtin.command: update-initramfs -u
    become: true
    when:
      - nouveau_blacklisted.changed
      - ansible_os_family == "Debian"
  # Clean up vfio-related configuration and grub parameters, then rebuild the
  # grub configuration and reboot.
  # The four tasks below each delete one vfio-related file if it exists.
  - name: 删除 /etc/modules-load.d/vfio.conf
    become: true
    ansible.builtin.file:
      state: absent
      path: /etc/modules-load.d/vfio.conf

  - name: 删除 /etc/modprobe.d/vfio.conf
    become: true
    ansible.builtin.file:
      state: absent
      path: /etc/modprobe.d/vfio.conf

  - name: 删除 /etc/modprobe.d/blacklist-gpu.conf
    become: true
    ansible.builtin.file:
      state: absent
      path: /etc/modprobe.d/blacklist-gpu.conf

  - name: 删除 /usr/bin/vfio-pci-override.sh
    become: true
    ansible.builtin.file:
      state: absent
      path: /usr/bin/vfio-pci-override.sh
  # Strip the vfio_iommu_type1.allow_unsafe_interrupts parameter (with or
  # without a value) from /etc/default/grub.
  # The dot is escaped so it only matches a literal '.', and the optional value
  # stops at whitespace or a quote character: a greedy \S* would also consume
  # the closing quote of the GRUB_CMDLINE_* value when this parameter is the
  # last one on the line, leaving the file syntactically broken.
  - name: 移除 /etc/default/grub 里的 vfio_iommu_type1.allow_unsafe_interrupts
    ansible.builtin.replace:
      path: /etc/default/grub
      regexp: ' vfio_iommu_type1\.allow_unsafe_interrupts(=[^\s"'']*)?'
      replace: ''
    become: true
    register: grub_vfio_removed

  # Strip the " iommu=pt" parameter from /etc/default/grub. The leading space
  # in the pattern keeps it from matching inside longer names such as
  # intel_iommu=... or amd_iommu=...
  - name: 移除 /etc/default/grub 里的 iommu=pt
    ansible.builtin.replace:
      path: /etc/default/grub
      regexp: ' iommu=pt'
      replace: ''
    become: true
    register: grub_iommu_removed
  # Rebuild the grub configuration through the OS-family specific task file,
  # but only if one of the two grub parameter removals changed the file.
  - name: 重新生成 grub 配置
    when: grub_vfio_removed is changed or grub_iommu_removed is changed
    ansible.builtin.include_tasks:
      file: "configure-grub-{{ ansible_os_family }}.yml"
  # Reboot when any earlier step changed something that only takes effect after
  # a restart: the nouveau blacklist or the grub parameter removals, plus
  # kernel_install_result.
  # NOTE(review): kernel_install_result is not registered in this file; it is
  # presumably registered by the included install-nvidia-driver-<os family>.yml
  # tasks - confirm. It is read through default() so that a missing
  # registration counts as "not changed" instead of aborting the play with an
  # undefined-variable error.
  - name: 重启系统
    ansible.builtin.reboot:
      msg: "Reboot initiated by Ansible after vfio cleanup and grub update"
      connect_timeout: 5
      reboot_timeout: 600
      pre_reboot_delay: 0
      post_reboot_delay: 30
      test_command: whoami
    become: true
    when: >-
      (kernel_install_result | default({})).changed | default(false)
      or nouveau_blacklisted.changed
      or grub_vfio_removed.changed
      or grub_iommu_removed.changed
  # Probe for a working driver by running nvidia-smi. The probe is read-only
  # and is allowed to fail: failed_when keeps a non-zero exit (or a missing
  # binary) from failing the play, and changed_when keeps it from being
  # reported as a change on every run.
  - name: Check if NVIDIA driver is already installed
    ansible.builtin.command: nvidia-smi
    register: nvidia_check
    changed_when: false
    failed_when: false

  # True only when nvidia-smi exited with status 0. A missing rc (for example
  # when the probe was skipped in check mode) is treated as "not installed".
  - name: Set NVIDIA driver installation flag
    ansible.builtin.set_fact:
      nvidia_driver_installed: "{{ (nvidia_check.rc | default(1)) == 0 }}"
  # Copy the NVIDIA installer from the controller to /opt/nvidia on the target
  # and run it. The whole block is skipped when nvidia_driver_installed is
  # true; `| bool` makes the check robust if the fact arrives as a string.
  - name: Install the NVIDIA driver when it is not already present
    when: not (nvidia_driver_installed | default(false) | bool)
    block:
      # Fail early with a readable message. Without this, the first task that
      # templates nvidia_driver_installer_path aborts with an undefined-variable
      # error, so a per-task "is defined" guard further down can never help.
      - name: Ensure the NVIDIA driver installer path was provided
        ansible.builtin.assert:
          that:
            - nvidia_driver_installer_path is defined
          fail_msg: "nvidia_driver_installer_path must point to the NVIDIA driver installer file"

      - name: Create NVIDIA installation directory
        ansible.builtin.file:
          path: /opt/nvidia
          state: directory
          mode: '0755'
        become: true

      # Keep only the file name; it is reused as the remote file name.
      - name: Extract NVIDIA driver installer filename
        ansible.builtin.set_fact:
          nvidia_driver_installer: "{{ nvidia_driver_installer_path | basename }}"

      - name: Ensure rsync is installed on remote host
        ansible.builtin.package:
          name: rsync
          state: present
        become: true

      # NOTE(review): the module name is kept as written in the original;
      # synchronize is believed to ship in the ansible.posix collection and to
      # be reached from this name through a redirect - confirm for the Ansible
      # version in use.
      - name: Copy NVIDIA driver installer to remote host using synchronize
        ansible.builtin.synchronize:
          src: "{{ nvidia_driver_installer_path }}"
          dest: "/opt/nvidia/{{ nvidia_driver_installer }}"
          mode: push
        delegate_to: localhost
        become: false

      # Print the first existing kernel source/header directory for the running
      # kernel, or nothing when none of the candidates exists.
      - name: Find kernel source path
        ansible.builtin.shell: |
          kver={{ current_kernel_version.stdout | quote }}
          for candidate in \
            "/usr/src/kernels/${kver}" \
            "/usr/src/linux-headers-${kver}" \
            "/lib/modules/${kver}/build"
          do
            if [ -d "${candidate}" ]; then
              echo "${candidate}"
              exit 0
            fi
          done
        register: kernel_source_path_result
        changed_when: false
        failed_when: false

      # Empty string means "no kernel source directory found".
      - name: Set kernel source path
        ansible.builtin.set_fact:
          kernel_source_path: "{{ kernel_source_path_result.stdout | default('') }}"

      # One task covers both cases (with and without --kernel-source-path).
      # Using two mutually exclusive tasks that register the same variable lets
      # the skipped task overwrite the result of the one that actually ran.
      - name: Install NVIDIA driver
        ansible.builtin.shell: |
          chmod +x {{ nvidia_driver_installer | quote }}
          ./{{ nvidia_driver_installer | quote }} --silent --accept-license --no-questions {{ kernel_source_arg }}
        args:
          chdir: /opt/nvidia
          executable: /bin/bash
        vars:
          # Expands to the extra installer option, or to nothing when no kernel
          # source directory was found.
          kernel_source_arg: >-
            {{ ('--kernel-source-path=' ~ (kernel_source_path | quote))
            if (kernel_source_path | default('')) else '' }}
        become: true
        register: nvidia_install_result

      - name: Show NVIDIA driver installer output
        ansible.builtin.debug:
          var: nvidia_install_result.stdout_lines