add_node.py 3.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. # encoding: utf-8
  2. from __future__ import unicode_literals
  3. import socket
  4. from . import consts
  5. from .service import AddNodeService, AddNodesConfig
  6. from .cluster import construct_cluster
  7. from .parser import inject_add_nodes_runtime_options, inject_ai_nvidia_options
  8. from .utils import is_ipv4, is_ipv6
  class AddWorkerNodeService(AddNodeService):
      """Service class behind the ``add-node`` command.

      Extends ``AddNodeService`` with the runtime and NVIDIA/AI option groups
      plus an ``--enable-ai-env`` switch, and turns the parsed arguments into
      an ``AddNodesConfig`` run.
      """

      def inject_options(self, parser):
          """Add this command's options to *parser*.

          The base-class options are injected first, then the add-nodes
          runtime options, the NVIDIA options and finally the
          ``--enable-ai-env`` flag (stored as ``enable_ai_env``, default
          ``False``).
          """
          super(AddWorkerNodeService, self).inject_options(parser)
          inject_add_nodes_runtime_options(parser)
          inject_ai_nvidia_options(parser)
          parser.add_argument("--enable-ai-env",
                              dest="enable_ai_env",
                              action="store_true",
                              default=False,
                              help="enable AI environment on the new node (NVIDIA driver/CUDA, containerd device mapping). Implies --runtime containerd.")

      def do_action(self, args):
          """Build an ``AddNodesConfig`` from the parsed *args* and run it.

          Normalisation applied to *args* before the config is built:

          * ``ip_type``: when unset (empty string or ``None``) it is inferred
            from the address family of ``primary_master_host``.
          * ``runtime``: forced to containerd when ``enable_ai_env`` is set,
            otherwise defaults to qemu when it is ``None``.

          Returns:
              Whatever ``AddNodesConfig.run()`` returns.

          Raises:
              ValueError: if ``ip_type`` is unset and ``primary_master_host``
                  is neither an IPv4 nor an IPv6 address, or if the AI
                  environment is requested together with the qemu runtime.
          """
          cluster = construct_cluster(
              args.primary_master_host,
              args.ssh_user,
              args.ssh_private_file,
              args.ssh_port)

          # An unset ip_type may arrive as '' or as None depending on how the
          # option was declared; treat both as "not specified" so None is
          # never forwarded to AddNodesConfig by accident.
          if not args.ip_type:
              if is_ipv4(args.primary_master_host):
                  args.ip_type = consts.IP_TYPE_IPV4
              elif is_ipv6(args.primary_master_host):
                  args.ip_type = consts.IP_TYPE_IPV6
              else:
                  raise ValueError("ip type is not set and cannot be determined from primary master host")

          # The AI environment requires containerd; an explicit
          # "--runtime qemu" alongside it is rejected rather than overridden.
          enable_ai_env = getattr(args, 'enable_ai_env', False)
          if enable_ai_env:
              if args.runtime == consts.RUNTIME_QEMU:
                  raise ValueError("AI 环境必须使用 containerd 运行时,不能与 --runtime qemu 同时使用。请去掉 --runtime qemu 或改用 containerd。")
              args.runtime = consts.RUNTIME_CONTAINERD

          # No runtime specified: fall back to the default, qemu.
          if args.runtime is None:
              args.runtime = consts.RUNTIME_QEMU

          # Read once; used both as a forwarded option and for the dual-stack
          # address split below.
          ip_dual_conf = getattr(args, 'ip_dual_conf', None)

          kwargs = {
              'runtime': args.runtime,
              'host_networks': args.host_networks,
              'disk_paths': args.disk_paths,
              'ip_dual_conf': ip_dual_conf,
              'ip_type': args.ip_type,
              'offline_data_path': args.offline_data_path,
              'enable_ai_env': enable_ai_env,
              'gpu_device_virtual_number': getattr(args, 'gpu_device_virtual_number', 2),
              'nvidia_driver_installer_path': getattr(args, 'nvidia_driver_installer_path', None),
              'cuda_installer_path': getattr(args, 'cuda_installer_path', None),
          }

          # Dual stack: the first target host and ip_dual_conf are taken to be
          # one address of each family; work out which is which.
          if args.ip_type == consts.IP_TYPE_DUAL_STACK and ip_dual_conf:
              primary_ip = args.target_node_hosts[0]
              if is_ipv4(primary_ip):
                  # Primary address is IPv4, so ip_dual_conf is the IPv6 one.
                  kwargs['node_ip_v4'] = primary_ip
                  kwargs['node_ip_v6'] = ip_dual_conf
              else:
                  # Primary address is not IPv4: it is used as the IPv6
                  # address and ip_dual_conf as the IPv4 one.
                  kwargs['node_ip_v4'] = ip_dual_conf
                  kwargs['node_ip_v6'] = primary_ip

          config = AddNodesConfig(cluster,
                                  args.target_node_hosts,
                                  args.ssh_user,
                                  args.ssh_private_file,
                                  args.ssh_port,
                                  args.ssh_node_port,
                                  enable_host_on_vm=True,
                                  **kwargs)
          return config.run()
  def add_command(subparsers):
      """Instantiate ``AddWorkerNodeService`` for the ``add-node`` command.

      The service is constructed with *subparsers*, the command name and a
      short description; presumably the base-class constructor performs the
      actual sub-command registration (not visible here).
      """
      command_name = "add-node"
      command_summary = "add new node into cluster"
      AddWorkerNodeService(subparsers, command_name, command_summary)