This commit is contained in:
havelight-ee
2023-05-30 14:44:26 +09:00
parent 9a3174deef
commit 4c32a7239d
2598 changed files with 164595 additions and 487 deletions

View File

@@ -0,0 +1,8 @@
---
dependencies:
- role: kubernetes-apps/container_engine_accelerator/nvidia_gpu
when: nvidia_accelerator_enabled
tags:
- apps
- nvidia_gpu
- container_engine_accelerator

View File

@@ -0,0 +1,14 @@
---
nvidia_accelerator_enabled: false
nvidia_driver_version: "390.87"
nvidia_gpu_tesla_base_url: https://us.download.nvidia.com/tesla/
nvidia_gpu_gtx_base_url: http://us.download.nvidia.com/XFree86/Linux-x86_64/
nvidia_gpu_flavor: tesla
nvidia_url_end: "{{ nvidia_driver_version }}/NVIDIA-Linux-x86_64-{{ nvidia_driver_version }}.run"
nvidia_driver_install_container: false
nvidia_driver_install_centos_container: atzedevries/nvidia-centos-driver-installer:2
nvidia_driver_install_ubuntu_container: registry.k8s.io/ubuntu-nvidia-driver-installer@sha256:7df76a0f0a17294e86f691c81de6bbb7c04a1b4b3d4ea4e7e2cccdc42e1f6d63
nvidia_driver_install_supported: false
nvidia_gpu_device_plugin_container: "registry.k8s.io/nvidia-gpu-device-plugin@sha256:0842734032018be107fa2490c98156992911e3e1f2a21e059ff0105b07dd8e9e"
nvidia_gpu_nodes: []
nvidia_gpu_device_plugin_memory: 30Mi

View File

@@ -0,0 +1,55 @@
---
- name: Container Engine Acceleration Nvidia GPU| gather os specific variables
include_vars: "{{ item }}"
with_first_found:
- files:
- "{{ ansible_distribution|lower }}-{{ ansible_distribution_version|lower|replace('/', '_') }}.yml"
- "{{ ansible_distribution|lower }}-{{ ansible_distribution_release }}.yml"
- "{{ ansible_distribution|lower }}-{{ ansible_distribution_major_version|lower|replace('/', '_') }}.yml"
- "{{ ansible_distribution|lower }}.yml"
- "{{ ansible_os_family|lower }}.yml"
skip: true
- name: Container Engine Acceleration Nvidia GPU | Set fact of download url Tesla
set_fact:
nvidia_driver_download_url_default: "{{ nvidia_gpu_tesla_base_url }}{{ nvidia_url_end }}"
when: nvidia_gpu_flavor|lower == "tesla"
- name: Container Engine Acceleration Nvidia GPU | Set fact of download url GTX
set_fact:
nvidia_driver_download_url_default: "{{ nvidia_gpu_gtx_base_url }}{{ nvidia_url_end }}"
when: nvidia_gpu_flavor|lower == "gtx"
- name: Container Engine Acceleration Nvidia GPU | Create addon dir
file:
path: "{{ kube_config_dir }}/addons/container_engine_accelerator"
owner: root
group: root
mode: 0755
recurse: true
- name: Container Engine Acceleration Nvidia GPU | Create manifests for nvidia accelerators
template:
src: "{{ item.file }}.j2"
dest: "{{ kube_config_dir }}/addons/container_engine_accelerator/{{ item.file }}"
mode: 0644
with_items:
- { name: nvidia-driver-install-daemonset, file: nvidia-driver-install-daemonset.yml, type: daemonset }
- { name: k8s-device-plugin-nvidia-daemonset, file: k8s-device-plugin-nvidia-daemonset.yml, type: daemonset }
register: container_engine_accelerator_manifests
when:
- inventory_hostname == groups['kube_control_plane'][0] and nvidia_driver_install_container
- name: Container Engine Acceleration Nvidia GPU | Apply manifests for nvidia accelerators
kube:
name: "{{ item.item.name }}"
namespace: "kube-system"
kubectl: "{{ bin_dir }}/kubectl"
resource: "{{ item.item.type }}"
filename: "{{ kube_config_dir }}/addons/container_engine_accelerator/{{ item.item.file }}"
state: "latest"
with_items:
- "{{ container_engine_accelerator_manifests.results }}"
when:
- inventory_hostname == groups['kube_control_plane'][0] and nvidia_driver_install_container and nvidia_driver_install_supported

View File

@@ -0,0 +1,60 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-gpu-device-plugin
namespace: kube-system
labels:
k8s-app: nvidia-gpu-device-plugin
addonmanager.kubernetes.io/mode: Reconcile
spec:
selector:
matchLabels:
k8s-app: nvidia-gpu-device-plugin
template:
metadata:
labels:
k8s-app: nvidia-gpu-device-plugin
spec:
priorityClassName: system-node-critical
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "nvidia.com/gpu"
operator: Exists
tolerations:
- operator: "Exists"
effect: "NoExecute"
- operator: "Exists"
effect: "NoSchedule"
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
hostPID: true
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: dev
hostPath:
path: /dev
containers:
- image: "{{ nvidia_gpu_device_plugin_container }}"
command: ["/usr/bin/nvidia-gpu-device-plugin", "-logtostderr"]
name: nvidia-gpu-device-plugin
resources:
requests:
cpu: 50m
memory: {{ nvidia_gpu_device_plugin_memory }}
limits:
cpu: 50m
memory: {{ nvidia_gpu_device_plugin_memory }}
securityContext:
privileged: true
volumeMounts:
- name: device-plugin
mountPath: /device-plugin
- name: dev
mountPath: /dev
updateStrategy:
type: RollingUpdate

View File

@@ -0,0 +1,82 @@
# Copyright 2017 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-driver-installer
namespace: kube-system
spec:
selector:
matchLabels:
name: nvidia-driver-installer
template:
metadata:
labels:
name: nvidia-driver-installer
spec:
priorityClassName: system-node-critical
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "nvidia.com/gpu"
operator: Exists
tolerations:
- key: "nvidia.com/gpu"
effect: "NoSchedule"
operator: "Exists"
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
hostPID: true
volumes:
- name: dev
hostPath:
path: /dev
- name: nvidia-install-dir-host
hostPath:
path: /home/kubernetes/bin/nvidia
- name: root-mount
hostPath:
path: /
initContainers:
- image: "{{ nvidia_driver_install_container }}"
name: nvidia-driver-installer
resources:
requests:
cpu: 0.15
securityContext:
privileged: true
env:
- name: NVIDIA_INSTALL_DIR_HOST
value: /home/kubernetes/bin/nvidia
- name: NVIDIA_INSTALL_DIR_CONTAINER
value: /usr/local/nvidia
- name: ROOT_MOUNT_DIR
value: /root
- name: NVIDIA_DRIVER_VERSION
value: "{{ nvidia_driver_version }}"
- name: NVIDIA_DRIVER_DOWNLOAD_URL
value: "{{ nvidia_driver_download_url_default }}"
volumeMounts:
- name: nvidia-install-dir-host
mountPath: /usr/local/nvidia
- name: dev
mountPath: /dev
- name: root-mount
mountPath: /root
containers:
- image: "{{ pod_infra_image_repo }}:{{ pod_infra_image_tag }}"
name: pause

View File

@@ -0,0 +1,3 @@
---
nvidia_driver_install_container: "{{ nvidia_driver_install_centos_container }}"
nvidia_driver_install_supported: true

View File

@@ -0,0 +1,3 @@
---
nvidia_driver_install_container: "{{ nvidia_driver_install_ubuntu_container }}"
nvidia_driver_install_supported: true

View File

@@ -0,0 +1,3 @@
---
nvidia_driver_install_container: "{{ nvidia_driver_install_ubuntu_container }}"
nvidia_driver_install_supported: true