From 0912f36c1b867f4dbcccbec6e46e5530d433fa16 Mon Sep 17 00:00:00 2001 From: nomadics9 Date: Sun, 12 Jan 2025 04:11:56 +0300 Subject: [PATCH] gpu-operator values --- charts/gpu-operator/values.yaml | 442 +++++++++++++++++++++++++++++++- 1 file changed, 431 insertions(+), 11 deletions(-) diff --git a/charts/gpu-operator/values.yaml b/charts/gpu-operator/values.yaml index c0af3a0..7001a6a 100644 --- a/charts/gpu-operator/values.yaml +++ b/charts/gpu-operator/values.yaml @@ -1,15 +1,435 @@ -toolkit: +ccManager: + defaultMode: 'off' + enabled: false env: - - name: CONTAINERD_CONFIG - value: "/etc/containerd/config.toml.tmpl" - - name: CONTAINERD_SOCKET - value: "/run/k3s/containerd/containerd.sock" - - name: CONTAINERD_RUNTIME_CLASS - value: "nvidia" - - name: CONTAINERD_SET_AS_DEFAULT - value: "true" - + - name: CC_CAPABLE_DEVICE_IDS + value: 0x2339,0x2331,0x2330,0x2324,0x2322,0x233d + image: k8s-cc-manager + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + repository: nvcr.io/nvidia/cloud-native + resources: {} + version: v0.1.1 +cdi: + default: false + enabled: false +daemonsets: + annotations: {} + labels: {} + priorityClassName: system-node-critical + rollingUpdate: + maxUnavailable: '1' + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + updateStrategy: RollingUpdate +dcgm: + args: [] + enabled: false + env: [] + image: dcgm + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + resources: {} + version: 3.3.9-1-ubuntu22.04 +dcgmExporter: + enabled: true + env: + - name: DCGM_EXPORTER_LISTEN + value: :9400 + - name: DCGM_EXPORTER_KUBERNETES + value: 'true' + - name: DCGM_EXPORTER_COLLECTORS + value: /etc/dcgm-exporter/dcp-metrics-included.csv + image: dcgm-exporter + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/k8s + resources: {} + serviceMonitor: + additionalLabels: {} + enabled: false + honorLabels: false + interval: 15s + relabelings: [] + version: 3.3.9-3.6.1-ubuntu22.04 devicePlugin: + args: [] config: - name: time-slicing-config-all + create: false + data: {} default: any + name: time-slicing-config-all + enabled: true + env: + - name: PASS_DEVICE_SPECS + value: 'true' + - name: FAIL_ON_INIT_ERROR + value: 'true' + - name: DEVICE_LIST_STRATEGY + value: envvar + - name: DEVICE_ID_STRATEGY + value: uuid + - name: NVIDIA_VISIBLE_DEVICES + value: all + - name: NVIDIA_DRIVER_CAPABILITIES + value: all + image: k8s-device-plugin + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + mps: + root: /run/nvidia/mps + repository: nvcr.io/nvidia + resources: {} + version: v0.17.0 +driver: + certConfig: + name: '' + enabled: true + env: [] + image: driver + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + kernelModuleConfig: + name: '' + licensingConfig: + configMapName: '' + nlsEnabled: true + manager: + env: + - name: ENABLE_GPU_POD_EVICTION + value: 'true' + - name: ENABLE_AUTO_DRAIN + value: 'false' + - name: DRAIN_USE_FORCE + value: 'false' + - name: DRAIN_POD_SELECTOR_LABEL + value: '' + - name: DRAIN_TIMEOUT_SECONDS + value: 0s + - name: DRAIN_DELETE_EMPTYDIR_DATA + value: 'false' + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.7.0 + nvidiaDriverCRD: + deployDefaultCR: true + driverType: gpu + enabled: false + nodeSelector: {} + rdma: + enabled: false + useHostMofed: false + repoConfig: + configMapName: '' + repository: nvcr.io/nvidia + resources: {} + startupProbe: + failureThreshold: 120 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 60 + upgradePolicy: + autoUpgrade: true + drain: + deleteEmptyDir: false + enable: false + force: false + podSelector: '' + timeoutSeconds: 300 + gpuPodDeletion: + deleteEmptyDir: false + force: false + timeoutSeconds: 300 + maxParallelUpgrades: 1 + maxUnavailable: 25% + waitForCompletion: + podSelector: '' + timeoutSeconds: 0 + useOpenKernelModules: false + usePrecompiled: false + version: 550.127.08 + virtualTopology: + config: '' +gdrcopy: + args: [] + enabled: false + env: [] + image: gdrdrv + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + repository: nvcr.io/nvidia/cloud-native + version: v2.4.1-2 +gds: + args: [] + enabled: false + env: [] + image: nvidia-fs + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + repository: nvcr.io/nvidia/cloud-native + version: 2.20.5 +gfd: + enabled: true + env: + - name: GFD_SLEEP_INTERVAL + value: 60s + - name: GFD_FAIL_ON_INIT_ERROR + value: 'true' + image: k8s-device-plugin + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + repository: nvcr.io/nvidia + resources: {} + version: v0.17.0 +hostPaths: + driverInstallDir: /run/nvidia/driver + rootFS: / +kataManager: + config: + artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses + runtimeClasses: + - artifacts: + pullSecret: '' + url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03 + name: kata-nvidia-gpu + nodeSelector: {} + - artifacts: + pullSecret: '' + url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp + name: kata-nvidia-gpu-snp + nodeSelector: + nvidia.com/cc.capable: 'true' + enabled: false + env: [] + image: k8s-kata-manager + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + repository: nvcr.io/nvidia/cloud-native + resources: {} + version: v0.2.2 +mig: + strategy: single +migManager: + config: + create: false + data: {} + default: all-disabled + name: '' + enabled: true + env: + - name: WITH_REBOOT + value: 'false' + gpuClientsConfig: + name: '' + image: k8s-mig-manager + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + repository: nvcr.io/nvidia/cloud-native + resources: {} + version: v0.10.0-ubuntu20.04 +nfd: + enabled: true + nodefeaturerules: false +node-feature-discovery: + enableNodeFeatureApi: true + gc: + enable: true + replicaCount: 1 + serviceAccount: + create: false + name: node-feature-discovery + master: + config: + extraLabelNs: + - nvidia.com + serviceAccount: + create: true + name: node-feature-discovery + priorityClassName: system-node-critical + worker: + config: + sources: + pci: + deviceClassWhitelist: + - '02' + - '0200' + - '0207' + - '0300' + - '0302' + deviceLabelFields: + - vendor + serviceAccount: + create: false + name: node-feature-discovery + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Equal + value: '' + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Equal + value: '' + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists +nodeStatusExporter: + enabled: false + image: gpu-operator-validator + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + repository: nvcr.io/nvidia/cloud-native + resources: {} +operator: + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: node-role.kubernetes.io/master + operator: In + values: + - '' + weight: 1 + - preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: In + values: + - '' + weight: 1 + annotations: + openshift.io/scc: restricted-readonly + cleanupCRD: false + defaultRuntime: docker + image: gpu-operator + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + initContainer: + image: cuda + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia + version: 12.6.3-base-ubi9 + logging: + develMode: false + level: info + timeEncoding: epoch + priorityClassName: system-node-critical + repository: nvcr.io/nvidia + resources: + limits: + cpu: 500m + memory: 350Mi + requests: + cpu: 200m + memory: 100Mi + runtimeClass: nvidia + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Equal + value: '' + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Equal + value: '' + upgradeCRD: true + use_ocp_driver_toolkit: false +platform: + openshift: false +psa: + enabled: false +sandboxDevicePlugin: + args: [] + enabled: true + env: [] + image: kubevirt-gpu-device-plugin + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + repository: nvcr.io/nvidia + resources: {} + version: v1.2.10 +sandboxWorkloads: + defaultWorkload: container + enabled: false +toolkit: + enabled: true + env: + - name: CONTAINERD_CONFIG + value: /etc/containerd/config.toml.tmpl + - name: CONTAINERD_SOCKET + value: /run/k3s/containerd/containerd.sock + - name: CONTAINERD_RUNTIME_CLASS + value: nvidia + - name: CONTAINERD_SET_AS_DEFAULT + value: 'true' + image: container-toolkit + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + installDir: /usr/local/nvidia + repository: nvcr.io/nvidia/k8s + resources: {} + version: v1.17.3-ubuntu20.04 +validator: + args: [] + env: [] + image: gpu-operator-validator + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + plugin: + env: + - name: WITH_WORKLOAD + value: 'false' + repository: nvcr.io/nvidia/cloud-native + resources: {} +vfioManager: + driverManager: + env: + - name: ENABLE_GPU_POD_EVICTION + value: 'false' + - name: ENABLE_AUTO_DRAIN + value: 'false' + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.7.0 + enabled: true + env: [] + image: cuda + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + repository: nvcr.io/nvidia + resources: {} + version: 12.6.3-base-ubi9 +vgpuDeviceManager: + config: + default: default + name: '' + enabled: true + env: [] + image: vgpu-device-manager + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + repository: nvcr.io/nvidia/cloud-native + version: v0.2.8 +vgpuManager: + driverManager: + env: + - name: ENABLE_GPU_POD_EVICTION + value: 'false' + - name: ENABLE_AUTO_DRAIN + value: 'false' + image: k8s-driver-manager + imagePullPolicy: IfNotPresent + repository: nvcr.io/nvidia/cloud-native + version: v0.7.0 + enabled: false + env: [] + image: vgpu-manager + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + repository: '' + resources: {} + version: ''