From a2b2bd17c5c0076dbc285ba3f69bc771127bd589 Mon Sep 17 00:00:00 2001 From: nomadics9 Date: Sun, 12 Jan 2025 04:03:04 +0300 Subject: [PATCH] added gpu-operator --- charts/gpu-operator/.helmignore | 22 + charts/gpu-operator/Chart.lock | 6 + charts/gpu-operator/Chart.yaml | 23 + .../charts/node-feature-discovery/.helmignore | 23 + .../charts/node-feature-discovery/Chart.yaml | 14 + .../charts/node-feature-discovery/README.md | 10 + .../crds/nfd-api-crds.yaml | 710 +++++ .../templates/_helpers.tpl | 107 + .../templates/cert-manager-certs.yaml | 80 + .../templates/cert-manager-issuer.yaml | 42 + .../templates/clusterrole.yaml | 133 + .../templates/clusterrolebinding.yaml | 52 + .../templates/master.yaml | 152 ++ .../templates/nfd-gc.yaml | 85 + .../templates/nfd-master-conf.yaml | 12 + .../templates/nfd-topologyupdater-conf.yaml | 12 + .../templates/nfd-worker-conf.yaml | 12 + .../templates/post-delete-job.yaml | 94 + .../templates/prometheus.yaml | 26 + .../templates/role.yaml | 24 + .../templates/rolebinding.yaml | 18 + .../templates/service.yaml | 20 + .../templates/serviceaccount.yaml | 58 + .../templates/topologyupdater-crds.yaml | 278 ++ .../templates/topologyupdater.yaml | 171 ++ .../templates/worker.yaml | 186 ++ .../charts/node-feature-discovery/values.yaml | 593 ++++ .../crds/nvidia.com_clusterpolicies.yaml | 2384 +++++++++++++++++ .../crds/nvidia.com_nvidiadrivers.yaml | 797 ++++++ charts/gpu-operator/templates/_helpers.tpl | 80 + .../gpu-operator/templates/cleanup_crd.yaml | 45 + .../gpu-operator/templates/clusterpolicy.yaml | 683 +++++ .../gpu-operator/templates/clusterrole.yaml | 146 + .../templates/clusterrolebinding.yaml | 18 + .../templates/dcgm_exporter_config.yaml | 14 + charts/gpu-operator/templates/mig_config.yaml | 10 + .../templates/nodefeaturerules.yaml | 107 + .../gpu-operator/templates/nvidiadriver.yaml | 119 + charts/gpu-operator/templates/operator.yaml | 99 + .../gpu-operator/templates/plugin_config.yaml | 11 + 
.../templates/readonlyfs_scc.openshift.yaml | 49 + charts/gpu-operator/templates/role.yaml | 84 + .../gpu-operator/templates/rolebinding.yaml | 15 + .../templates/serviceaccount.yaml | 7 + .../gpu-operator/templates/upgrade_crd.yaml | 95 + charts/gpu-operator/values.yaml | 15 + charts/gpu-operator/values.yaml.bk | 602 +++++ resources/gpu-slice/configmap.yaml | 15 + 48 files changed, 8358 insertions(+) create mode 100644 charts/gpu-operator/.helmignore create mode 100644 charts/gpu-operator/Chart.lock create mode 100644 charts/gpu-operator/Chart.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/.helmignore create mode 100644 charts/gpu-operator/charts/node-feature-discovery/Chart.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/README.md create mode 100644 charts/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/_helpers.tpl create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/cert-manager-certs.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/cert-manager-issuer.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/master.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/nfd-master-conf.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/nfd-worker-conf.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml create mode 100644 
charts/gpu-operator/charts/node-feature-discovery/templates/prometheus.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/role.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/rolebinding.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/service.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/topologyupdater-crds.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/topologyupdater.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/templates/worker.yaml create mode 100644 charts/gpu-operator/charts/node-feature-discovery/values.yaml create mode 100644 charts/gpu-operator/crds/nvidia.com_clusterpolicies.yaml create mode 100644 charts/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml create mode 100644 charts/gpu-operator/templates/_helpers.tpl create mode 100644 charts/gpu-operator/templates/cleanup_crd.yaml create mode 100644 charts/gpu-operator/templates/clusterpolicy.yaml create mode 100644 charts/gpu-operator/templates/clusterrole.yaml create mode 100644 charts/gpu-operator/templates/clusterrolebinding.yaml create mode 100644 charts/gpu-operator/templates/dcgm_exporter_config.yaml create mode 100644 charts/gpu-operator/templates/mig_config.yaml create mode 100644 charts/gpu-operator/templates/nodefeaturerules.yaml create mode 100644 charts/gpu-operator/templates/nvidiadriver.yaml create mode 100644 charts/gpu-operator/templates/operator.yaml create mode 100644 charts/gpu-operator/templates/plugin_config.yaml create mode 100644 charts/gpu-operator/templates/readonlyfs_scc.openshift.yaml create mode 100644 charts/gpu-operator/templates/role.yaml create mode 100644 charts/gpu-operator/templates/rolebinding.yaml create mode 100644 charts/gpu-operator/templates/serviceaccount.yaml 
create mode 100644 charts/gpu-operator/templates/upgrade_crd.yaml create mode 100644 charts/gpu-operator/values.yaml create mode 100644 charts/gpu-operator/values.yaml.bk create mode 100644 resources/gpu-slice/configmap.yaml diff --git a/charts/gpu-operator/.helmignore b/charts/gpu-operator/.helmignore new file mode 100644 index 0000000..50af031 --- /dev/null +++ b/charts/gpu-operator/.helmignore @@ -0,0 +1,22 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/gpu-operator/Chart.lock b/charts/gpu-operator/Chart.lock new file mode 100644 index 0000000..5d1a7d3 --- /dev/null +++ b/charts/gpu-operator/Chart.lock @@ -0,0 +1,6 @@ +dependencies: +- name: node-feature-discovery + repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts + version: 0.16.6 +digest: sha256:e7b02cbdf9daff49892c0b74c50da2ed11e18eff2105a1b1abc9a8f2ebd8be47 +generated: "2024-10-31T07:12:50.141904-07:00" diff --git a/charts/gpu-operator/Chart.yaml b/charts/gpu-operator/Chart.yaml new file mode 100644 index 0000000..cceaa6e --- /dev/null +++ b/charts/gpu-operator/Chart.yaml @@ -0,0 +1,23 @@ +apiVersion: v2 +appVersion: v24.9.1 +dependencies: +- condition: nfd.enabled + name: node-feature-discovery + repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts + version: v0.16.6 +description: NVIDIA GPU Operator creates/configures/manages GPUs atop Kubernetes +home: https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/overview.html +icon: https://assets.nvidiagrid.net/ngc/logos/GPUoperator.png +keywords: +- gpu +- cuda +- compute +- operator +- deep learning +- monitoring +- tesla +kubeVersion: '>= 1.16.0-0' +name: gpu-operator 
+sources: +- https://github.com/NVIDIA/gpu-operator +version: v24.9.1 diff --git a/charts/gpu-operator/charts/node-feature-discovery/.helmignore b/charts/gpu-operator/charts/node-feature-discovery/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/gpu-operator/charts/node-feature-discovery/Chart.yaml b/charts/gpu-operator/charts/node-feature-discovery/Chart.yaml new file mode 100644 index 0000000..7656c73 --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/Chart.yaml @@ -0,0 +1,14 @@ +apiVersion: v2 +appVersion: v0.16.6 +description: 'Detects hardware features available on each node in a Kubernetes cluster, + and advertises those features using node labels. ' +home: https://github.com/kubernetes-sigs/node-feature-discovery +keywords: +- feature-discovery +- feature-detection +- node-labels +name: node-feature-discovery +sources: +- https://github.com/kubernetes-sigs/node-feature-discovery +type: application +version: 0.16.6 diff --git a/charts/gpu-operator/charts/node-feature-discovery/README.md b/charts/gpu-operator/charts/node-feature-discovery/README.md new file mode 100644 index 0000000..93734f8 --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/README.md @@ -0,0 +1,10 @@ +# Node Feature Discovery + +Node Feature Discovery (NFD) is a Kubernetes add-on for detecting hardware +features and system configuration. Detected features are advertised as node +labels. 
NFD provides flexible configuration and extension points for a wide +range of vendor and application specific node labeling needs. + +See +[NFD documentation](https://kubernetes-sigs.github.io/node-feature-discovery/v0.16/deployment/helm.html) +for deployment instructions. diff --git a/charts/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml b/charts/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml new file mode 100644 index 0000000..0a73c5d --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml @@ -0,0 +1,710 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: nodefeatures.nfd.k8s-sigs.io +spec: + group: nfd.k8s-sigs.io + names: + kind: NodeFeature + listKind: NodeFeatureList + plural: nodefeatures + singular: nodefeature + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + NodeFeature resource holds the features discovered for one node in the + cluster. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Specification of the NodeFeature, containing features discovered + for a node. 
+ properties: + features: + description: Features is the full "raw" features data that has been + discovered. + properties: + attributes: + additionalProperties: + description: AttributeFeatureSet is a set of features having + string value. + properties: + elements: + additionalProperties: + type: string + description: Individual features of the feature set. + type: object + required: + - elements + type: object + description: Attributes contains all the attribute-type features + of the node. + type: object + flags: + additionalProperties: + description: FlagFeatureSet is a set of simple features only + containing names without values. + properties: + elements: + additionalProperties: + description: Nil is a dummy empty struct for protobuf + compatibility + type: object + description: Individual features of the feature set. + type: object + required: + - elements + type: object + description: Flags contains all the flag-type features of the + node. + type: object + instances: + additionalProperties: + description: InstanceFeatureSet is a set of features each of + which is an instance having multiple attributes. + properties: + elements: + description: Individual features of the feature set. + items: + description: InstanceFeature represents one instance of + a complex features, e.g. a device. + properties: + attributes: + additionalProperties: + type: string + description: Attributes of the instance feature. + type: object + required: + - attributes + type: object + type: array + required: + - elements + type: object + description: Instances contains all the instance-type features + of the node. + type: object + type: object + labels: + additionalProperties: + type: string + description: Labels is the set of node labels that are requested to + be created. 
+ type: object + type: object + required: + - spec + type: object + served: true + storage: true +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: nodefeaturegroups.nfd.k8s-sigs.io +spec: + group: nfd.k8s-sigs.io + names: + kind: NodeFeatureGroup + listKind: NodeFeatureGroupList + plural: nodefeaturegroups + shortNames: + - nfg + singular: nodefeaturegroup + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: NodeFeatureGroup resource holds Node pools by featureGroup + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Spec defines the rules to be evaluated. + properties: + featureGroupRules: + description: List of rules to evaluate to determine nodes that belong + in this group. + items: + description: GroupRule defines a rule for nodegroup filtering. + properties: + matchAny: + description: MatchAny specifies a list of matchers one of which + must match. + items: + description: MatchAnyElem specifies one sub-matcher of MatchAny. + properties: + matchFeatures: + description: MatchFeatures specifies a set of matcher + terms all of which must match. 
+ items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature + set to match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. + type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. 
Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + required: + - matchFeatures + type: object + type: array + matchFeatures: + description: MatchFeatures specifies a set of matcher terms + all of which must match. + items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature set to + match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. 
+ type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + name: + description: Name of the rule. + type: string + required: + - name + type: object + type: array + required: + - featureGroupRules + type: object + status: + description: |- + Status of the NodeFeatureGroup after the most recent evaluation of the + specification. + properties: + nodes: + description: Nodes is a list of FeatureGroupNode in the cluster that + match the featureGroupRules + items: + properties: + name: + description: Name of the node. 
+ type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: nodefeaturerules.nfd.k8s-sigs.io +spec: + group: nfd.k8s-sigs.io + names: + kind: NodeFeatureRule + listKind: NodeFeatureRuleList + plural: nodefeaturerules + shortNames: + - nfr + singular: nodefeaturerule + scope: Cluster + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + NodeFeatureRule resource specifies a configuration for feature-based + customization of node objects, such as node labeling. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Spec defines the rules to be evaluated. + properties: + rules: + description: Rules is a list of node customization rules. + items: + description: Rule defines a rule for node customization such as + labeling. + properties: + annotations: + additionalProperties: + type: string + description: Annotations to create if the rule matches. 
+ type: object + extendedResources: + additionalProperties: + type: string + description: ExtendedResources to create if the rule matches. + type: object + labels: + additionalProperties: + type: string + description: Labels to create if the rule matches. + type: object + labelsTemplate: + description: |- + LabelsTemplate specifies a template to expand for dynamically generating + multiple labels. Data (after template expansion) must be keys with an + optional value ([=]) separated by newlines. + type: string + matchAny: + description: MatchAny specifies a list of matchers one of which + must match. + items: + description: MatchAnyElem specifies one sub-matcher of MatchAny. + properties: + matchFeatures: + description: MatchFeatures specifies a set of matcher + terms all of which must match. + items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature + set to match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. 
+ items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. + type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + required: + - matchFeatures + type: object + type: array + matchFeatures: + description: MatchFeatures specifies a set of matcher terms + all of which must match. + items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature set to + match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. 
+ enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. + type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + name: + description: Name of the rule. + type: string + taints: + description: Taints to create if the rule matches. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. 
+ type: string + key: + description: Required. The taint key to be applied to + a node. + type: string + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint + key. + type: string + required: + - effect + - key + type: object + type: array + vars: + additionalProperties: + type: string + description: |- + Vars is the variables to store if the rule matches. Variables do not + directly inflict any changes in the node object. However, they can be + referenced from other rules enabling more complex rule hierarchies, + without exposing intermediary output values as labels. + type: object + varsTemplate: + description: |- + VarsTemplate specifies a template to expand for dynamically generating + multiple variables. Data (after template expansion) must be keys with an + optional value ([=]) separated by newlines. + type: string + required: + - name + type: object + type: array + required: + - rules + type: object + required: + - spec + type: object + served: true + storage: true diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/_helpers.tpl b/charts/gpu-operator/charts/node-feature-discovery/templates/_helpers.tpl new file mode 100644 index 0000000..928ece7 --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/_helpers.tpl @@ -0,0 +1,107 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "node-feature-discovery.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "node-feature-discovery.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Allow the release namespace to be overridden for multi-namespace deployments in combined charts +*/}} +{{- define "node-feature-discovery.namespace" -}} + {{- if .Values.namespaceOverride -}} + {{- .Values.namespaceOverride -}} + {{- else -}} + {{- .Release.Namespace -}} + {{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "node-feature-discovery.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Common labels +*/}} +{{- define "node-feature-discovery.labels" -}} +helm.sh/chart: {{ include "node-feature-discovery.chart" . }} +{{ include "node-feature-discovery.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end -}} + +{{/* +Selector labels +*/}} +{{- define "node-feature-discovery.selectorLabels" -}} +app.kubernetes.io/name: {{ include "node-feature-discovery.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} + +{{/* +Create the name of the service account which the nfd master will use +*/}} +{{- define "node-feature-discovery.master.serviceAccountName" -}} +{{- if .Values.master.serviceAccount.create -}} + {{ default (include "node-feature-discovery.fullname" .) 
.Values.master.serviceAccount.name }} +{{- else -}} + {{ default "default" .Values.master.serviceAccount.name }} +{{- end -}} +{{- end -}} + +{{/* +Create the name of the service account which the nfd worker will use: .Values.worker.serviceAccount.name when set; otherwise the chart fullname suffixed with "-worker" if serviceAccount.create is true, or "default" if it is not. +*/}} +{{- define "node-feature-discovery.worker.serviceAccountName" -}} +{{- if .Values.worker.serviceAccount.create -}} + {{ default (printf "%s-worker" (include "node-feature-discovery.fullname" .)) .Values.worker.serviceAccount.name }} +{{- else -}} + {{ default "default" .Values.worker.serviceAccount.name }} +{{- end -}} +{{- end -}} + +{{/* +Create the name of the service account which topologyUpdater will use: .Values.topologyUpdater.serviceAccount.name when set; otherwise the chart fullname suffixed with "-topology-updater" if serviceAccount.create is true, or "default" if it is not. +*/}} +{{- define "node-feature-discovery.topologyUpdater.serviceAccountName" -}} +{{- if .Values.topologyUpdater.serviceAccount.create -}} + {{ default (printf "%s-topology-updater" (include "node-feature-discovery.fullname" .)) .Values.topologyUpdater.serviceAccount.name }} +{{- else -}} + {{ default "default" .Values.topologyUpdater.serviceAccount.name }} +{{- end -}} +{{- end -}} + +{{/* +Create the name of the service account which nfd-gc will use: .Values.gc.serviceAccount.name when set; otherwise the chart fullname suffixed with "-gc" if serviceAccount.create is true, or "default" if it is not. +*/}} +{{- define "node-feature-discovery.gc.serviceAccountName" -}} +{{- if .Values.gc.serviceAccount.create -}} + {{ default (printf "%s-gc" (include "node-feature-discovery.fullname" .)) .Values.gc.serviceAccount.name }} +{{- else -}} + {{ default "default" .Values.gc.serviceAccount.name }} +{{- end -}} +{{- end -}} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/cert-manager-certs.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/cert-manager-certs.yaml new file mode 100644 index 0000000..2d15760 --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/cert-manager-certs.yaml @@ -0,0 +1,80 @@ +{{- if .Values.tls.certManager }} +{{- if .Values.master.enable }} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nfd-master-cert + namespace: {{ include "node-feature-discovery.namespace" .
}} +spec: + secretName: nfd-master-cert + subject: + organizations: + - node-feature-discovery + commonName: nfd-master + dnsNames: + # must match the name of the nfd-master Service (chart fullname suffixed with "-master") + - {{ include "node-feature-discovery.fullname" . }}-master + # first one is configured for use by the worker; below are for completeness + - {{ include "node-feature-discovery.fullname" . }}-master.{{ include "node-feature-discovery.namespace" . }}.svc + - {{ include "node-feature-discovery.fullname" . }}-master.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local + issuerRef: + name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }} + {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }} + kind: {{ .Values.tls.certManagerCertificate.issuerKind }} + {{- else }} + kind: Issuer + {{- end }} + group: cert-manager.io +{{- end }} +{{- if .Values.worker.enable }} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nfd-worker-cert + namespace: {{ include "node-feature-discovery.namespace" . }} +spec: + secretName: nfd-worker-cert + subject: + organizations: + - node-feature-discovery + commonName: nfd-worker + dnsNames: + - {{ include "node-feature-discovery.fullname" . }}-worker.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local + issuerRef: + name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }} + {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }} + kind: {{ .Values.tls.certManagerCertificate.issuerKind }} + {{- else }} + kind: Issuer + {{- end }} + group: cert-manager.io +{{- end }} + +{{- if .Values.topologyUpdater.enable }} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nfd-topology-updater-cert + namespace: {{ include "node-feature-discovery.namespace" .
}} +spec: + secretName: nfd-topology-updater-cert + subject: + organizations: + - node-feature-discovery + commonName: nfd-topology-updater + dnsNames: + - {{ include "node-feature-discovery.fullname" . }}-topology-updater.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local + issuerRef: + name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }} + {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }} + kind: {{ .Values.tls.certManagerCertificate.issuerKind }} + {{- else }} + kind: Issuer + {{- end }} + group: cert-manager.io +{{- end }} + +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/cert-manager-issuer.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/cert-manager-issuer.yaml new file mode 100644 index 0000000..8744689 --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/cert-manager-issuer.yaml @@ -0,0 +1,42 @@ +{{- if and .Values.tls.certManager (not .Values.tls.certManagerCertificate.issuerName ) }} +# Rendered only when cert-manager is enabled and no external issuer name is configured. See https://cert-manager.io/docs/configuration/selfsigned/#bootstrapping-ca-issuers +# - Create a self-signed issuer (nfd-ca-bootstrap) +# - Use it to issue the CA certificate (nfd-ca-cert) +# - Use that CA certificate's secret to back the CA issuer (nfd-ca-issuer) +--- +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: nfd-ca-bootstrap + namespace: {{ include "node-feature-discovery.namespace" . }} +spec: + selfSigned: {} + +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nfd-ca-cert + namespace: {{ include "node-feature-discovery.namespace" . }} +spec: + isCA: true + secretName: nfd-ca-cert + subject: + organizations: + - node-feature-discovery + commonName: nfd-ca-cert + issuerRef: + name: nfd-ca-bootstrap + kind: Issuer + group: cert-manager.io + +--- +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: nfd-ca-issuer + namespace: {{ include "node-feature-discovery.namespace" .
}} +spec: + ca: + secretName: nfd-ca-cert +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml new file mode 100644 index 0000000..f935cfe --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml @@ -0,0 +1,133 @@ +{{- if and .Values.master.enable .Values.master.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "node-feature-discovery.fullname" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - nodes + - nodes/status + verbs: + - get + - patch + - update + - list +- apiGroups: + - nfd.k8s-sigs.io + resources: + - nodefeatures + - nodefeaturerules + - nodefeaturegroups + verbs: + - get + - list + - watch +- apiGroups: + - nfd.k8s-sigs.io + resources: + - nodefeaturegroups/status + verbs: + - patch + - update +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create +- apiGroups: + - coordination.k8s.io + resources: + - leases + resourceNames: + - "nfd-master.nfd.kubernetes.io" + verbs: + - get + - update +{{- end }} + +{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.rbac.create }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-topology-updater + labels: + {{- include "node-feature-discovery.labels" . 
| nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list +- apiGroups: + - "" + resources: + - namespaces + verbs: + - get +- apiGroups: + - "" + resources: + - nodes/proxy + verbs: + - get +- apiGroups: + - "" + resources: + - pods + verbs: + - get +- apiGroups: + - topology.node.k8s.io + resources: + - noderesourcetopologies + verbs: + - create + - get + - update +{{- end }} + +{{- if and .Values.gc.enable .Values.gc.rbac.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-gc + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - nodes + verbs: + - list + - watch +- apiGroups: + - "" + resources: + - nodes/proxy + verbs: + - get +- apiGroups: + - topology.node.k8s.io + resources: + - noderesourcetopologies + verbs: + - delete + - list +- apiGroups: + - nfd.k8s-sigs.io + resources: + - nodefeatures + verbs: + - delete + - list +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml new file mode 100644 index 0000000..3f71798 --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml @@ -0,0 +1,52 @@ +{{- if and .Values.master.enable .Values.master.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "node-feature-discovery.fullname" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "node-feature-discovery.fullname" . 
}} +subjects: +- kind: ServiceAccount + name: {{ include "node-feature-discovery.master.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . }} +{{- end }} + +{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.rbac.create }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-topology-updater + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "node-feature-discovery.fullname" . }}-topology-updater +subjects: +- kind: ServiceAccount + name: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . }} +{{- end }} + +{{- if and .Values.gc.enable .Values.gc.rbac.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-gc + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "node-feature-discovery.fullname" . }}-gc +subjects: +- kind: ServiceAccount + name: {{ include "node-feature-discovery.gc.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . }} +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/master.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/master.yaml new file mode 100644 index 0000000..733131a --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/master.yaml @@ -0,0 +1,152 @@ +{{- if .Values.master.enable }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "node-feature-discovery.fullname" . 
}}-master + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + role: master + {{- with .Values.master.deploymentAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + replicas: {{ .Values.master.replicaCount }} + revisionHistoryLimit: {{ .Values.master.revisionHistoryLimit }} + selector: + matchLabels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }} + role: master + template: + metadata: + labels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} + role: master + {{- with .Values.master.annotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "node-feature-discovery.master.serviceAccountName" . }} + enableServiceLinks: false + securityContext: + {{- toYaml .Values.master.podSecurityContext | nindent 8 }} + hostNetwork: {{ .Values.master.hostNetwork }} + containers: + - name: master + securityContext: + {{- toYaml .Values.master.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + livenessProbe: + {{- toYaml .Values.master.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.master.readinessProbe | nindent 12 }} + ports: + - containerPort: {{ .Values.master.port | default "8080" }} + name: grpc + - containerPort: {{ .Values.master.metricsPort | default "8081" }} + name: metrics + - containerPort: {{ .Values.master.healthPort | default "8082" }} + name: health + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + {{- with .Values.master.extraEnvs }} + {{- toYaml . 
| nindent 8 }} + {{- end}} + command: + - "nfd-master" + resources: + {{- toYaml .Values.master.resources | nindent 12 }} + args: + {{- if .Values.master.instance | empty | not }} + - "-instance={{ .Values.master.instance }}" + {{- end }} + {{- if not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) }} + - "-port={{ .Values.master.port | default "8080" }}" + {{- else if gt (int .Values.master.replicaCount) 1 }} + - "-enable-leader-election" + {{- end }} + {{- if .Values.master.extraLabelNs | empty | not }} + - "-extra-label-ns={{- join "," .Values.master.extraLabelNs }}" + {{- end }} + {{- if .Values.master.denyLabelNs | empty | not }} + - "-deny-label-ns={{- join "," .Values.master.denyLabelNs }}" + {{- end }} + {{- if .Values.master.resourceLabels | empty | not }} + - "-resource-labels={{- join "," .Values.master.resourceLabels }}" + {{- end }} + {{- if .Values.master.enableTaints }} + - "-enable-taints" + {{- end }} + {{- if .Values.master.crdController | kindIs "invalid" | not }} + - "-crd-controller={{ .Values.master.crdController }}" + {{- else }} + ## crdController not set: enable the CRD controller only for the default (unnamed) instance, i.e. disable it whenever master.instance is set + - "-crd-controller={{ .Values.master.instance | empty }}" + {{- end }} + {{- if .Values.master.featureRulesController | kindIs "invalid" | not }} + - "-featurerules-controller={{ .Values.master.featureRulesController }}" + {{- end }} + {{- if .Values.master.resyncPeriod }} + - "-resync-period={{ .Values.master.resyncPeriod }}" + {{- end }} + {{- if .Values.master.nfdApiParallelism | empty | not }} + - "-nfd-api-parallelism={{ .Values.master.nfdApiParallelism }}" + {{- end }} + {{- if .Values.tls.enable }} + - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt" + - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" + - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" + {{- end }} + # Emit one -feature-gates=KEY=VALUE flag per entry in .Values.featureGates + {{- range $key, $value :=
.Values.featureGates }} + - "-feature-gates={{ $key }}={{ $value }}" + {{- end }} + - "-metrics={{ .Values.master.metricsPort | default "8081" }}" + - "-grpc-health={{ .Values.master.healthPort | default "8082" }}" + volumeMounts: + {{- if .Values.tls.enable }} + - name: nfd-master-cert + mountPath: "/etc/kubernetes/node-feature-discovery/certs" + readOnly: true + {{- end }} + - name: nfd-master-conf + mountPath: "/etc/kubernetes/node-feature-discovery" + readOnly: true + volumes: + {{- if .Values.tls.enable }} + - name: nfd-master-cert + secret: + secretName: nfd-master-cert + {{- end }} + - name: nfd-master-conf + configMap: + name: {{ include "node-feature-discovery.fullname" . }}-master-conf + items: + - key: nfd-master.conf + path: nfd-master.conf + {{- with .Values.master.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.master.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.master.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml new file mode 100644 index 0000000..375f938 --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml @@ -0,0 +1,85 @@ +{{- if and .Values.gc.enable (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) -}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-gc + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + role: gc + {{- with .Values.gc.deploymentAnnotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + replicas: {{ .Values.gc.replicaCount | default 1 }} + revisionHistoryLimit: {{ .Values.gc.revisionHistoryLimit }} + selector: + matchLabels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }} + role: gc + template: + metadata: + labels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} + role: gc + {{- with .Values.gc.annotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "node-feature-discovery.gc.serviceAccountName" . }} + dnsPolicy: ClusterFirstWithHostNet + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.gc.podSecurityContext | nindent 8 }} + hostNetwork: {{ .Values.gc.hostNetwork }} + containers: + - name: gc + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: "{{ .Values.image.pullPolicy }}" + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + {{- with .Values.gc.extraEnvs }} + {{- toYaml . | nindent 8 }} + {{- end}} + command: + - "nfd-gc" + args: + {{- if .Values.gc.interval | empty | not }} + - "-gc-interval={{ .Values.gc.interval }}" + {{- end }} + resources: + {{- toYaml .Values.gc.resources | nindent 12 }} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: [ "ALL" ] + readOnlyRootFilesystem: true + runAsNonRoot: true + ports: + - name: metrics + containerPort: {{ .Values.gc.metricsPort | default "8081"}} + + {{- with .Values.gc.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.gc.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.gc.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/nfd-master-conf.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/nfd-master-conf.yaml new file mode 100644 index 0000000..9c6e01c --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/nfd-master-conf.yaml @@ -0,0 +1,12 @@ +{{- if .Values.master.enable }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-master-conf + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +data: + nfd-master.conf: |- + {{- .Values.master.config | toYaml | nindent 4 }} +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml new file mode 100644 index 0000000..8d03aa2 --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml @@ -0,0 +1,12 @@ +{{- if .Values.topologyUpdater.enable -}} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-topology-updater-conf + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . 
| nindent 4 }} +data: + nfd-topology-updater.conf: |- + {{- .Values.topologyUpdater.config | toYaml | nindent 4 }} +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/nfd-worker-conf.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/nfd-worker-conf.yaml new file mode 100644 index 0000000..a2299de --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/nfd-worker-conf.yaml @@ -0,0 +1,12 @@ +{{- if .Values.worker.enable }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-worker-conf + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +data: + nfd-worker.conf: |- + {{- .Values.worker.config | toYaml | nindent 4 }} +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml new file mode 100644 index 0000000..4364f1a --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml @@ -0,0 +1,98 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-prune + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": post-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-prune + labels: + {{- include "node-feature-discovery.labels" .
| nindent 4 }} + annotations: + "helm.sh/hook": post-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +rules: +- apiGroups: + - "" + resources: + - nodes + - nodes/status + verbs: + - get + - patch + - update + - list +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-prune + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": post-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "node-feature-discovery.fullname" . }}-prune +subjects: +- kind: ServiceAccount + name: {{ include "node-feature-discovery.fullname" . }}-prune + namespace: {{ include "node-feature-discovery.namespace" . }} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-prune + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": post-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + template: + metadata: + labels: + {{- include "node-feature-discovery.labels" . | nindent 8 }} + role: prune + spec: + serviceAccountName: {{ include "node-feature-discovery.fullname" . }}-prune + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: nfd-master + securityContext: + {{- toYaml .Values.master.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: + - "nfd-master" + args: + - "-prune" + {{- if .Values.master.instance | empty | not }} + - "-instance={{ .Values.master.instance }}" + {{- end }} + restartPolicy: Never + {{- with .Values.master.nodeSelector }} + nodeSelector: + {{- toYaml .
| nindent 8 }} + {{- end }} + {{- with .Values.master.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.master.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/prometheus.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/prometheus.yaml new file mode 100644 index 0000000..3d680e2 --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/prometheus.yaml @@ -0,0 +1,26 @@ +{{- if .Values.prometheus.enable }} +# Prometheus PodMonitor: scrapes /metrics on the "metrics" port of pods in the chart namespace that carry this release's instance and name labels +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: {{ include "node-feature-discovery.fullname" . }} + labels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 4 }} + {{- with .Values.prometheus.labels }} + {{ toYaml . | nindent 4 }} + {{- end }} +spec: + podMetricsEndpoints: + - honorLabels: true + interval: {{ .Values.prometheus.scrapeInterval }} + path: /metrics + port: metrics + scheme: http + namespaceSelector: + matchNames: + - {{ include "node-feature-discovery.namespace" . }} + selector: + matchExpressions: + - {key: app.kubernetes.io/instance, operator: In, values: ["{{ .Release.Name }}"]} + - {key: app.kubernetes.io/name, operator: In, values: ["{{ include "node-feature-discovery.name" . }}"]} +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/role.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/role.yaml new file mode 100644 index 0000000..52c69eb --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/role.yaml @@ -0,0 +1,24 @@ +{{- if and .Values.worker.enable .Values.worker.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-worker + namespace: {{ include "node-feature-discovery.namespace" .
}} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +rules: +- apiGroups: + - nfd.k8s-sigs.io + resources: + - nodefeatures + verbs: + - create + - get + - update +- apiGroups: + - "" + resources: + - pods + verbs: + - get +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/rolebinding.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/rolebinding.yaml new file mode 100644 index 0000000..a640d5f --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/rolebinding.yaml @@ -0,0 +1,18 @@ +{{- if and .Values.worker.enable .Values.worker.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-worker + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "node-feature-discovery.fullname" . }}-worker +subjects: +- kind: ServiceAccount + name: {{ include "node-feature-discovery.worker.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . }} +{{- end }} + diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/service.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/service.yaml new file mode 100644 index 0000000..7191dca --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/service.yaml @@ -0,0 +1,20 @@ +{{- if and (not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi)) .Values.master.enable }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-master + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . 
| nindent 4 }} + role: master +spec: + type: {{ .Values.master.service.type }} + ports: + - port: {{ .Values.master.service.port | default "8080" }} + targetPort: grpc + protocol: TCP + name: grpc + selector: + {{- include "node-feature-discovery.selectorLabels" . | nindent 4 }} + role: master +{{- end}} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml new file mode 100644 index 0000000..59edc5e --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml @@ -0,0 +1,58 @@ +{{- if and .Values.master.enable .Values.master.serviceAccount.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "node-feature-discovery.master.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + {{- with .Values.master.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} + +{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.serviceAccount.create }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + {{- with .Values.topologyUpdater.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} + +{{- if and .Values.gc.enable .Values.gc.serviceAccount.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "node-feature-discovery.gc.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . 
}} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + {{- with .Values.gc.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} + +{{- if and .Values.worker.enable .Values.worker.serviceAccount.create }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "node-feature-discovery.worker.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + {{- with .Values.worker.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/topologyupdater-crds.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/topologyupdater-crds.yaml new file mode 100644 index 0000000..b6b9196 --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/topologyupdater-crds.yaml @@ -0,0 +1,278 @@ +{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.createCRDs -}} +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + api-approved.kubernetes.io: https://github.com/kubernetes/enhancements/pull/1870 + controller-gen.kubebuilder.io/version: v0.11.2 + creationTimestamp: null + name: noderesourcetopologies.topology.node.k8s.io +spec: + group: topology.node.k8s.io + names: + kind: NodeResourceTopology + listKind: NodeResourceTopologyList + plural: noderesourcetopologies + shortNames: + - node-res-topo + singular: noderesourcetopology + scope: Cluster + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: NodeResourceTopology describes node resources and their topology. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. 
Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + topologyPolicies: + items: + type: string + type: array + zones: + description: ZoneList contains an array of Zone objects. + items: + description: Zone represents a resource topology zone, e.g. socket, + node, die or core. + properties: + attributes: + description: AttributeList contains an array of AttributeInfo objects. + items: + description: AttributeInfo contains one attribute of a Zone. + properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + type: array + costs: + description: CostList contains an array of CostInfo objects. + items: + description: CostInfo describes the cost (or distance) between + two Zones. + properties: + name: + type: string + value: + format: int64 + type: integer + required: + - name + - value + type: object + type: array + name: + type: string + parent: + type: string + resources: + description: ResourceInfoList contains an array of ResourceInfo + objects. + items: + description: ResourceInfo contains information about one resource + type. + properties: + allocatable: + anyOf: + - type: integer + - type: string + description: Allocatable quantity of the resource, corresponding + to allocatable in node status, i.e. total amount of this + resource available to be used by pods. 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + available: + anyOf: + - type: integer + - type: string + description: Available is the amount of this resource currently + available for new (to be scheduled) pods, i.e. Allocatable + minus the resources reserved by currently running pods. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + capacity: + anyOf: + - type: integer + - type: string + description: Capacity of the resource, corresponding to capacity + in node status, i.e. total amount of this resource that + the node has. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + name: + description: Name of the resource. + type: string + required: + - allocatable + - available + - capacity + - name + type: object + type: array + type: + type: string + required: + - name + - type + type: object + type: array + required: + - topologyPolicies + - zones + type: object + served: true + storage: false + - name: v1alpha2 + schema: + openAPIV3Schema: + description: NodeResourceTopology describes node resources and their topology. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + attributes: + description: AttributeList contains an array of AttributeInfo objects. + items: + description: AttributeInfo contains one attribute of a Zone. 
+ properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + type: array + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + topologyPolicies: + description: 'DEPRECATED (to be removed in v1beta1): use top level attributes + if needed' + items: + type: string + type: array + zones: + description: ZoneList contains an array of Zone objects. + items: + description: Zone represents a resource topology zone, e.g. socket, + node, die or core. + properties: + attributes: + description: AttributeList contains an array of AttributeInfo objects. + items: + description: AttributeInfo contains one attribute of a Zone. + properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + type: array + costs: + description: CostList contains an array of CostInfo objects. + items: + description: CostInfo describes the cost (or distance) between + two Zones. + properties: + name: + type: string + value: + format: int64 + type: integer + required: + - name + - value + type: object + type: array + name: + type: string + parent: + type: string + resources: + description: ResourceInfoList contains an array of ResourceInfo + objects. + items: + description: ResourceInfo contains information about one resource + type. + properties: + allocatable: + anyOf: + - type: integer + - type: string + description: Allocatable quantity of the resource, corresponding + to allocatable in node status, i.e. total amount of this + resource available to be used by pods. 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + available: + anyOf: + - type: integer + - type: string + description: Available is the amount of this resource currently + available for new (to be scheduled) pods, i.e. Allocatable + minus the resources reserved by currently running pods. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + capacity: + anyOf: + - type: integer + - type: string + description: Capacity of the resource, corresponding to capacity + in node status, i.e. total amount of this resource that + the node has. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + name: + description: Name of the resource. + type: string + required: + - allocatable + - available + - capacity + - name + type: object + type: array + type: + type: string + required: + - name + - type + type: object + type: array + required: + - zones + type: object + served: true + storage: true +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/topologyupdater.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/topologyupdater.yaml new file mode 100644 index 0000000..ba0214c --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/topologyupdater.yaml @@ -0,0 +1,171 @@ +{{- if .Values.topologyUpdater.enable -}} +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-topology-updater + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . 
| nindent 4 }} + role: topology-updater + {{- with .Values.topologyUpdater.daemonsetAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + revisionHistoryLimit: {{ .Values.topologyUpdater.revisionHistoryLimit }} + selector: + matchLabels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }} + role: topology-updater + template: + metadata: + labels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} + role: topology-updater + {{- with .Values.topologyUpdater.annotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . }} + dnsPolicy: ClusterFirstWithHostNet + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.topologyUpdater.podSecurityContext | nindent 8 }} + hostNetwork: {{ .Values.topologyUpdater.hostNetwork }} + containers: + - name: topology-updater + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: "{{ .Values.image.pullPolicy }}" + livenessProbe: + {{- toYaml .Values.topologyUpdater.livenessProbe | nindent 10 }} + readinessProbe: + {{- toYaml .Values.topologyUpdater.readinessProbe | nindent 10 }} + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NODE_ADDRESS + valueFrom: + fieldRef: + fieldPath: status.hostIP + {{- with .Values.topologyUpdater.extraEnvs }} + {{- toYaml . 
| nindent 8 }} + {{- end}} + command: + - "nfd-topology-updater" + args: + - "-podresources-socket=/host-var/lib/kubelet-podresources/kubelet.sock" + {{- if .Values.topologyUpdater.updateInterval | empty | not }} + - "-sleep-interval={{ .Values.topologyUpdater.updateInterval }}" + {{- else }} + - "-sleep-interval=3s" + {{- end }} + {{- if .Values.topologyUpdater.watchNamespace | empty | not }} + - "-watch-namespace={{ .Values.topologyUpdater.watchNamespace }}" + {{- else }} + - "-watch-namespace=*" + {{- end }} + {{- if .Values.tls.enable }} + - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt" + - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" + - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" + {{- end }} + {{- if not .Values.topologyUpdater.podSetFingerprint }} + - "-pods-fingerprint=false" + {{- end }} + {{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }} + - "-kubelet-config-uri=file:///host-var/kubelet-config" + {{- end }} + {{- if .Values.topologyUpdater.kubeletStateDir | empty }} + # Disable kubelet state tracking by giving an empty path + - "-kubelet-state-dir=" + {{- end }} + - -metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}} + - "-grpc-health={{ .Values.topologyUpdater.healthPort | default "8082" }}" + ports: + - containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}} + name: metrics + - containerPort: {{ .Values.topologyUpdater.healthPort | default "8082" }} + name: health + volumeMounts: + {{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }} + - name: kubelet-config + mountPath: /host-var/kubelet-config + {{- end }} + - name: kubelet-podresources-sock + mountPath: /host-var/lib/kubelet-podresources/kubelet.sock + - name: host-sys + mountPath: /host-sys + {{- if .Values.topologyUpdater.kubeletStateDir | empty | not }} + - name: kubelet-state-files + mountPath: /host-var/lib/kubelet + readOnly: true + {{- end }} + {{- if .Values.tls.enable }} + 
- name: nfd-topology-updater-cert + mountPath: "/etc/kubernetes/node-feature-discovery/certs" + readOnly: true + {{- end }} + - name: nfd-topology-updater-conf + mountPath: "/etc/kubernetes/node-feature-discovery" + readOnly: true + + resources: + {{- toYaml .Values.topologyUpdater.resources | nindent 12 }} + securityContext: + {{- toYaml .Values.topologyUpdater.securityContext | nindent 12 }} + volumes: + - name: host-sys + hostPath: + path: "/sys" + {{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }} + - name: kubelet-config + hostPath: + path: {{ .Values.topologyUpdater.kubeletConfigPath }} + {{- end }} + - name: kubelet-podresources-sock + hostPath: + {{- if .Values.topologyUpdater.kubeletPodResourcesSockPath | empty | not }} + path: {{ .Values.topologyUpdater.kubeletPodResourcesSockPath }} + {{- else }} + path: /var/lib/kubelet/pod-resources/kubelet.sock + {{- end }} + {{- if .Values.topologyUpdater.kubeletStateDir | empty | not }} + - name: kubelet-state-files + hostPath: + path: {{ .Values.topologyUpdater.kubeletStateDir }} + {{- end }} + - name: nfd-topology-updater-conf + configMap: + name: {{ include "node-feature-discovery.fullname" . }}-topology-updater-conf + items: + - key: nfd-topology-updater.conf + path: nfd-topology-updater.conf + {{- if .Values.tls.enable }} + - name: nfd-topology-updater-cert + secret: + secretName: nfd-topology-updater-cert + {{- end }} + + + {{- with .Values.topologyUpdater.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.topologyUpdater.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.topologyUpdater.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/templates/worker.yaml b/charts/gpu-operator/charts/node-feature-discovery/templates/worker.yaml new file mode 100644 index 0000000..755466c --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/templates/worker.yaml @@ -0,0 +1,186 @@ +{{- if .Values.worker.enable }} +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-worker + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + role: worker + {{- with .Values.worker.daemonsetAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + revisionHistoryLimit: {{ .Values.worker.revisionHistoryLimit }} + selector: + matchLabels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }} + role: worker + template: + metadata: + labels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} + role: worker + {{- with .Values.worker.annotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + dnsPolicy: ClusterFirstWithHostNet + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "node-feature-discovery.worker.serviceAccountName" . 
}} + securityContext: + {{- toYaml .Values.worker.podSecurityContext | nindent 8 }} + hostNetwork: {{ .Values.worker.hostNetwork }} + containers: + - name: worker + securityContext: + {{- toYaml .Values.worker.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + livenessProbe: + {{- toYaml .Values.worker.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.worker.readinessProbe | nindent 12 }} + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + {{- with .Values.worker.extraEnvs }} + {{- toYaml . | nindent 8 }} + {{- end}} + resources: + {{- toYaml .Values.worker.resources | nindent 12 }} + command: + - "nfd-worker" + args: +{{- if not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) }} + - "-server={{ include "node-feature-discovery.fullname" . 
}}-master:{{ .Values.master.service.port }}" +{{- end }} +{{- if .Values.tls.enable }} + - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt" + - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" + - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" +{{- end }} +# Go over featureGate and add the feature-gate flag +{{- range $key, $value := .Values.featureGates }} + - "-feature-gates={{ $key }}={{ $value }}" +{{- end }} + - "-metrics={{ .Values.worker.metricsPort | default "8081"}}" + - "-grpc-health={{ .Values.worker.healthPort | default "8082" }}" + ports: + - containerPort: {{ .Values.worker.metricsPort | default "8081"}} + name: metrics + - containerPort: {{ .Values.worker.healthPort | default "8082" }} + name: health + volumeMounts: + - name: host-boot + mountPath: "/host-boot" + readOnly: true + - name: host-os-release + mountPath: "/host-etc/os-release" + readOnly: true + - name: host-sys + mountPath: "/host-sys" + readOnly: true + - name: host-usr-lib + mountPath: "/host-usr/lib" + readOnly: true + - name: host-lib + mountPath: "/host-lib" + readOnly: true + - name: host-proc-swaps + mountPath: "/host-proc/swaps" + readOnly: true + {{- if .Values.worker.mountUsrSrc }} + - name: host-usr-src + mountPath: "/host-usr/src" + readOnly: true + {{- end }} + - name: source-d + mountPath: "/etc/kubernetes/node-feature-discovery/source.d/" + readOnly: true + - name: features-d + mountPath: "/etc/kubernetes/node-feature-discovery/features.d/" + readOnly: true + - name: nfd-worker-conf + mountPath: "/etc/kubernetes/node-feature-discovery" + readOnly: true +{{- if .Values.tls.enable }} + - name: nfd-worker-cert + mountPath: "/etc/kubernetes/node-feature-discovery/certs" + readOnly: true +{{- end }} + volumes: + - name: host-boot + hostPath: + path: "/boot" + - name: host-os-release + hostPath: + path: "/etc/os-release" + - name: host-sys + hostPath: + path: "/sys" + - name: host-usr-lib + hostPath: + path: "/usr/lib" + - name: 
host-lib + hostPath: + path: "/lib" + - name: host-proc-swaps + hostPath: + path: "/proc/swaps" + {{- if .Values.worker.mountUsrSrc }} + - name: host-usr-src + hostPath: + path: "/usr/src" + {{- end }} + - name: source-d + hostPath: + path: "/etc/kubernetes/node-feature-discovery/source.d/" + - name: features-d + hostPath: + path: "/etc/kubernetes/node-feature-discovery/features.d/" + - name: nfd-worker-conf + configMap: + name: {{ include "node-feature-discovery.fullname" . }}-worker-conf + items: + - key: nfd-worker.conf + path: nfd-worker.conf +{{- if .Values.tls.enable }} + - name: nfd-worker-cert + secret: + secretName: nfd-worker-cert +{{- end }} + {{- with .Values.worker.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.worker.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.worker.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.worker.priorityClassName }} + priorityClassName: {{ . 
| quote }} + {{- end }} +{{- end }} diff --git a/charts/gpu-operator/charts/node-feature-discovery/values.yaml b/charts/gpu-operator/charts/node-feature-discovery/values.yaml new file mode 100644 index 0000000..2d24983 --- /dev/null +++ b/charts/gpu-operator/charts/node-feature-discovery/values.yaml @@ -0,0 +1,593 @@ +image: + repository: registry.k8s.io/nfd/node-feature-discovery + # This should be set to 'IfNotPresent' for released version + pullPolicy: IfNotPresent + # tag, if defined will use the given image tag, else Chart.AppVersion will be used + # tag +imagePullSecrets: [] + +nameOverride: "" +fullnameOverride: "" +namespaceOverride: "" + +enableNodeFeatureApi: true + +featureGates: + NodeFeatureAPI: true + NodeFeatureGroupAPI: false + +priorityClassName: "" + +master: + enable: true + extraEnvs: [] + hostNetwork: false + config: ### + # noPublish: false + # autoDefaultNs: true + # extraLabelNs: ["added.ns.io","added.kubernets.io"] + # denyLabelNs: ["denied.ns.io","denied.kubernetes.io"] + # resourceLabels: ["vendor-1.com/feature-1","vendor-2.io/feature-2"] + # enableTaints: false + # labelWhiteList: "foo" + # resyncPeriod: "2h" + # klog: + # addDirHeader: false + # alsologtostderr: false + # logBacktraceAt: + # logtostderr: true + # skipHeaders: false + # stderrthreshold: 2 + # v: 0 + # vmodule: + ## NOTE: the following options are not dynamically run-time configurable + ## and require a nfd-master restart to take effect after being changed + # logDir: + # logFile: + # logFileMaxSize: 1800 + # skipLogHeaders: false + # leaderElection: + # leaseDuration: 15s + # # this value has to be lower than leaseDuration and greater than retryPeriod*1.2 + # renewDeadline: 10s + # # this value has to be greater than 0 + # retryPeriod: 2s + # nfdApiParallelism: 10 + ### + # The TCP port that nfd-master listens for incoming requests. 
Default: 8080 + # Deprecated this parameter is related to the deprecated gRPC API and will + # be removed with it in a future release + port: 8080 + metricsPort: 8081 + healthPort: 8082 + instance: + featureApi: + resyncPeriod: + denyLabelNs: [] + extraLabelNs: [] + resourceLabels: [] + enableTaints: false + crdController: null + featureRulesController: null + nfdApiParallelism: null + deploymentAnnotations: {} + replicaCount: 1 + + podSecurityContext: {} + # fsGroup: 2000 + + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: [ "ALL" ] + readOnlyRootFilesystem: true + runAsNonRoot: true + # runAsUser: 1000 + + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: + + # specify how many old ReplicaSets for the Deployment to retain. + revisionHistoryLimit: + + rbac: + create: true + + service: + type: ClusterIP + port: 8080 + + resources: + limits: + memory: 4Gi + requests: + cpu: 100m + # You may want to use the same value for `requests.memory` and `limits.memory`. The “requests” value affects scheduling to accommodate pods on nodes. + # If there is a large difference between “requests” and “limits” and nodes experience memory pressure, the kernel may invoke + # the OOM Killer, even if the memory does not exceed the “limits” threshold. This can cause unexpected pod evictions. Memory + # cannot be compressed and once allocated to a pod, it can only be reclaimed by killing the pod. 
+ # Natan Yellin 22/09/2022 https://home.robusta.dev/blog/kubernetes-memory-limit + memory: 128Mi + + nodeSelector: {} + + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + + annotations: {} + + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/master" + operator: In + values: [""] + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: In + values: [""] + + livenessProbe: + grpc: + port: 8082 + initialDelaySeconds: 10 + # failureThreshold: 3 + # periodSeconds: 10 + readinessProbe: + grpc: + port: 8082 + initialDelaySeconds: 5 + failureThreshold: 10 + # periodSeconds: 10 + +worker: + enable: true + extraEnvs: [] + hostNetwork: false + config: ### + #core: + # labelWhiteList: + # noPublish: false + # sleepInterval: 60s + # featureSources: [all] + # labelSources: [all] + # klog: + # addDirHeader: false + # alsologtostderr: false + # logBacktraceAt: + # logtostderr: true + # skipHeaders: false + # stderrthreshold: 2 + # v: 0 + # vmodule: + ## NOTE: the following options are not dynamically run-time configurable + ## and require a nfd-worker restart to take effect after being changed + # logDir: + # logFile: + # logFileMaxSize: 1800 + # skipLogHeaders: false + #sources: + # cpu: + # cpuid: + ## NOTE: whitelist has priority over blacklist + # attributeBlacklist: + # - "AVX10" + # - "BMI1" + # - "BMI2" + # - "CLMUL" + # - "CMOV" + # - "CX16" + # - "ERMS" + # - "F16C" + # - "HTT" + # - "LZCNT" + # - "MMX" + # - "MMXEXT" + # - "NX" + # - "POPCNT" + # - "RDRAND" + # - "RDSEED" + # - "RDTSCP" + # - "SGX" + # - "SSE" + # - "SSE2" + # - "SSE3" + # - "SSE4" + # - "SSE42" + # - "SSSE3" + # - "TDX_GUEST" + # attributeWhitelist: + # kernel: + # kconfigFile: 
"/path/to/kconfig" + # configOpts: + # - "NO_HZ" + # - "X86" + # - "DMI" + # pci: + # deviceClassWhitelist: + # - "0200" + # - "03" + # - "12" + # deviceLabelFields: + # - "class" + # - "vendor" + # - "device" + # - "subsystem_vendor" + # - "subsystem_device" + # usb: + # deviceClassWhitelist: + # - "0e" + # - "ef" + # - "fe" + # - "ff" + # deviceLabelFields: + # - "class" + # - "vendor" + # - "device" + # local: + # hooksEnabled: false + # custom: + # # The following feature demonstrates the capabilities of the matchFeatures + # - name: "my custom rule" + # labels: + # "vendor.io/my-ng-feature": "true" + # # matchFeatures implements a logical AND over all matcher terms in the + # # list (i.e. all of the terms, or per-feature matchers, must match) + # matchFeatures: + # - feature: cpu.cpuid + # matchExpressions: + # AVX512F: {op: Exists} + # - feature: cpu.cstate + # matchExpressions: + # enabled: {op: IsTrue} + # - feature: cpu.pstate + # matchExpressions: + # no_turbo: {op: IsFalse} + # scaling_governor: {op: In, value: ["performance"]} + # - feature: cpu.rdt + # matchExpressions: + # RDTL3CA: {op: Exists} + # - feature: cpu.sst + # matchExpressions: + # bf.enabled: {op: IsTrue} + # - feature: cpu.topology + # matchExpressions: + # hardware_multithreading: {op: IsFalse} + # + # - feature: kernel.config + # matchExpressions: + # X86: {op: Exists} + # LSM: {op: InRegexp, value: ["apparmor"]} + # - feature: kernel.loadedmodule + # matchExpressions: + # e1000e: {op: Exists} + # - feature: kernel.selinux + # matchExpressions: + # enabled: {op: IsFalse} + # - feature: kernel.version + # matchExpressions: + # major: {op: In, value: ["5"]} + # minor: {op: Gt, value: ["10"]} + # + # - feature: storage.block + # matchExpressions: + # rotational: {op: In, value: ["0"]} + # dax: {op: In, value: ["0"]} + # + # - feature: network.device + # matchExpressions: + # operstate: {op: In, value: ["up"]} + # speed: {op: Gt, value: ["100"]} + # + # - feature: memory.numa + # 
matchExpressions: + # node_count: {op: Gt, value: ["2"]} + # - feature: memory.nv + # matchExpressions: + # devtype: {op: In, value: ["nd_dax"]} + # mode: {op: In, value: ["memory"]} + # + # - feature: system.osrelease + # matchExpressions: + # ID: {op: In, value: ["fedora", "centos"]} + # - feature: system.name + # matchExpressions: + # nodename: {op: InRegexp, value: ["^worker-X"]} + # + # - feature: local.label + # matchExpressions: + # custom-feature-knob: {op: Gt, value: ["100"]} + # + # # The following feature demonstrates the capabilities of the matchAny + # - name: "my matchAny rule" + # labels: + # "vendor.io/my-ng-feature-2": "my-value" + # # matchAny implements a logical OR over all elements (sub-matchers) in + # # the list (i.e. at least one feature matcher must match) + # matchAny: + # - matchFeatures: + # - feature: kernel.loadedmodule + # matchExpressions: + # driver-module-X: {op: Exists} + # - feature: pci.device + # matchExpressions: + # vendor: {op: In, value: ["8086"]} + # class: {op: In, value: ["0200"]} + # - matchFeatures: + # - feature: kernel.loadedmodule + # matchExpressions: + # driver-module-Y: {op: Exists} + # - feature: usb.device + # matchExpressions: + # vendor: {op: In, value: ["8086"]} + # class: {op: In, value: ["02"]} + # + # - name: "avx wildcard rule" + # labels: + # "my-avx-feature": "true" + # matchFeatures: + # - feature: cpu.cpuid + # matchName: {op: InRegexp, value: ["^AVX512"]} + # + # # The following features demonstrate label templating capabilities + # - name: "my template rule" + # labelsTemplate: | + # {{ range .system.osrelease }}vendor.io/my-system-feature.{{ .Name }}={{ .Value }} + # {{ end }} + # matchFeatures: + # - feature: system.osrelease + # matchExpressions: + # ID: {op: InRegexp, value: ["^open.*"]} + # VERSION_ID.major: {op: In, value: ["13", "15"]} + # + # - name: "my template rule 2" + # labelsTemplate: | + # {{ range .pci.device }}vendor.io/my-pci-device.{{ .class }}-{{ .device }}=with-cpuid + # {{
end }} + # matchFeatures: + # - feature: pci.device + # matchExpressions: + # class: {op: InRegexp, value: ["^06"]} + # vendor: ["8086"] + # - feature: cpu.cpuid + # matchExpressions: + # AVX: {op: Exists} + # + # # The following examples demonstrate vars field and back-referencing + # # previous labels and vars + # - name: "my dummy kernel rule" + # labels: + # "vendor.io/my.kernel.feature": "true" + # matchFeatures: + # - feature: kernel.version + # matchExpressions: + # major: {op: Gt, value: ["2"]} + # + # - name: "my dummy rule with no labels" + # vars: + # "my.dummy.var": "1" + # matchFeatures: + # - feature: cpu.cpuid + # matchExpressions: {} + # + # - name: "my rule using backrefs" + # labels: + # "vendor.io/my.backref.feature": "true" + # matchFeatures: + # - feature: rule.matched + # matchExpressions: + # vendor.io/my.kernel.feature: {op: IsTrue} + # my.dummy.var: {op: Gt, value: ["0"]} + # + # - name: "kconfig template rule" + # labelsTemplate: | + # {{ range .kernel.config }}kconfig-{{ .Name }}={{ .Value }} + # {{ end }} + # matchFeatures: + # - feature: kernel.config + # matchName: {op: In, value: ["SWAP", "X86", "ARM"]} +### + + metricsPort: 8081 + healthPort: 8082 + daemonsetAnnotations: {} + podSecurityContext: {} + # fsGroup: 2000 + + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: [ "ALL" ] + readOnlyRootFilesystem: true + runAsNonRoot: true + # runAsUser: 1000 + + livenessProbe: + grpc: + port: 8082 + initialDelaySeconds: 10 + # failureThreshold: 3 + # periodSeconds: 10 + readinessProbe: + grpc: + port: 8082 + initialDelaySeconds: 5 + failureThreshold: 10 + # periodSeconds: 10 + + serviceAccount: + # Specifies whether a service account should be created. + # We create this by default to make it easier for downstream users to apply PodSecurityPolicies. + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: + + # specify how many old ControllerRevisions for the DaemonSet to retain. + revisionHistoryLimit: + + rbac: + create: true + + # Allow users to mount the hostPath /usr/src, useful for RHCOS on s390x + # Does not work on systems without /usr/src AND a read-only /usr, such as Talos + mountUsrSrc: false + + resources: + limits: + memory: 512Mi + requests: + cpu: 5m + memory: 64Mi + + nodeSelector: {} + + tolerations: [] + + annotations: {} + + affinity: {} + + priorityClassName: "" + +topologyUpdater: + config: ### + ## key = node name, value = list of resources to be excluded. + ## use * to exclude from all nodes. + ## an example of what the exclude list should look like + #excludeList: + # node1: [cpu] + # node2: [memory, example/deviceA] + # *: [hugepages-2Mi] +### + + enable: false + createCRDs: false + extraEnvs: [] + hostNetwork: false + + serviceAccount: + create: true + annotations: {} + name: + + # specify how many old ControllerRevisions for the DaemonSet to retain.
+ revisionHistoryLimit: + + rbac: + create: true + + metricsPort: 8081 + healthPort: 8082 + kubeletConfigPath: + kubeletPodResourcesSockPath: + updateInterval: 60s + watchNamespace: "*" + kubeletStateDir: /var/lib/kubelet + + podSecurityContext: {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: [ "ALL" ] + readOnlyRootFilesystem: true + runAsUser: 0 + + livenessProbe: + grpc: + port: 8082 + initialDelaySeconds: 10 + # failureThreshold: 3 + # periodSeconds: 10 + readinessProbe: + grpc: + port: 8082 + initialDelaySeconds: 5 + failureThreshold: 10 + # periodSeconds: 10 + + resources: + limits: + memory: 60Mi + requests: + cpu: 50m + memory: 40Mi + + nodeSelector: {} + tolerations: [] + annotations: {} + daemonsetAnnotations: {} + affinity: {} + podSetFingerprint: true + +gc: + enable: true + extraEnvs: [] + hostNetwork: false + replicaCount: 1 + + serviceAccount: + create: true + annotations: {} + name: + rbac: + create: true + + interval: 1h + + podSecurityContext: {} + + resources: + limits: + memory: 1Gi + requests: + cpu: 10m + memory: 128Mi + + metricsPort: 8081 + + nodeSelector: {} + tolerations: [] + annotations: {} + deploymentAnnotations: {} + affinity: {} + + # specify how many old ReplicaSets for the Deployment to retain. 
+ revisionHistoryLimit: + +# Optionally use encryption for worker <--> master comms +# TODO: verify hostname is not yet supported +# +# If you do not enable certManager (and have it installed) you will +# need to manually, or otherwise, provision the TLS certs as secrets +tls: + enable: false + certManager: false + certManagerCertificate: + issuerKind: + issuerName: + +prometheus: + enable: false + scrapeInterval: 10s + labels: {} diff --git a/charts/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/charts/gpu-operator/crds/nvidia.com_clusterpolicies.yaml new file mode 100644 index 0000000..8ee8e9a --- /dev/null +++ b/charts/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -0,0 +1,2384 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + name: clusterpolicies.nvidia.com +spec: + group: nvidia.com + names: + kind: ClusterPolicy + listKind: ClusterPolicyList + plural: clusterpolicies + singular: clusterpolicy + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.state + name: Status + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: string + name: v1 + schema: + openAPIV3Schema: + description: ClusterPolicy is the Schema for the clusterpolicies API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ClusterPolicySpec defines the desired state of ClusterPolicy + properties: + ccManager: + description: CCManager component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + defaultMode: + description: Default CC mode setting for compatible GPUs on the + node + enum: + - "on" + - "off" + - devtools + type: string + enabled: + description: Enabled indicates if deployment of CC Manager is + enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: CC Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: CC Manager image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: CC Manager image tag + type: string + type: object + cdi: + description: CDI configures how the Container Device Interface is + used in the cluster + properties: + default: + default: false + description: Default indicates whether to use CDI as the default + mechanism for providing GPU access to containers. + type: boolean + enabled: + default: false + description: Enabled indicates whether CDI can be used to make + GPUs accessible to containers. + type: boolean + type: object + daemonsets: + description: Daemonset defines common configuration for all Daemonsets + properties: + annotations: + additionalProperties: + type: string + description: |- + Optional: Annotations is an unstructured key value map stored with a resource that may be + set by external tools to store and retrieve arbitrary metadata. They are not + queryable and should be preserved when modifying objects. + type: object + labels: + additionalProperties: + type: string + description: |- + Optional: Map of string keys and values that can be used to organize and categorize + (scope and select) objects. May match selectors of replication controllers + and services. 
+ type: object + priorityClassName: + type: string + rollingUpdate: + description: 'Optional: Configuration for rolling update of all + DaemonSet pods' + properties: + maxUnavailable: + type: string + type: object + tolerations: + description: 'Optional: Set tolerations' + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple <key,value,effect> using the matching operator <operator>. + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string.
+ type: string + type: object + type: array + updateStrategy: + default: RollingUpdate + enum: + - RollingUpdate + - OnDelete + type: string + type: object + dcgm: + description: DCGM component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if deployment of NVIDIA DCGM Hostengine + as a separate pod is enabled. + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + hostPort: + description: 'Deprecated: HostPort represents host port that needs + to be bound for DCGM engine (Default: 5555)' + format: int32 + type: integer + image: + description: NVIDIA DCGM image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA DCGM image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA DCGM image tag + type: string + type: object + dcgmExporter: + description: DCGMExporter spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + config: + description: 'Optional: Custom metrics configuration for NVIDIA + DCGM Exporter' + properties: + name: + description: ConfigMap name with file dcgm-metrics.csv for + metrics to be collected by NVIDIA DCGM Exporter + type: string + type: object + enabled: + description: Enabled indicates if deployment of NVIDIA DCGM Exporter + through operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. 
+ type: string + required: + - name + type: object + type: array + image: + description: NVIDIA DCGM Exporter image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA DCGM Exporter image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + serviceMonitor: + description: 'Optional: ServiceMonitor configuration for NVIDIA + DCGM Exporter' + properties: + additionalLabels: + additionalProperties: + type: string + description: AdditionalLabels to add to ServiceMonitor instance + for NVIDIA DCGM Exporter + type: object + enabled: + description: Enabled indicates if ServiceMonitor is deployed + for NVIDIA DCGM Exporter + type: boolean + honorLabels: + description: HonorLabels chooses the metric’s labels on collisions + with target labels. + type: boolean + interval: + description: |- + Interval which metrics should be scraped from NVIDIA DCGM Exporter. If not specified Prometheus’ global scrape interval is used. + Supported units: y, w, d, h, m, s, ms + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + relabelings: + description: Relabelings allows to rewrite labels on metric + sets for NVIDIA DCGM Exporter + items: + description: |- + RelabelConfig allows dynamic rewriting of the label set for targets, alerts, + scraped samples and remote write samples. + + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config + properties: + action: + default: replace + description: |- + Action to perform based on the regex matching. + + `Uppercase` and `Lowercase` actions require Prometheus >= v2.36.0. + `DropEqual` and `KeepEqual` actions require Prometheus >= v2.41.0. + + Default: "Replace" + enum: + - replace + - Replace + - keep + - Keep + - drop + - Drop + - hashmod + - HashMod + - labelmap + - LabelMap + - labeldrop + - LabelDrop + - labelkeep + - LabelKeep + - lowercase + - Lowercase + - uppercase + - Uppercase + - keepequal + - KeepEqual + - dropequal + - DropEqual + type: string + modulus: + description: |- + Modulus to take of the hash of the source label values. 
+ + Only applicable when the action is `HashMod`. + format: int64 + type: integer + regex: + description: Regular expression against which the extracted + value is matched. + type: string + replacement: + description: |- + Replacement value against which a Replace action is performed if the + regular expression matches. + + Regex capture groups are available. + type: string + separator: + description: Separator is the string between concatenated + SourceLabels. + type: string + sourceLabels: + description: |- + The source labels select values from existing labels. Their content is + concatenated using the configured Separator and matched against the + configured regular expression. + items: + description: |- + LabelName is a valid Prometheus label name which may only contain ASCII + letters, numbers, as well as underscores. + pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$ + type: string + type: array + targetLabel: + description: |- + Label to which the resulting string is written in a replacement. + + It is mandatory for `Replace`, `HashMod`, `Lowercase`, `Uppercase`, + `KeepEqual` and `DropEqual` actions. + + Regex capture groups are available. 
+ type: string + type: object + type: array + type: object + version: + description: NVIDIA DCGM Exporter image tag + type: string + type: object + devicePlugin: + description: DevicePlugin component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + config: + description: 'Optional: Configuration for the NVIDIA Device Plugin + via the ConfigMap' + properties: + default: + description: Default config name within the ConfigMap for + the NVIDIA Device Plugin config + type: string + name: + description: ConfigMap name for NVIDIA Device Plugin config + including shared config between plugin and GFD + type: string + type: object + enabled: + description: Enabled indicates if deployment of NVIDIA Device + Plugin through operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. 
+ type: string + required: + - name + type: object + type: array + image: + description: NVIDIA Device Plugin image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + mps: + description: 'Optional: MPS related configuration for the NVIDIA + Device Plugin' + properties: + root: + default: /run/nvidia/mps + description: Root defines the MPS root path on the host + type: string + type: object + repository: + description: NVIDIA Device Plugin image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA Device Plugin image tag + type: string + type: object + driver: + description: Driver component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + certConfig: + description: 'Optional: Custom certificates configuration for + NVIDIA Driver container' + properties: + name: + type: string + type: object + enabled: + description: Enabled indicates if deployment of NVIDIA Driver + through operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + kernelModuleConfig: + description: 'Optional: Kernel module configuration parameters + for the NVIDIA Driver' + properties: + name: + type: string + type: object + licensingConfig: + description: 'Optional: Licensing configuration for NVIDIA vGPU + licensing' + properties: + configMapName: + type: string + nlsEnabled: + description: NLSEnabled indicates if NVIDIA Licensing System + is used for licensing. + type: boolean + type: object + livenessProbe: + description: NVIDIA Driver container liveness probe settings + properties: + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. 
+ format: int32 + minimum: 1 + type: integer + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + minimum: 1 + type: integer + type: object + manager: + description: Manager represents configuration for NVIDIA Driver + Manager initContainer + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. 
+ type: string + required: + - name + type: object + type: array + image: + description: Image represents NVIDIA Driver Manager image + name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Repository represents Driver Manager repository + path + type: string + version: + description: Version represents NVIDIA Driver Manager image + tag(version) + type: string + type: object + rdma: + description: GPUDirectRDMASpec defines the properties for nvidia-peermem + deployment + properties: + enabled: + description: Enabled indicates if GPUDirect RDMA is enabled + through GPU operator + type: boolean + useHostMofed: + description: UseHostMOFED indicates to use MOFED drivers directly + installed on the host to enable GPUDirect RDMA + type: boolean + type: object + readinessProbe: + description: NVIDIA Driver container readiness probe settings + properties: + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1.
+ format: int32 + minimum: 1 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + minimum: 1 + type: integer + type: object + repoConfig: + description: 'Optional: Custom repo configuration for NVIDIA Driver + container' + properties: + configMapName: + type: string + type: object + repository: + description: NVIDIA Driver image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + startupProbe: + description: NVIDIA Driver container startup probe settings + properties: + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. 
+ format: int32 + minimum: 1 + type: integer + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + minimum: 1 + type: integer + type: object + upgradePolicy: + description: Driver auto-upgrade settings + properties: + autoUpgrade: + default: false + description: |- + AutoUpgrade is a global switch for automatic upgrade feature + if set to false all other options are ignored + type: boolean + drain: + description: DrainSpec describes configuration for node drain + during automatic upgrade + properties: + deleteEmptyDir: + default: false + description: |- + DeleteEmptyDir indicates if should continue even if there are pods using emptyDir + (local data that will be deleted when the node is drained) + type: boolean + enable: + default: false + description: Enable indicates if node draining is allowed + during upgrade + type: boolean + force: + default: false + description: Force indicates if force draining is allowed + type: boolean + podSelector: + description: |- + PodSelector specifies a label selector to filter pods on the node that need to be drained + For more details on label selectors, see: + 
https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + type: string + timeoutSeconds: + default: 300 + description: TimeoutSecond specifies the length of time + in seconds to wait before giving up drain, zero means + infinite + minimum: 0 + type: integer + type: object + maxParallelUpgrades: + default: 1 + description: |- + MaxParallelUpgrades indicates how many nodes can be upgraded in parallel + 0 means no limit, all nodes will be upgraded in parallel + minimum: 0 + type: integer + maxUnavailable: + anyOf: + - type: integer + - type: string + default: 25% + description: |- + MaxUnavailable is the maximum number of nodes with the driver installed, that can be unavailable during the upgrade. + Value can be an absolute number (ex: 5) or a percentage of total nodes at the start of upgrade (ex: 10%). + Absolute number is calculated from percentage by rounding up. + By default, a fixed value of 25% is used. + x-kubernetes-int-or-string: true + podDeletion: + description: PodDeletionSpec describes configuration for deletion + of pods using special resources during automatic upgrade + properties: + deleteEmptyDir: + default: false + description: |- + DeleteEmptyDir indicates if should continue even if there are pods using emptyDir + (local data that will be deleted when the pod is deleted) + type: boolean + force: + default: false + description: Force indicates if force deletion is allowed + type: boolean + timeoutSeconds: + default: 300 + description: |- + TimeoutSecond specifies the length of time in seconds to wait before giving up on pod termination, zero means + infinite + minimum: 0 + type: integer + type: object + waitForCompletion: + description: WaitForCompletionSpec describes the configuration + for waiting on job completions + properties: + podSelector: + description: |- + PodSelector specifies a label selector for the pods to wait for completion + For more details on label selectors, see: + 
https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + type: string + timeoutSeconds: + default: 0 + description: |- + TimeoutSecond specifies the length of time in seconds to wait before giving up on pod termination, zero means + infinite + minimum: 0 + type: integer + type: object + type: object + useNvidiaDriverCRD: + description: UseNvidiaDriverCRD indicates if the deployment of + NVIDIA Driver is managed by the NVIDIADriver CRD type + type: boolean + useOpenKernelModules: + description: UseOpenKernelModules indicates if the open GPU kernel + modules should be used + type: boolean + usePrecompiled: + description: UsePrecompiled indicates if deployment of NVIDIA + Driver using pre-compiled modules is enabled + type: boolean + version: + description: NVIDIA Driver image tag + type: string + virtualTopology: + description: 'Optional: Virtual Topology Daemon configuration + for NVIDIA vGPU drivers' + properties: + config: + description: 'Optional: Config name representing virtual topology + daemon configuration file nvidia-topologyd.conf' + type: string + type: object + type: object + gdrcopy: + description: GDRCopy component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if GDRCopy is enabled through GPU + Operator + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. 
+ type: string + required: + - name + type: object + type: array + image: + description: NVIDIA GDRCopy driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA GDRCopy driver image repository + type: string + version: + description: NVIDIA GDRCopy driver image tag + type: string + type: object + gds: + description: GPUDirectStorage defines the spec for GDS components(Experimental) + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if GPUDirect Storage is enabled + through GPU operator + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA GPUDirect Storage Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA GPUDirect Storage Driver image repository + type: string + version: + description: NVIDIA GPUDirect Storage Driver image tag + type: string + type: object + gfd: + description: GPUFeatureDiscovery spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if deployment of GPU Feature Discovery + Plugin is enabled. 
+ type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: GFD image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: GFD image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: GFD image tag + type: string + type: object + hostPaths: + description: HostPaths defines various paths on the host needed by + GPU Operator components + properties: + driverInstallDir: + description: |- + DriverInstallDir represents the root at which driver files including libraries, + config files, and executables can be found. + type: string + rootFS: + description: |- + RootFS represents the path to the root filesystem of the host. + This is used by components that need to interact with the host filesystem + and as such this must be a chroot-able filesystem. + Examples include the MIG Manager and Toolkit Container which may need to + stop, start, or restart systemd services. + type: string + type: object + kataManager: + description: KataManager component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + config: + description: Kata Manager config + properties: + artifactsDir: + default: /opt/nvidia-gpu-operator/artifacts/runtimeclasses + description: |- + ArtifactsDir is the directory where kata artifacts (e.g. kernel / guest images, configuration, etc.) + are placed on the local filesystem. + type: string + runtimeClasses: + description: RuntimeClasses is a list of kata runtime classes + to configure. + items: + description: RuntimeClass defines the configuration for + a kata RuntimeClass + properties: + artifacts: + description: Artifacts are the kata artifacts associated + with the runtime class. + properties: + pullSecret: + description: PullSecret is the secret used to pull + the OCI artifact. + type: string + url: + description: |- + URL is the path to the OCI artifact (payload) containing all artifacts + associated with a kata runtime class. 
+ type: string + required: + - url + type: object + name: + description: Name is the name of the kata runtime class. + type: string + nodeSelector: + additionalProperties: + type: string + description: |- + NodeSelector specifies the nodeSelector for the RuntimeClass object. + This ensures pods running with the RuntimeClass only get scheduled + onto nodes which support it. + type: object + required: + - artifacts + - name + type: object + type: array + type: object + enabled: + description: Enabled indicates if deployment of Kata Manager is + enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: Kata Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Kata Manager image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: Kata Manager image tag + type: string + type: object + mig: + description: MIG spec + properties: + strategy: + description: 'Optional: MIGStrategy to apply for GFD and NVIDIA + Device Plugin' + enum: + - none + - single + - mixed + type: string + type: object + migManager: + description: MIGManager for configuration to deploy MIG Manager + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + config: + description: 'Optional: Custom mig-parted configuration for NVIDIA + MIG Manager container' + properties: + default: + default: all-disabled + description: Default MIG config to be applied on the node, + when there is no config specified with the node label nvidia.com/mig.config + enum: + - all-disabled + - "" + type: string + name: + default: default-mig-parted-config + description: ConfigMap name + type: string + type: object + enabled: + description: Enabled indicates if deployment of NVIDIA MIG Manager + is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. 
+ type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + gpuClientsConfig: + description: 'Optional: Custom gpu-clients configuration for NVIDIA + MIG Manager container' + properties: + name: + description: ConfigMap name + type: string + type: object + image: + description: NVIDIA MIG Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA MIG Manager image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA MIG Manager image tag + type: string + type: object + nodeStatusExporter: + description: NodeStatusExporter spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if deployment of Node Status Exporter + is enabled. + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: Node Status Exporter image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Node Status Exporter image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed.
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: Node Status Exporter image tag + type: string + type: object + operator: + description: Operator component spec + properties: + annotations: + additionalProperties: + type: string + description: |- + Optional: Annotations is an unstructured key value map stored with a resource that may be + set by external tools to store and retrieve arbitrary metadata. They are not + queryable and should be preserved when modifying objects.
+ type: object + defaultRuntime: + default: docker + description: Runtime defines container runtime type + enum: + - docker + - crio + - containerd + type: string + initContainer: + description: InitContainerSpec describes configuration for initContainer + image used with all components + properties: + image: + description: Image represents image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Repository represents image repository path + type: string + version: + description: Version represents image tag(version) + type: string + type: object + labels: + additionalProperties: + type: string + description: |- + Optional: Map of string keys and values that can be used to organize and categorize + (scope and select) objects. May match selectors of replication controllers + and services. + type: object + runtimeClass: + default: nvidia + type: string + use_ocp_driver_toolkit: + description: UseOpenShiftDriverToolkit indicates if DriverToolkit + image should be used on OpenShift to build and install driver + modules + type: boolean + required: + - defaultRuntime + type: object + psa: + description: PSA defines spec for PodSecurityAdmission configuration + properties: + enabled: + description: Enabled indicates if PodSecurityAdmission configuration + needs to be enabled for all Pods + type: boolean + type: object + psp: + description: |- + Deprecated: Pod Security Policies are no longer supported. 
Please use PodSecurityAdmission instead + PSP defines spec for handling PodSecurityPolicies + properties: + enabled: + description: Enabled indicates if PodSecurityPolicies needs to + be enabled for all Pods + type: boolean + type: object + sandboxDevicePlugin: + description: SandboxDevicePlugin component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if deployment of NVIDIA Sandbox + Device Plugin through operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA Sandbox Device Plugin image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA Sandbox Device Plugin image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA Sandbox Device Plugin image tag + type: string + type: object + sandboxWorkloads: + description: SandboxWorkloads defines the spec for handling sandbox + workloads (i.e. Virtual Machines) + properties: + defaultWorkload: + default: container + description: |- + DefaultWorkload indicates the default GPU workload type to configure + worker nodes in the cluster for + enum: + - container + - vm-passthrough + - vm-vgpu + type: string + enabled: + description: |- + Enabled indicates if the GPU Operator should manage additional operands required + for sandbox workloads (i.e. VFIO Manager, vGPU Manager, and additional device plugins) + type: boolean + type: object + toolkit: + description: Toolkit component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if deployment of NVIDIA Container + Toolkit through operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. 
+ type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA Container Toolkit image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + installDir: + default: /usr/local/nvidia + description: Toolkit install directory on the host + type: string + repository: + description: NVIDIA Container Toolkit image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA Container Toolkit image tag + type: string + type: object + validator: + description: Validator defines the spec for operator-validator daemonset + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + cuda: + description: CUDA validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + type: object + driver: + description: Driver validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + type: object + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable.
+ type: string + required: + - name + type: object + type: array + image: + description: Validator image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + plugin: + description: Plugin validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + type: object + repository: + description: Validator image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + toolkit: + description: Toolkit validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + type: object + version: + description: Validator image tag + type: string + vfioPCI: + description: VfioPCI validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + type: object + vgpuDevices: + description: VGPUDevices validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + type: object + vgpuManager: + description: VGPUManager validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. 
+ type: string + required: + - name + type: object + type: array + type: object + type: object + vfioManager: + description: VFIOManager for configuration to deploy VFIO-PCI Manager + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + driverManager: + description: DriverManager represents configuration for NVIDIA + Driver Manager + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: Image represents NVIDIA Driver Manager image + name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Repository represents Driver Manager repository + path + type: string + version: + description: Version represents NVIDIA Driver Manager image + tag(version) + type: string + type: object + enabled: + description: Enabled indicates if deployment of VFIO Manager is + enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable.
+ type: string + required: + - name + type: object + type: array + image: + description: VFIO Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: VFIO Manager image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: VFIO Manager image tag + type: string + type: object + vgpuDeviceManager: + description: VGPUDeviceManager spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + config: + description: NVIDIA vGPU devices configuration for NVIDIA vGPU + Device Manager container + properties: + default: + default: default + description: Default config name within the ConfigMap + type: string + name: + description: ConfigMap name + type: string + type: object + enabled: + description: Enabled indicates if deployment of NVIDIA vGPU Device + Manager is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA vGPU Device Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA vGPU Device Manager image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA vGPU Device Manager image tag + type: string + type: object + vgpuManager: + description: VGPUManager component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + driverManager: + description: DriverManager represents configuration for NVIDIA + Driver Manager initContainer + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. 
+ type: string + required: + - name + type: object + type: array + image: + description: Image represents NVIDIA Driver Manager image + name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Repository represents Driver Manager repository + path + type: string + version: + description: Version represents NVIDIA Driver Manager image + tag(version) + type: string + type: object + enabled: + description: Enabled indicates if deployment of NVIDIA vGPU Manager + through operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA vGPU Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA vGPU Manager image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed.
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA vGPU Manager image tag + type: string + type: object + required: + - daemonsets + - dcgm + - dcgmExporter + - devicePlugin + - driver + - gfd + - nodeStatusExporter + - operator + - toolkit + type: object + status: + description: ClusterPolicyStatus defines the observed state of ClusterPolicy + properties: + conditions: + description: Conditions is a list of conditions representing the ClusterPolicy's + current state. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. 
+ For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + namespace: + description: Namespace indicates a namespace in which the operator + is installed + type: string + state: + description: State indicates status of ClusterPolicy + enum: + - ignored + - ready + - notReady + type: string + required: + - state + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/charts/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml b/charts/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml new file mode 100644 index 0000000..0721557 --- /dev/null +++ b/charts/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml @@ -0,0 +1,797 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + name: nvidiadrivers.nvidia.com +spec: + group: nvidia.com 
+ names: + kind: NVIDIADriver + listKind: NVIDIADriverList + plural: nvidiadrivers + shortNames: + - nvd + - nvdriver + - nvdrivers + singular: nvidiadriver + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.state + name: Status + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: NVIDIADriver is the Schema for the nvidiadrivers API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NVIDIADriverSpec defines the desired state of NVIDIADriver + properties: + annotations: + additionalProperties: + type: string + description: |- + Optional: Annotations is an unstructured key value map stored with a resource that may be + set by external tools to store and retrieve arbitrary metadata. They are not + queryable and should be preserved when modifying objects. 
+ type: object + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + certConfig: + description: 'Optional: Custom certificates configuration for NVIDIA + Driver container' + properties: + name: + type: string + type: object + driverType: + default: gpu + description: DriverType defines NVIDIA driver type + enum: + - gpu + - vgpu + - vgpu-host-manager + type: string + x-kubernetes-validations: + - message: driverType is an immutable field. Please create a new NvidiaDriver + resource instead when you want to change this setting. + rule: self == oldSelf + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present in + a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + gdrcopy: + description: GDRCopy defines the spec for GDRCopy driver + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if GDRCopy is enabled through GPU + operator + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. 
+ type: string + required: + - name + type: object + type: array + image: + description: GDRCopy driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: GDRCopy diver image repository + type: string + version: + description: GDRCopy driver image tag + type: string + type: object + gds: + description: GPUDirectStorage defines the spec for GDS driver + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if GPUDirect Storage is enabled + through GPU operator + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. 
+ type: string + required: + - name + type: object + type: array + image: + description: NVIDIA GPUDirect Storage Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA GPUDirect Storage Driver image repository + type: string + version: + description: NVIDIA GPUDirect Storage Driver image tag + type: string + type: object + image: + default: nvcr.io/nvidia/driver + description: NVIDIA Driver container image name + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + kernelModuleConfig: + description: 'Optional: Kernel module configuration parameters for + the NVIDIA Driver' + properties: + name: + type: string + type: object + labels: + additionalProperties: + type: string + description: |- + Optional: Map of string keys and values that can be used to organize and categorize + (scope and select) objects. May match selectors of replication controllers + and services. + type: object + licensingConfig: + description: 'Optional: Licensing configuration for NVIDIA vGPU licensing' + properties: + name: + type: string + nlsEnabled: + description: NLSEnabled indicates if NVIDIA Licensing System is + used for licensing. + type: boolean + type: object + livenessProbe: + description: NVIDIA Driver container liveness probe settings + properties: + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. 
+ More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + minimum: 1 + type: integer + type: object + manager: + description: Manager represents configuration for NVIDIA Driver Manager + initContainer + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. 
+ type: string + required: + - name + type: object + type: array + image: + description: Image represents NVIDIA Driver Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Repository represents Driver Managerrepository path + type: string + version: + description: Version represents NVIDIA Driver Manager image tag(version) + type: string + type: object + nodeAffinity: + description: Affinity specifies node affinity rules for driver pods + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: |- + The scheduler will prefer to schedule pods to nodes that satisfy + the affinity expressions specified by this field, but it may choose + a node that violates one or more of the expressions. The node that is + most preferred is the one with the greatest sum of weights, i.e. + for each node that meets all of the scheduling requirements (resource + request, requiredDuringScheduling affinity expressions, etc.), + compute a sum by iterating through the elements of this field and adding + "weight" to the sum if the node matches the corresponding matchExpressions; the + node(s) with the highest sum are the most preferred. + items: + description: |- + An empty preferred scheduling term matches all objects with implicit weight 0 + (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op). + properties: + preference: + description: A node selector term, associated with the corresponding + weight. + properties: + matchExpressions: + description: A list of node selector requirements by + node's labels. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. 
+ properties: + key: + description: The label key that the selector applies + to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + description: A list of node selector requirements by + node's fields. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that the selector applies + to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + weight: + description: Weight associated with matching the corresponding + nodeSelectorTerm, in the range 1-100. + format: int32 + type: integer + required: + - preference + - weight + type: object + type: array + x-kubernetes-list-type: atomic + requiredDuringSchedulingIgnoredDuringExecution: + description: |- + If the affinity requirements specified by this field are not met at + scheduling time, the pod will not be scheduled onto the node. + If the affinity requirements specified by this field cease to be met + at some point during pod execution (e.g. due to an update), the system + may or may not try to eventually evict the pod from its node. + properties: + nodeSelectorTerms: + description: Required. A list of node selector terms. The + terms are ORed. + items: + description: |- + A null or empty node selector term matches no objects. The requirements of + them are ANDed. + The TopologySelectorTerm type implements a subset of the NodeSelectorTerm. + properties: + matchExpressions: + description: A list of node selector requirements by + node's labels. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that the selector applies + to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. 
If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + description: A list of node selector requirements by + node's fields. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that the selector applies + to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + type: array + x-kubernetes-list-type: atomic + required: + - nodeSelectorTerms + type: object + x-kubernetes-map-type: atomic + type: object + nodeSelector: + additionalProperties: + type: string + description: NodeSelector specifies a selector for installation of + NVIDIA driver + type: object + priorityClassName: + description: 'Optional: Set priorityClassName' + type: string + rdma: + description: GPUDirectRDMA defines the spec for NVIDIA Peer Memory + driver + properties: + enabled: + description: Enabled indicates if GPUDirect RDMA is enabled through + GPU operator + type: boolean + useHostMofed: + description: UseHostMOFED indicates to use MOFED drivers directly + installed on the host to enable GPUDirect RDMA + type: boolean + type: object + readinessProbe: + description: NVIDIA Driver container readiness probe settings + properties: + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. 
+ format: int32 + minimum: 1 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + minimum: 1 + type: integer + type: object + repoConfig: + description: 'Optional: Custom repo configuration for NVIDIA Driver + container' + properties: + name: + type: string + type: object + repository: + description: NVIDIA Driver repository + type: string + resources: + description: 'Optional: Define resources requests and limits for each + pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + startupProbe: + description: NVIDIA Driver container startup probe settings + properties: + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. 
+ format: int32 + minimum: 1 + type: integer + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + minimum: 1 + type: integer + type: object + tolerations: + description: 'Optional: Set tolerations' + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. 
+ type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + useOpenKernelModules: + description: UseOpenKernelModules indicates if the open GPU kernel + modules should be used + type: boolean + usePrecompiled: + description: UsePrecompiled indicates if deployment of NVIDIA Driver + using pre-compiled modules is enabled + type: boolean + x-kubernetes-validations: + - message: usePrecompiled is an immutable field. Please create a new + NvidiaDriver resource instead when you want to change this setting. + rule: self == oldSelf + version: + description: NVIDIA Driver version (or just branch for precompiled + drivers) + type: string + virtualTopologyConfig: + description: 'Optional: Virtual Topology Daemon configuration for + NVIDIA vGPU drivers' + properties: + name: + description: 'Optional: Config name representing virtual topology + daemon configuration file nvidia-topologyd.conf' + type: string + type: object + required: + - driverType + - image + type: object + status: + description: NVIDIADriverStatus defines the observed state of NVIDIADriver + properties: + conditions: + description: Conditions is a list of conditions representing the NVIDIADriver's + current state. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. 
+ properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + namespace: + description: Namespace indicates a namespace in which the operator + and driver are installed + type: string + state: + description: |- + INSERT ADDITIONAL STATUS FIELD - define observed state of cluster + Important: Run "make" to regenerate code after modifying this file + State indicates status of NVIDIADriver instance + enum: + - ignored + - ready + - notReady + type: string + required: + - state + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/charts/gpu-operator/templates/_helpers.tpl b/charts/gpu-operator/templates/_helpers.tpl new file mode 100644 index 0000000..305c9d1 --- /dev/null +++ b/charts/gpu-operator/templates/_helpers.tpl @@ -0,0 +1,80 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "gpu-operator.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "gpu-operator.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. 
+*/}} +{{- define "gpu-operator.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Common labels +*/}} + +{{- define "gpu-operator.labels" -}} +app.kubernetes.io/name: {{ include "gpu-operator.name" . }} +helm.sh/chart: {{ include "gpu-operator.chart" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- if .Values.operator.labels }} +{{ toYaml .Values.operator.labels }} +{{- end }} +{{- end -}} + +{{- define "gpu-operator.operand-labels" -}} +helm.sh/chart: {{ include "gpu-operator.chart" . }} +app.kubernetes.io/managed-by: {{ include "gpu-operator.name" . }} +{{- if .Values.daemonsets.labels }} +{{ toYaml .Values.daemonsets.labels }} +{{- end }} +{{- end -}} + +{{- define "gpu-operator.matchLabels" -}} +app.kubernetes.io/name: {{ include "gpu-operator.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end -}} + +{{/* +Full image name with tag +*/}} +{{- define "gpu-operator.fullimage" -}} +{{- .Values.operator.repository -}}/{{- .Values.operator.image -}}:{{- .Values.operator.version | default .Chart.AppVersion -}} +{{- end }} + +{{/* +Full image name with tag +*/}} +{{- define "driver-manager.fullimage" -}} +{{- .Values.driver.manager.repository -}}/{{- .Values.driver.manager.image -}}:{{- .Values.driver.manager.version -}} +{{- end }} diff --git a/charts/gpu-operator/templates/cleanup_crd.yaml b/charts/gpu-operator/templates/cleanup_crd.yaml new file mode 100644 index 0000000..fd0c1b7 --- /dev/null +++ b/charts/gpu-operator/templates/cleanup_crd.yaml @@ -0,0 +1,45 @@ +{{- if .Values.operator.cleanupCRD }} +apiVersion: batch/v1 +kind: Job +metadata: + name: gpu-operator-cleanup-crd + namespace: {{ .Release.Namespace }} + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-weight": "1" + "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation + labels: + {{- include "gpu-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: "gpu-operator" +spec: + template: + metadata: + name: gpu-operator-cleanup-crd + labels: + {{- include "gpu-operator.labels" . | nindent 8 }} + app.kubernetes.io/component: "gpu-operator" + spec: + serviceAccountName: gpu-operator + {{- if .Values.operator.imagePullSecrets }} + imagePullSecrets: + {{- range .Values.operator.imagePullSecrets }} + - name: {{ . }} + {{- end }} + {{- end }} + {{- with .Values.operator.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: cleanup-crd + image: {{ include "gpu-operator.fullimage" . 
}} + imagePullPolicy: {{ .Values.operator.imagePullPolicy }} + command: + - /bin/sh + - -c + - > + kubectl delete clusterpolicy cluster-policy; + kubectl delete crd clusterpolicies.nvidia.com; + + restartPolicy: OnFailure +{{- end }} diff --git a/charts/gpu-operator/templates/clusterpolicy.yaml b/charts/gpu-operator/templates/clusterpolicy.yaml new file mode 100644 index 0000000..af9e87c --- /dev/null +++ b/charts/gpu-operator/templates/clusterpolicy.yaml @@ -0,0 +1,683 @@ +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: cluster-policy + labels: + {{- include "gpu-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: "gpu-operator" + {{- if .Values.operator.cleanupCRD }} + # CR cleanup is handled during pre-delete hook + # Add below annotation so that helm doesn't attempt to cleanup CR twice + annotations: + "helm.sh/resource-policy": keep + {{- end }} +spec: + hostPaths: + rootFS: {{ .Values.hostPaths.rootFS }} + driverInstallDir: {{ .Values.hostPaths.driverInstallDir }} + operator: + {{- if .Values.operator.defaultRuntime }} + defaultRuntime: {{ .Values.operator.defaultRuntime }} + {{- end }} + {{- if .Values.operator.runtimeClass }} + runtimeClass: {{ .Values.operator.runtimeClass }} + {{- end }} + {{- if .Values.operator.defaultGPUMode }} + defaultGPUMode: {{ .Values.operator.defaultGPUMode }} + {{- end }} + {{- if .Values.operator.initContainer }} + initContainer: + {{- if .Values.operator.initContainer.repository }} + repository: {{ .Values.operator.initContainer.repository }} + {{- end }} + {{- if .Values.operator.initContainer.image }} + image: {{ .Values.operator.initContainer.image }} + {{- end }} + {{- if .Values.operator.initContainer.version }} + version: {{ .Values.operator.initContainer.version | quote }} + {{- end }} + {{- if .Values.operator.initContainer.imagePullPolicy }} + imagePullPolicy: {{ .Values.operator.initContainer.imagePullPolicy }} + {{- end }} + {{- if .Values.operator.initContainer.imagePullSecrets }} + 
imagePullSecrets: {{ toYaml .Values.operator.initContainer.imagePullSecrets | nindent 8 }} + {{- end }} + {{- end }} + {{- if .Values.operator.use_ocp_driver_toolkit }} + use_ocp_driver_toolkit: {{ .Values.operator.use_ocp_driver_toolkit }} + {{- end }} + daemonsets: + labels: + {{- include "gpu-operator.operand-labels" . | nindent 6 }} + {{- if .Values.daemonsets.annotations }} + annotations: {{ toYaml .Values.daemonsets.annotations | nindent 6 }} + {{- end }} + {{- if .Values.daemonsets.tolerations }} + tolerations: {{ toYaml .Values.daemonsets.tolerations | nindent 6 }} + {{- end }} + {{- if .Values.daemonsets.priorityClassName }} + priorityClassName: {{ .Values.daemonsets.priorityClassName }} + {{- end }} + {{- if .Values.daemonsets.updateStrategy }} + updateStrategy: {{ .Values.daemonsets.updateStrategy }} + {{- end }} + {{- if .Values.daemonsets.rollingUpdate }} + rollingUpdate: + maxUnavailable: {{ .Values.daemonsets.rollingUpdate.maxUnavailable | quote }} + {{- end }} + validator: + {{- if .Values.validator.repository }} + repository: {{ .Values.validator.repository }} + {{- end }} + {{- if .Values.validator.image }} + image: {{ .Values.validator.image }} + {{- end }} + version: {{ .Values.validator.version | default .Chart.AppVersion | quote }} + {{- if .Values.validator.imagePullPolicy }} + imagePullPolicy: {{ .Values.validator.imagePullPolicy }} + {{- end }} + {{- if .Values.validator.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.validator.imagePullSecrets | nindent 8 }} + {{- end }} + {{- if .Values.validator.resources }} + resources: {{ toYaml .Values.validator.resources | nindent 6 }} + {{- end }} + {{- if .Values.validator.env }} + env: {{ toYaml .Values.validator.env | nindent 6 }} + {{- end }} + {{- if .Values.validator.args }} + args: {{ toYaml .Values.validator.args | nindent 6 }} + {{- end }} + {{- if .Values.validator.plugin }} + plugin: + {{- if .Values.validator.plugin.env }} + env: {{ toYaml .Values.validator.plugin.env | nindent 
8 }} + {{- end }} + {{- end }} + {{- if .Values.validator.cuda }} + cuda: + {{- if .Values.validator.cuda.env }} + env: {{ toYaml .Values.validator.cuda.env | nindent 8 }} + {{- end }} + {{- end }} + {{- if .Values.validator.driver }} + driver: + {{- if .Values.validator.driver.env }} + env: {{ toYaml .Values.validator.driver.env | nindent 8 }} + {{- end }} + {{- end }} + {{- if .Values.validator.toolkit }} + toolkit: + {{- if .Values.validator.toolkit.env }} + env: {{ toYaml .Values.validator.toolkit.env | nindent 8 }} + {{- end }} + {{- end }} + {{- if .Values.validator.vfioPCI }} + vfioPCI: + {{- if .Values.validator.vfioPCI.env }} + env: {{ toYaml .Values.validator.vfioPCI.env | nindent 8 }} + {{- end }} + {{- end }} + {{- if .Values.validator.vgpuManager }} + vgpuManager: + {{- if .Values.validator.vgpuManager.env }} + env: {{ toYaml .Values.validator.vgpuManager.env | nindent 8 }} + {{- end }} + {{- end }} + {{- if .Values.validator.vgpuDevices }} + vgpuDevices: + {{- if .Values.validator.vgpuDevices.env }} + env: {{ toYaml .Values.validator.vgpuDevices.env | nindent 8 }} + {{- end }} + {{- end }} + + mig: + {{- if .Values.mig.strategy }} + strategy: {{ .Values.mig.strategy }} + {{- end }} + psa: + enabled: {{ .Values.psa.enabled }} + cdi: + enabled: {{ .Values.cdi.enabled }} + default: {{ .Values.cdi.default }} + driver: + enabled: {{ .Values.driver.enabled }} + useNvidiaDriverCRD: {{ .Values.driver.nvidiaDriverCRD.enabled }} + useOpenKernelModules: {{ .Values.driver.useOpenKernelModules }} + usePrecompiled: {{ .Values.driver.usePrecompiled }} + {{- if .Values.driver.repository }} + repository: {{ .Values.driver.repository }} + {{- end }} + {{- if .Values.driver.image }} + image: {{ .Values.driver.image }} + {{- end }} + {{- if .Values.driver.version }} + version: {{ .Values.driver.version | quote }} + {{- end }} + {{- if .Values.driver.imagePullPolicy }} + imagePullPolicy: {{ .Values.driver.imagePullPolicy }} + {{- end }} + {{- if 
.Values.driver.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.driver.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.driver.startupProbe }} + startupProbe: {{ toYaml .Values.driver.startupProbe | nindent 6 }} + {{- end }} + {{- if .Values.driver.livenessProbe }} + livenessProbe: {{ toYaml .Values.driver.livenessProbe | nindent 6 }} + {{- end }} + {{- if .Values.driver.readinessProbe }} + readinessProbe: {{ toYaml .Values.driver.readinessProbe | nindent 6 }} + {{- end }} + rdma: + enabled: {{ .Values.driver.rdma.enabled }} + useHostMofed: {{ .Values.driver.rdma.useHostMofed }} + manager: + {{- if .Values.driver.manager.repository }} + repository: {{ .Values.driver.manager.repository }} + {{- end }} + {{- if .Values.driver.manager.image }} + image: {{ .Values.driver.manager.image }} + {{- end }} + {{- if .Values.driver.manager.version }} + version: {{ .Values.driver.manager.version | quote }} + {{- end }} + {{- if .Values.driver.manager.imagePullPolicy }} + imagePullPolicy: {{ .Values.driver.manager.imagePullPolicy }} + {{- end }} + {{- if .Values.driver.manager.env }} + env: {{ toYaml .Values.driver.manager.env | nindent 8 }} + {{- end }} + {{- if .Values.driver.repoConfig }} + repoConfig: {{ toYaml .Values.driver.repoConfig | nindent 6 }} + {{- end }} + {{- if .Values.driver.certConfig }} + certConfig: {{ toYaml .Values.driver.certConfig | nindent 6 }} + {{- end }} + {{- if .Values.driver.licensingConfig }} + licensingConfig: {{ toYaml .Values.driver.licensingConfig | nindent 6 }} + {{- end }} + {{- if .Values.driver.virtualTopology }} + virtualTopology: {{ toYaml .Values.driver.virtualTopology | nindent 6 }} + {{- end }} + {{- if .Values.driver.kernelModuleConfig }} + kernelModuleConfig: {{ toYaml .Values.driver.kernelModuleConfig | nindent 6 }} + {{- end }} + {{- if .Values.driver.resources }} + resources: {{ toYaml .Values.driver.resources | nindent 6 }} + {{- end }} + {{- if .Values.driver.env }} + env: {{ toYaml .Values.driver.env | 
nindent 6 }} + {{- end }} + {{- if .Values.driver.args }} + args: {{ toYaml .Values.driver.args | nindent 6 }} + {{- end }} + {{- if .Values.driver.upgradePolicy }} + upgradePolicy: + autoUpgrade: {{ .Values.driver.upgradePolicy.autoUpgrade | default false }} + maxParallelUpgrades: {{ .Values.driver.upgradePolicy.maxParallelUpgrades | default 0 }} + maxUnavailable : {{ .Values.driver.upgradePolicy.maxUnavailable | default "25%" }} + waitForCompletion: + timeoutSeconds: {{ .Values.driver.upgradePolicy.waitForCompletion.timeoutSeconds }} + {{- if .Values.driver.upgradePolicy.waitForCompletion.podSelector }} + podSelector: {{ .Values.driver.upgradePolicy.waitForCompletion.podSelector }} + {{- end }} + podDeletion: + force: {{ .Values.driver.upgradePolicy.gpuPodDeletion.force | default false }} + timeoutSeconds: {{ .Values.driver.upgradePolicy.gpuPodDeletion.timeoutSeconds }} + deleteEmptyDir: {{ .Values.driver.upgradePolicy.gpuPodDeletion.deleteEmptyDir | default false }} + drain: + enable: {{ .Values.driver.upgradePolicy.drain.enable | default false }} + force: {{ .Values.driver.upgradePolicy.drain.force | default false }} + {{- if .Values.driver.upgradePolicy.drain.podSelector }} + podSelector: {{ .Values.driver.upgradePolicy.drain.podSelector }} + {{- end }} + timeoutSeconds: {{ .Values.driver.upgradePolicy.drain.timeoutSeconds }} + deleteEmptyDir: {{ .Values.driver.upgradePolicy.drain.deleteEmptyDir | default false}} + {{- end }} + vgpuManager: + enabled: {{ .Values.vgpuManager.enabled }} + {{- if .Values.vgpuManager.repository }} + repository: {{ .Values.vgpuManager.repository }} + {{- end }} + {{- if .Values.vgpuManager.image }} + image: {{ .Values.vgpuManager.image }} + {{- end }} + {{- if .Values.vgpuManager.version }} + version: {{ .Values.vgpuManager.version | quote }} + {{- end }} + {{- if .Values.vgpuManager.imagePullPolicy }} + imagePullPolicy: {{ .Values.vgpuManager.imagePullPolicy }} + {{- end }} + {{- if .Values.vgpuManager.imagePullSecrets }} + 
imagePullSecrets: {{ toYaml .Values.vgpuManager.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.vgpuManager.resources }} + resources: {{ toYaml .Values.vgpuManager.resources | nindent 6 }} + {{- end }} + {{- if .Values.vgpuManager.env }} + env: {{ toYaml .Values.vgpuManager.env | nindent 6 }} + {{- end }} + {{- if .Values.vgpuManager.args }} + args: {{ toYaml .Values.vgpuManager.args | nindent 6 }} + {{- end }} + driverManager: + {{- if .Values.vgpuManager.driverManager.repository }} + repository: {{ .Values.vgpuManager.driverManager.repository }} + {{- end }} + {{- if .Values.vgpuManager.driverManager.image }} + image: {{ .Values.vgpuManager.driverManager.image }} + {{- end }} + {{- if .Values.vgpuManager.driverManager.version }} + version: {{ .Values.vgpuManager.driverManager.version | quote }} + {{- end }} + {{- if .Values.vgpuManager.driverManager.imagePullPolicy }} + imagePullPolicy: {{ .Values.vgpuManager.driverManager.imagePullPolicy }} + {{- end }} + {{- if .Values.vgpuManager.driverManager.env }} + env: {{ toYaml .Values.vgpuManager.driverManager.env | nindent 8 }} + {{- end }} + kataManager: + enabled: {{ .Values.kataManager.enabled }} + config: {{ toYaml .Values.kataManager.config | nindent 6 }} + {{- if .Values.kataManager.repository }} + repository: {{ .Values.kataManager.repository }} + {{- end }} + {{- if .Values.kataManager.image }} + image: {{ .Values.kataManager.image }} + {{- end }} + {{- if .Values.kataManager.version }} + version: {{ .Values.kataManager.version | quote }} + {{- end }} + {{- if .Values.kataManager.imagePullPolicy }} + imagePullPolicy: {{ .Values.kataManager.imagePullPolicy }} + {{- end }} + {{- if .Values.kataManager.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.kataManager.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.kataManager.resources }} + resources: {{ toYaml .Values.kataManager.resources | nindent 6 }} + {{- end }} + {{- if .Values.kataManager.env }} + env: {{ toYaml 
.Values.kataManager.env | nindent 6 }} + {{- end }} + {{- if .Values.kataManager.args }} + args: {{ toYaml .Values.kataManager.args | nindent 6 }} + {{- end }} + vfioManager: + enabled: {{ .Values.vfioManager.enabled }} + {{- if .Values.vfioManager.repository }} + repository: {{ .Values.vfioManager.repository }} + {{- end }} + {{- if .Values.vfioManager.image }} + image: {{ .Values.vfioManager.image }} + {{- end }} + {{- if .Values.vfioManager.version }} + version: {{ .Values.vfioManager.version | quote }} + {{- end }} + {{- if .Values.vfioManager.imagePullPolicy }} + imagePullPolicy: {{ .Values.vfioManager.imagePullPolicy }} + {{- end }} + {{- if .Values.vfioManager.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.vfioManager.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.vfioManager.resources }} + resources: {{ toYaml .Values.vfioManager.resources | nindent 6 }} + {{- end }} + {{- if .Values.vfioManager.env }} + env: {{ toYaml .Values.vfioManager.env | nindent 6 }} + {{- end }} + {{- if .Values.vfioManager.args }} + args: {{ toYaml .Values.vfioManager.args | nindent 6 }} + {{- end }} + driverManager: + {{- if .Values.vfioManager.driverManager.repository }} + repository: {{ .Values.vfioManager.driverManager.repository }} + {{- end }} + {{- if .Values.vfioManager.driverManager.image }} + image: {{ .Values.vfioManager.driverManager.image }} + {{- end }} + {{- if .Values.vfioManager.driverManager.version }} + version: {{ .Values.vfioManager.driverManager.version | quote }} + {{- end }} + {{- if .Values.vfioManager.driverManager.imagePullPolicy }} + imagePullPolicy: {{ .Values.vfioManager.driverManager.imagePullPolicy }} + {{- end }} + {{- if .Values.vfioManager.driverManager.env }} + env: {{ toYaml .Values.vfioManager.driverManager.env | nindent 8 }} + {{- end }} + vgpuDeviceManager: + enabled: {{ .Values.vgpuDeviceManager.enabled }} + {{- if .Values.vgpuDeviceManager.repository }} + repository: {{ .Values.vgpuDeviceManager.repository }} + {{- 
end }} + {{- if .Values.vgpuDeviceManager.image }} + image: {{ .Values.vgpuDeviceManager.image }} + {{- end }} + {{- if .Values.vgpuDeviceManager.version }} + version: {{ .Values.vgpuDeviceManager.version | quote }} + {{- end }} + {{- if .Values.vgpuDeviceManager.imagePullPolicy }} + imagePullPolicy: {{ .Values.vgpuDeviceManager.imagePullPolicy }} + {{- end }} + {{- if .Values.vgpuDeviceManager.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.vgpuDeviceManager.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.vgpuDeviceManager.resources }} + resources: {{ toYaml .Values.vgpuDeviceManager.resources | nindent 6 }} + {{- end }} + {{- if .Values.vgpuDeviceManager.env }} + env: {{ toYaml .Values.vgpuDeviceManager.env | nindent 6 }} + {{- end }} + {{- if .Values.vgpuDeviceManager.args }} + args: {{ toYaml .Values.vgpuDeviceManager.args | nindent 6 }} + {{- end }} + {{- if .Values.vgpuDeviceManager.config }} + config: {{ toYaml .Values.vgpuDeviceManager.config | nindent 6 }} + {{- end }} + ccManager: + enabled: {{ .Values.ccManager.enabled }} + defaultMode: {{ .Values.ccManager.defaultMode | quote }} + {{- if .Values.ccManager.repository }} + repository: {{ .Values.ccManager.repository }} + {{- end }} + {{- if .Values.ccManager.image }} + image: {{ .Values.ccManager.image }} + {{- end }} + {{- if .Values.ccManager.version }} + version: {{ .Values.ccManager.version | quote }} + {{- end }} + {{- if .Values.ccManager.imagePullPolicy }} + imagePullPolicy: {{ .Values.ccManager.imagePullPolicy }} + {{- end }} + {{- if .Values.ccManager.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.ccManager.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.ccManager.resources }} + resources: {{ toYaml .Values.ccManager.resources | nindent 6 }} + {{- end }} + {{- if .Values.ccManager.env }}{{/* render the same ccManager.env list that the guard tests */}} + env: {{ toYaml .Values.ccManager.env | nindent 6 }} + {{- end }} + {{- if .Values.ccManager.args }} + args: {{ toYaml .Values.ccManager.args | nindent 6 }} +
{{- end }} + toolkit: + enabled: {{ .Values.toolkit.enabled }} + {{- if .Values.toolkit.repository }} + repository: {{ .Values.toolkit.repository }} + {{- end }} + {{- if .Values.toolkit.image }} + image: {{ .Values.toolkit.image }} + {{- end }} + {{- if .Values.toolkit.version }} + version: {{ .Values.toolkit.version | quote }} + {{- end }} + {{- if .Values.toolkit.imagePullPolicy }} + imagePullPolicy: {{ .Values.toolkit.imagePullPolicy }} + {{- end }} + {{- if .Values.toolkit.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.toolkit.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.toolkit.resources }} + resources: {{ toYaml .Values.toolkit.resources | nindent 6 }} + {{- end }} + {{- if .Values.toolkit.env }} + env: {{ toYaml .Values.toolkit.env | nindent 6 }} + {{- end }} + {{- if .Values.toolkit.installDir }} + installDir: {{ .Values.toolkit.installDir }} + {{- end }} + devicePlugin: + enabled: {{ .Values.devicePlugin.enabled }} + {{- if .Values.devicePlugin.repository }} + repository: {{ .Values.devicePlugin.repository }} + {{- end }} + {{- if .Values.devicePlugin.image }} + image: {{ .Values.devicePlugin.image }} + {{- end }} + {{- if .Values.devicePlugin.version }} + version: {{ .Values.devicePlugin.version | quote }} + {{- end }} + {{- if .Values.devicePlugin.imagePullPolicy }} + imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy }} + {{- end }} + {{- if .Values.devicePlugin.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.devicePlugin.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.devicePlugin.resources }} + resources: {{ toYaml .Values.devicePlugin.resources | nindent 6 }} + {{- end }} + {{- if .Values.devicePlugin.env }} + env: {{ toYaml .Values.devicePlugin.env | nindent 6 }} + {{- end }} + {{- if .Values.devicePlugin.args }} + args: {{ toYaml .Values.devicePlugin.args | nindent 6 }} + {{- end }} + {{- if .Values.devicePlugin.config.name }} + config: + name: {{ .Values.devicePlugin.config.name }} + 
default: {{ .Values.devicePlugin.config.default }} + {{- end }} + dcgm: + enabled: {{ .Values.dcgm.enabled }} + {{- if .Values.dcgm.repository }} + repository: {{ .Values.dcgm.repository }} + {{- end }} + {{- if .Values.dcgm.image }} + image: {{ .Values.dcgm.image }} + {{- end }} + {{- if .Values.dcgm.version }} + version: {{ .Values.dcgm.version | quote }} + {{- end }} + {{- if .Values.dcgm.imagePullPolicy }} + imagePullPolicy: {{ .Values.dcgm.imagePullPolicy }} + {{- end }} + {{- if .Values.dcgm.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.dcgm.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.dcgm.resources }} + resources: {{ toYaml .Values.dcgm.resources | nindent 6 }} + {{- end }} + {{- if .Values.dcgm.env }} + env: {{ toYaml .Values.dcgm.env | nindent 6 }} + {{- end }} + {{- if .Values.dcgm.args }} + args: {{ toYaml .Values.dcgm.args | nindent 6 }} + {{- end }} + dcgmExporter: + enabled: {{ .Values.dcgmExporter.enabled }} + {{- if .Values.dcgmExporter.repository }} + repository: {{ .Values.dcgmExporter.repository }} + {{- end }} + {{- if .Values.dcgmExporter.image }} + image: {{ .Values.dcgmExporter.image }} + {{- end }} + {{- if .Values.dcgmExporter.version }} + version: {{ .Values.dcgmExporter.version | quote }} + {{- end }} + {{- if .Values.dcgmExporter.imagePullPolicy }} + imagePullPolicy: {{ .Values.dcgmExporter.imagePullPolicy }} + {{- end }} + {{- if .Values.dcgmExporter.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.dcgmExporter.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.dcgmExporter.resources }} + resources: {{ toYaml .Values.dcgmExporter.resources | nindent 6 }} + {{- end }} + {{- if .Values.dcgmExporter.env }} + env: {{ toYaml .Values.dcgmExporter.env | nindent 6 }} + {{- end }} + {{- if .Values.dcgmExporter.args }} + args: {{ toYaml .Values.dcgmExporter.args | nindent 6 }} + {{- end }} + {{- if and (.Values.dcgmExporter.config) (.Values.dcgmExporter.config.name) }} + config: + name: {{ 
.Values.dcgmExporter.config.name }} + {{- end }} + {{- if .Values.dcgmExporter.serviceMonitor }} + serviceMonitor: {{ toYaml .Values.dcgmExporter.serviceMonitor | nindent 6 }} + {{- end }} + gfd: + enabled: {{ .Values.gfd.enabled }} + {{- if .Values.gfd.repository }} + repository: {{ .Values.gfd.repository }} + {{- end }} + {{- if .Values.gfd.image }} + image: {{ .Values.gfd.image }} + {{- end }} + {{- if .Values.gfd.version }} + version: {{ .Values.gfd.version | quote }} + {{- end }} + {{- if .Values.gfd.imagePullPolicy }} + imagePullPolicy: {{ .Values.gfd.imagePullPolicy }} + {{- end }} + {{- if .Values.gfd.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.gfd.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.gfd.resources }} + resources: {{ toYaml .Values.gfd.resources | nindent 6 }} + {{- end }} + {{- if .Values.gfd.env }} + env: {{ toYaml .Values.gfd.env | nindent 6 }} + {{- end }} + {{- if .Values.gfd.args }} + args: {{ toYaml .Values.gfd.args | nindent 6 }} + {{- end }} + migManager: + enabled: {{ .Values.migManager.enabled }} + {{- if .Values.migManager.repository }} + repository: {{ .Values.migManager.repository }} + {{- end }} + {{- if .Values.migManager.image }} + image: {{ .Values.migManager.image }} + {{- end }} + {{- if .Values.migManager.version }} + version: {{ .Values.migManager.version | quote }} + {{- end }} + {{- if .Values.migManager.imagePullPolicy }} + imagePullPolicy: {{ .Values.migManager.imagePullPolicy }} + {{- end }} + {{- if .Values.migManager.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.migManager.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.migManager.resources }} + resources: {{ toYaml .Values.migManager.resources | nindent 6 }} + {{- end }} + {{- if .Values.migManager.env }} + env: {{ toYaml .Values.migManager.env | nindent 6 }} + {{- end }} + {{- if .Values.migManager.args }} + args: {{ toYaml .Values.migManager.args | nindent 6 }} + {{- end }} + {{- if .Values.migManager.config }} 
+ config: + name: {{ .Values.migManager.config.name }} + default: {{ .Values.migManager.config.default }} + {{- end }} + {{- if .Values.migManager.gpuClientsConfig }} + gpuClientsConfig: {{ toYaml .Values.migManager.gpuClientsConfig | nindent 6 }} + {{- end }} + nodeStatusExporter: + enabled: {{ .Values.nodeStatusExporter.enabled }} + {{- if .Values.nodeStatusExporter.repository }} + repository: {{ .Values.nodeStatusExporter.repository }} + {{- end }} + {{- if .Values.nodeStatusExporter.image }} + image: {{ .Values.nodeStatusExporter.image }} + {{- end }} + version: {{ .Values.nodeStatusExporter.version | default .Chart.AppVersion | quote }} + {{- if .Values.nodeStatusExporter.imagePullPolicy }} + imagePullPolicy: {{ .Values.nodeStatusExporter.imagePullPolicy }} + {{- end }} + {{- if .Values.nodeStatusExporter.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.nodeStatusExporter.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.nodeStatusExporter.resources }} + resources: {{ toYaml .Values.nodeStatusExporter.resources | nindent 6 }} + {{- end }} + {{- if .Values.nodeStatusExporter.env }} + env: {{ toYaml .Values.nodeStatusExporter.env | nindent 6 }} + {{- end }} + {{- if .Values.nodeStatusExporter.args }} + args: {{ toYaml .Values.nodeStatusExporter.args | nindent 6 }} + {{- end }} + {{- if .Values.gds.enabled }} + gds: + enabled: {{ .Values.gds.enabled }} + {{- if .Values.gds.repository }} + repository: {{ .Values.gds.repository }} + {{- end }} + {{- if .Values.gds.image }} + image: {{ .Values.gds.image }} + {{- end }} + version: {{ .Values.gds.version | quote }} + {{- if .Values.gds.imagePullPolicy }} + imagePullPolicy: {{ .Values.gds.imagePullPolicy }} + {{- end }} + {{- if .Values.gds.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.gds.imagePullSecrets | nindent 8 }} + {{- end }} + {{- if .Values.gds.env }} + env: {{ toYaml .Values.gds.env | nindent 6 }} + {{- end }} + {{- if .Values.gds.args }} + args: {{ toYaml .Values.gds.args | 
nindent 6 }} + {{- end }} + {{- end }} + {{- if .Values.gdrcopy }} + gdrcopy: + enabled: {{ .Values.gdrcopy.enabled | default false }} + {{- if .Values.gdrcopy.repository }} + repository: {{ .Values.gdrcopy.repository }} + {{- end }} + {{- if .Values.gdrcopy.image }} + image: {{ .Values.gdrcopy.image }} + {{- end }} + version: {{ .Values.gdrcopy.version | quote }} + {{- if .Values.gdrcopy.imagePullPolicy }} + imagePullPolicy: {{ .Values.gdrcopy.imagePullPolicy }} + {{- end }} + {{- if .Values.gdrcopy.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.gdrcopy.imagePullSecrets | nindent 8 }} + {{- end }} + {{- if .Values.gdrcopy.env }} + env: {{ toYaml .Values.gdrcopy.env | nindent 6 }} + {{- end }} + {{- if .Values.gdrcopy.args }} + args: {{ toYaml .Values.gdrcopy.args | nindent 6 }} + {{- end }} + {{- end }} + sandboxWorkloads: + enabled: {{ .Values.sandboxWorkloads.enabled }} + {{- if .Values.sandboxWorkloads.defaultWorkload }} + defaultWorkload: {{ .Values.sandboxWorkloads.defaultWorkload }} + {{- end }} + sandboxDevicePlugin: + {{- if .Values.sandboxDevicePlugin.enabled }} + enabled: {{ .Values.sandboxDevicePlugin.enabled }} + {{- end }} + {{- if .Values.sandboxDevicePlugin.repository }} + repository: {{ .Values.sandboxDevicePlugin.repository }} + {{- end }} + {{- if .Values.sandboxDevicePlugin.image }} + image: {{ .Values.sandboxDevicePlugin.image }} + {{- end }} + {{- if .Values.sandboxDevicePlugin.version }} + version: {{ .Values.sandboxDevicePlugin.version | quote }} + {{- end }} + {{- if .Values.sandboxDevicePlugin.imagePullPolicy }} + imagePullPolicy: {{ .Values.sandboxDevicePlugin.imagePullPolicy }} + {{- end }} + {{- if .Values.sandboxDevicePlugin.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.sandboxDevicePlugin.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.sandboxDevicePlugin.resources }} + resources: {{ toYaml .Values.sandboxDevicePlugin.resources | nindent 6 }} + {{- end }} + {{- if .Values.sandboxDevicePlugin.env }} 
+ env: {{ toYaml .Values.sandboxDevicePlugin.env | nindent 6 }} + {{- end }} + {{- if .Values.sandboxDevicePlugin.args }} + args: {{ toYaml .Values.sandboxDevicePlugin.args | nindent 6 }} + {{- end }} diff --git a/charts/gpu-operator/templates/clusterrole.yaml b/charts/gpu-operator/templates/clusterrole.yaml new file mode 100644 index 0000000..4acbcf2 --- /dev/null +++ b/charts/gpu-operator/templates/clusterrole.yaml @@ -0,0 +1,146 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: gpu-operator + labels: + {{- include "gpu-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: "gpu-operator" +rules: +- apiGroups: + - config.openshift.io + resources: + - clusterversions + - proxies + verbs: + - get + - list + - watch +- apiGroups: + - image.openshift.io + resources: + - imagestreams + verbs: + - get + - list + - watch +- apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + - use +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - clusterrolebindings + verbs: + - create + - get + - list + - watch + - update + - patch + - delete +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + - update + - patch +- apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list + - create + - watch + - update + - patch +- apiGroups: + - "" + resources: + - events + - pods + - pods/eviction + verbs: + - create + - get + - list + - watch + - update + - patch + - delete +- apiGroups: + - apps + resources: + - daemonsets + verbs: + - get + - list + - watch +- apiGroups: + - nvidia.com + resources: + - clusterpolicies + - clusterpolicies/finalizers + - clusterpolicies/status + - nvidiadrivers + - nvidiadrivers/finalizers + - nvidiadrivers/status + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + - deletecollection +- apiGroups: + - 
scheduling.k8s.io + resources: + - priorityclasses + verbs: + - get + - list + - watch + - create +- apiGroups: + - node.k8s.io + resources: + - runtimeclasses + verbs: + - get + - list + - create + - update + - watch + - delete +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - update + - patch + - create +{{- if .Values.operator.cleanupCRD }} + - delete +{{- end }} diff --git a/charts/gpu-operator/templates/clusterrolebinding.yaml b/charts/gpu-operator/templates/clusterrolebinding.yaml new file mode 100644 index 0000000..08b87fb --- /dev/null +++ b/charts/gpu-operator/templates/clusterrolebinding.yaml @@ -0,0 +1,18 @@ +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: gpu-operator + labels: + {{- include "gpu-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: "gpu-operator" +subjects: +- kind: ServiceAccount + name: gpu-operator + namespace: {{ $.Release.Namespace }} +- kind: ServiceAccount + name: node-feature-discovery + namespace: {{ $.Release.Namespace }} +roleRef: + kind: ClusterRole + name: gpu-operator + apiGroup: rbac.authorization.k8s.io diff --git a/charts/gpu-operator/templates/dcgm_exporter_config.yaml b/charts/gpu-operator/templates/dcgm_exporter_config.yaml new file mode 100644 index 0000000..c4bf6dc --- /dev/null +++ b/charts/gpu-operator/templates/dcgm_exporter_config.yaml @@ -0,0 +1,14 @@ +{{- if .Values.dcgmExporter.config }} +{{- if and (.Values.dcgmExporter.config.create) (not (empty .Values.dcgmExporter.config.data)) }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Values.dcgmExporter.config.name }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gpu-operator.labels" . 
| nindent 4 }} +data: + dcgm-metrics.csv: | +{{- .Values.dcgmExporter.config.data | nindent 4 }} +{{- end }} +{{- end }} diff --git a/charts/gpu-operator/templates/mig_config.yaml b/charts/gpu-operator/templates/mig_config.yaml new file mode 100644 index 0000000..2ceb047 --- /dev/null +++ b/charts/gpu-operator/templates/mig_config.yaml @@ -0,0 +1,10 @@ +{{- if and (.Values.migManager.config.create) (not (empty .Values.migManager.config.data)) }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Values.migManager.config.name }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gpu-operator.labels" . | nindent 4 }} +data: {{ toYaml .Values.migManager.config.data | nindent 2 }} +{{- end }} diff --git a/charts/gpu-operator/templates/nodefeaturerules.yaml b/charts/gpu-operator/templates/nodefeaturerules.yaml new file mode 100644 index 0000000..6076b3d --- /dev/null +++ b/charts/gpu-operator/templates/nodefeaturerules.yaml @@ -0,0 +1,107 @@ +{{- if .Values.nfd.nodefeaturerules }} +apiVersion: nfd.k8s-sigs.io/v1alpha1 +kind: NodeFeatureRule +metadata: + name: nvidia-nfd-nodefeaturerules +spec: + rules: + - name: "TDX rule" + labels: + tdx.enabled: "true" + matchFeatures: + - feature: cpu.security + matchExpressions: + tdx.enabled: {op: IsTrue} + - name: "TDX total keys rule" + extendedResources: + tdx.total_keys: "@cpu.security.tdx.total_keys" + matchFeatures: + - feature: cpu.security + matchExpressions: + tdx.enabled: {op: IsTrue} + - name: "SEV-SNP rule" + labels: + sev.snp.enabled: "true" + matchFeatures: + - feature: cpu.security + matchExpressions: + sev.snp.enabled: + op: IsTrue + - name: "SEV-ES rule" + labels: + sev.es.enabled: "true" + matchFeatures: + - feature: cpu.security + matchExpressions: + sev.es.enabled: + op: IsTrue + - name: SEV system capacities + extendedResources: + sev_asids: '@cpu.security.sev.asids' + sev_es: '@cpu.security.sev.encrypted_state_ids' + matchFeatures: + - feature: cpu.security + matchExpressions: + sev.enabled: + 
op: Exists + - name: "NVIDIA H100" + labels: + "nvidia.com/gpu.H100": "true" + "nvidia.com/gpu.family": "hopper" + matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["10de"]} + device: {op: In, value: ["2339"]} + - name: "NVIDIA H100 PCIe" + labels: + "nvidia.com/gpu.H100.pcie": "true" + "nvidia.com/gpu.family": "hopper" + matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["10de"]} + device: {op: In, value: ["2331"]} + - name: "NVIDIA H100 80GB HBM3" + labels: + "nvidia.com/gpu.H100.HBM3": "true" + "nvidia.com/gpu.family": "hopper" + matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["10de"]} + device: {op: In, value: ["2330"]} + - name: "NVIDIA H800" + labels: + "nvidia.com/gpu.H800": "true" + "nvidia.com/gpu.family": "hopper" + matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["10de"]} + device: {op: In, value: ["2324"]} + - name: "NVIDIA H800 PCIE" + labels: + "nvidia.com/gpu.H800.pcie": "true" + "nvidia.com/gpu.family": "hopper" + matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["10de"]} + device: {op: In, value: ["2322"]} + - name: "NVIDIA CC Enabled" + labels: + "nvidia.com/cc.capable": "true" + matchAny: # TDX/SEV + Hopper GPU + - matchFeatures: + - feature: rule.matched + matchExpressions: + nvidia.com/gpu.family: {op: In, value: ["hopper"]} + sev.snp.enabled: {op: IsTrue} + - matchFeatures: + - feature: rule.matched + matchExpressions: + nvidia.com/gpu.family: {op: In, value: ["hopper"]} + tdx.enabled: {op: IsTrue} +{{- end }} + diff --git a/charts/gpu-operator/templates/nvidiadriver.yaml b/charts/gpu-operator/templates/nvidiadriver.yaml new file mode 100644 index 0000000..31660c0 --- /dev/null +++ b/charts/gpu-operator/templates/nvidiadriver.yaml @@ -0,0 +1,119 @@ +{{- if and .Values.driver.nvidiaDriverCRD.enabled .Values.driver.nvidiaDriverCRD.deployDefaultCR }} +apiVersion: 
nvidia.com/v1alpha1 +kind: NVIDIADriver +metadata: + name: default +spec: + repository: {{ .Values.driver.repository }} + image: {{ .Values.driver.image }} + version: {{ .Values.driver.version }} + useOpenKernelModules: {{ .Values.driver.useOpenKernelModules }} + usePrecompiled: {{ .Values.driver.usePrecompiled }} + driverType: {{ .Values.driver.nvidiaDriverCRD.driverType | default "gpu" }} + {{- if .Values.daemonsets.annotations }} + annotations: {{ toYaml .Values.daemonsets.annotations | nindent 6 }} + {{- end }} + {{- if .Values.daemonsets.labels }} + labels: {{ toYaml .Values.daemonsets.labels | nindent 6 }} + {{- end }} + {{- if .Values.driver.nvidiaDriverCRD.nodeSelector }} + nodeSelector: {{ toYaml .Values.driver.nvidiaDriverCRD.nodeSelector | nindent 6 }} + {{- end }} + {{- if .Values.driver.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.driver.imagePullSecrets | nindent 4 }} + {{- end }} + {{- if .Values.driver.manager }} + manager: {{ toYaml .Values.driver.manager | nindent 4 }} + {{- end }} + {{- if .Values.driver.startupProbe }} + startupProbe: {{ toYaml .Values.driver.startupProbe | nindent 4 }} + {{- end }} + {{- if .Values.driver.livenessProbe }} + livenessProbe: {{ toYaml .Values.driver.livenessProbe | nindent 4 }} + {{- end }} + {{- if .Values.driver.readinessProbe }} + readinessProbe: {{ toYaml .Values.driver.readinessProbe | nindent 4 }} + {{- end }} + rdma: + enabled: {{ .Values.driver.rdma.enabled }} + useHostMofed: {{ .Values.driver.rdma.useHostMofed }} + {{- if .Values.daemonsets.tolerations }} + tolerations: {{ toYaml .Values.daemonsets.tolerations | nindent 6 }} + {{- end }} + {{- if .Values.driver.repoConfig.configMapName }} + repoConfig: + name: {{ .Values.driver.repoConfig.configMapName }} + {{- end }} + {{- if .Values.driver.certConfig.name }} + certConfig: + name: {{ .Values.driver.certConfig.name }} + {{- end }} + {{- if .Values.driver.licensingConfig.configMapName }} + licensingConfig: + name: {{ 
.Values.driver.licensingConfig.configMapName }} + nlsEnabled: {{ .Values.driver.licensingConfig.nlsEnabled | default true }} + {{- end }} + {{- if .Values.driver.virtualTopology.config }} + virtualTopologyConfig: + name: {{ .Values.driver.virtualTopology.config }} + {{- end }} + {{- if .Values.driver.kernelModuleConfig.name }} + kernelModuleConfig: + name: {{ .Values.driver.kernelModuleConfig.name }} + {{- end }} + {{- if .Values.driver.resources }} + resources: {{ toYaml .Values.driver.resources | nindent 6 }} + {{- end }} + {{- if .Values.driver.env }} + env: {{ toYaml .Values.driver.env | nindent 6 }} + {{- end }} + {{- if .Values.driver.args }} + args: {{ toYaml .Values.driver.args | nindent 6 }} + {{- end }} + {{- if .Values.gds.enabled }} + gds: + enabled: {{ .Values.gds.enabled }} + {{- if .Values.gds.repository }} + repository: {{ .Values.gds.repository }} + {{- end }} + {{- if .Values.gds.image }} + image: {{ .Values.gds.image }} + {{- end }} + version: {{ .Values.gds.version | quote }} + {{- if .Values.gds.imagePullPolicy }} + imagePullPolicy: {{ .Values.gds.imagePullPolicy }} + {{- end }} + {{- if .Values.gds.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.gds.imagePullSecrets | nindent 8 }} + {{- end }} + {{- if .Values.gds.env }} + env: {{ toYaml .Values.gds.env | nindent 6 }} + {{- end }} + {{- if .Values.gds.args }} + args: {{ toYaml .Values.gds.args | nindent 6 }} + {{- end }} + {{- end }} + {{- if .Values.gdrcopy }} + gdrcopy: + enabled: {{ .Values.gdrcopy.enabled | default false }} + {{- if .Values.gdrcopy.repository }} + repository: {{ .Values.gdrcopy.repository }} + {{- end }} + {{- if .Values.gdrcopy.image }} + image: {{ .Values.gdrcopy.image }} + {{- end }} + version: {{ .Values.gdrcopy.version | quote }} + {{- if .Values.gdrcopy.imagePullPolicy }} + imagePullPolicy: {{ .Values.gdrcopy.imagePullPolicy }} + {{- end }} + {{- if .Values.gdrcopy.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.gdrcopy.imagePullSecrets | nindent 
8 }} + {{- end }} + {{- if .Values.gdrcopy.env }} + env: {{ toYaml .Values.gdrcopy.env | nindent 6 }} + {{- end }} + {{- if .Values.gdrcopy.args }} + args: {{ toYaml .Values.gdrcopy.args | nindent 6 }} + {{- end }} + {{- end }} +{{- end }} diff --git a/charts/gpu-operator/templates/operator.yaml b/charts/gpu-operator/templates/operator.yaml new file mode 100644 index 0000000..6f48482 --- /dev/null +++ b/charts/gpu-operator/templates/operator.yaml @@ -0,0 +1,99 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gpu-operator + labels: + {{- include "gpu-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: "gpu-operator" + nvidia.com/gpu-driver-upgrade-drain.skip: "true" +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: "gpu-operator" + app: "gpu-operator" + template: + metadata: + labels: + {{- include "gpu-operator.labels" . | nindent 8 }} + app.kubernetes.io/component: "gpu-operator" + app: "gpu-operator" + nvidia.com/gpu-driver-upgrade-drain.skip: "true" + annotations: + {{- toYaml .Values.operator.annotations | nindent 8 }} + spec: + serviceAccountName: gpu-operator + {{- if .Values.operator.imagePullSecrets }} + imagePullSecrets: + {{- range .Values.operator.imagePullSecrets }} + - name: {{ . }} + {{- end }} + {{- end }} + {{- if .Values.operator.priorityClassName }} + priorityClassName: {{ .Values.operator.priorityClassName }} + {{- end }} + containers: + - name: gpu-operator + image: {{ include "gpu-operator.fullimage" . 
}} + imagePullPolicy: {{ .Values.operator.imagePullPolicy }} + command: ["gpu-operator"] + args: + - --leader-elect + {{- if .Values.operator.logging.develMode }} + - --zap-devel + {{- else }} + {{- if .Values.operator.logging.timeEncoding }} + - --zap-time-encoding={{- .Values.operator.logging.timeEncoding }} + {{- end }} + {{- if .Values.operator.logging.level }} + - --zap-log-level={{- .Values.operator.logging.level }} + {{- end }} + {{- end }} + env: + - name: WATCH_NAMESPACE + value: "" + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: "DRIVER_MANAGER_IMAGE" + value: "{{ include "driver-manager.fullimage" . }}" + volumeMounts: + - name: host-os-release + mountPath: "/host-etc/os-release" + readOnly: true + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + {{- with .Values.operator.resources }} + resources: + {{- toYaml . | nindent 10 }} + {{- end }} + ports: + - name: metrics + containerPort: 8080 + volumes: + - name: host-os-release + hostPath: + path: "/etc/os-release" + {{- with .Values.operator.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.operator.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.operator.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} diff --git a/charts/gpu-operator/templates/plugin_config.yaml b/charts/gpu-operator/templates/plugin_config.yaml new file mode 100644 index 0000000..21c2d9a --- /dev/null +++ b/charts/gpu-operator/templates/plugin_config.yaml @@ -0,0 +1,11 @@ +{{- if and (.Values.devicePlugin.config.create) (not (empty .Values.devicePlugin.config.data)) }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Values.devicePlugin.config.name }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gpu-operator.labels" . | nindent 4 }} +data: {{ toYaml .Values.devicePlugin.config.data | nindent 2 }} +{{- end }} + \ No newline at end of file diff --git a/charts/gpu-operator/templates/readonlyfs_scc.openshift.yaml b/charts/gpu-operator/templates/readonlyfs_scc.openshift.yaml new file mode 100644 index 0000000..ff492d3 --- /dev/null +++ b/charts/gpu-operator/templates/readonlyfs_scc.openshift.yaml @@ -0,0 +1,49 @@ +{{- if .Values.platform.openshift }} +apiVersion: security.openshift.io/v1 +kind: SecurityContextConstraints +metadata: + labels: + {{- include "gpu-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: "gpu-operator" + annotations: + kubernetes.io/description: restricted denies access to all host features and requires + pods to be run with a UID, read-only root filesystem and SELinux context that are + allocated to the namespace. This SCC is more restrictive than the default + restrictive SCC and it is used by default for authenticated users and operators and operands. 
+ name: restricted-readonly +allowHostDirVolumePlugin: false +allowHostIPC: false +allowHostNetwork: false +allowHostPID: false +allowHostPorts: false +allowPrivilegeEscalation: true +allowPrivilegedContainer: false +allowedCapabilities: [] +defaultAddCapabilities: [] +fsGroup: + type: MustRunAs +groups: +- system:authenticated +priority: 0 +readOnlyRootFilesystem: true +requiredDropCapabilities: +- KILL +- MKNOD +- SETUID +- SETGID +runAsUser: + type: MustRunAsRange +seLinuxContext: + type: MustRunAs +supplementalGroups: + type: RunAsAny +users: +- system:serviceaccount:{{ $.Release.Namespace }}:gpu-operator +volumes: +- configMap +- downwardAPI +- emptyDir +- persistentVolumeClaim +- projected +- secret +{{- end }} diff --git a/charts/gpu-operator/templates/role.yaml b/charts/gpu-operator/templates/role.yaml new file mode 100644 index 0000000..9e5bced --- /dev/null +++ b/charts/gpu-operator/templates/role.yaml @@ -0,0 +1,84 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: gpu-operator + labels: + {{- include "gpu-operator.labels" . 
| nindent 4 }} + app.kubernetes.io/component: "gpu-operator" +rules: +- apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + - rolebindings + verbs: + - create + - get + - list + - watch + - update + - patch + - delete +- apiGroups: + - apps + resources: + - controllerrevisions + verbs: + - get + - list + - watch +- apiGroups: + - apps + resources: + - daemonsets + verbs: + - create + - get + - list + - watch + - update + - patch + - delete +- apiGroups: + - "" + resources: + - configmaps + - endpoints + - pods + - pods/eviction + - secrets + - services + - services/finalizers + - serviceaccounts + verbs: + - create + - get + - list + - watch + - update + - patch + - delete +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - monitoring.coreos.com + resources: + - servicemonitors + - prometheusrules + verbs: + - get + - list + - create + - watch + - update + - delete diff --git a/charts/gpu-operator/templates/rolebinding.yaml b/charts/gpu-operator/templates/rolebinding.yaml new file mode 100644 index 0000000..c915a46 --- /dev/null +++ b/charts/gpu-operator/templates/rolebinding.yaml @@ -0,0 +1,15 @@ +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: gpu-operator + labels: + {{- include "gpu-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: "gpu-operator" +subjects: +- kind: ServiceAccount + name: gpu-operator + namespace: {{ $.Release.Namespace }} +roleRef: + kind: Role + name: gpu-operator + apiGroup: rbac.authorization.k8s.io diff --git a/charts/gpu-operator/templates/serviceaccount.yaml b/charts/gpu-operator/templates/serviceaccount.yaml new file mode 100644 index 0000000..50555e5 --- /dev/null +++ b/charts/gpu-operator/templates/serviceaccount.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: gpu-operator + labels: + {{- include "gpu-operator.labels" . 
| nindent 4 }} + app.kubernetes.io/component: "gpu-operator" diff --git a/charts/gpu-operator/templates/upgrade_crd.yaml b/charts/gpu-operator/templates/upgrade_crd.yaml new file mode 100644 index 0000000..6552558 --- /dev/null +++ b/charts/gpu-operator/templates/upgrade_crd.yaml @@ -0,0 +1,95 @@ +{{- if .Values.operator.upgradeCRD }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: gpu-operator-upgrade-crd-hook-sa + annotations: + helm.sh/hook: pre-upgrade + helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation + helm.sh/hook-weight: "0" +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: gpu-operator-upgrade-crd-hook-role + annotations: + helm.sh/hook: pre-upgrade + helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation + helm.sh/hook-weight: "0" +rules: + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - create + - get + - list + - watch + - patch + - update +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: gpu-operator-upgrade-crd-hook-binding + annotations: + helm.sh/hook: pre-upgrade + helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation + helm.sh/hook-weight: "0" +subjects: + - kind: ServiceAccount + name: gpu-operator-upgrade-crd-hook-sa + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: gpu-operator-upgrade-crd-hook-role + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: gpu-operator-upgrade-crd + namespace: {{ .Release.Namespace }} + annotations: + "helm.sh/hook": pre-upgrade + "helm.sh/hook-weight": "1" + "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation + labels: + {{- include "gpu-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: "gpu-operator" +spec: + template: + metadata: + name: gpu-operator-upgrade-crd + labels: + {{- include "gpu-operator.labels" . 
| nindent 8 }} + app.kubernetes.io/component: "gpu-operator" + spec: + serviceAccountName: gpu-operator-upgrade-crd-hook-sa + {{- if .Values.operator.imagePullSecrets }} + imagePullSecrets: + {{- range .Values.operator.imagePullSecrets }} + - name: {{ . }} + {{- end }} + {{- end }} + {{- with .Values.operator.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: upgrade-crd + image: {{ include "gpu-operator.fullimage" . }} + imagePullPolicy: {{ .Values.operator.imagePullPolicy }} + command: + - /bin/sh + - -c + - > + kubectl apply -f /opt/gpu-operator/nvidia.com_clusterpolicies.yaml; + kubectl apply -f /opt/gpu-operator/nvidia.com_nvidiadrivers.yaml; + {{- if .Values.nfd.enabled }} + kubectl apply -f /opt/gpu-operator/nfd-api-crds.yaml; + {{- end }} + restartPolicy: OnFailure +{{- end }} diff --git a/charts/gpu-operator/values.yaml b/charts/gpu-operator/values.yaml new file mode 100644 index 0000000..c0af3a0 --- /dev/null +++ b/charts/gpu-operator/values.yaml @@ -0,0 +1,15 @@ +toolkit: + env: + - name: CONTAINERD_CONFIG + value: "/etc/containerd/config.toml.tmpl" + - name: CONTAINERD_SOCKET + value: "/run/k3s/containerd/containerd.sock" + - name: CONTAINERD_RUNTIME_CLASS + value: "nvidia" + - name: CONTAINERD_SET_AS_DEFAULT + value: "true" + +devicePlugin: + config: + name: time-slicing-config-all + default: any diff --git a/charts/gpu-operator/values.yaml.bk b/charts/gpu-operator/values.yaml.bk new file mode 100644 index 0000000..5e404f0 --- /dev/null +++ b/charts/gpu-operator/values.yaml.bk @@ -0,0 +1,602 @@ +# Default values for gpu-operator. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. 
+ +platform: + openshift: false + +nfd: + enabled: true + nodefeaturerules: false + +psa: + enabled: false + +cdi: + enabled: false + default: false + +sandboxWorkloads: + enabled: false + defaultWorkload: "container" + +hostPaths: + # rootFS represents the path to the root filesystem of the host. + # This is used by components that need to interact with the host filesystem + # and as such this must be a chroot-able filesystem. + # Examples include the MIG Manager and Toolkit Container which may need to + # stop, start, or restart systemd services + rootFS: "/" + + # driverInstallDir represents the root at which driver files including libraries, + # config files, and executables can be found. + driverInstallDir: "/run/nvidia/driver" + +daemonsets: + labels: {} + annotations: {} + priorityClassName: system-node-critical + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # configuration for controlling update strategy("OnDelete" or "RollingUpdate") of GPU Operands + # note that driver Daemonset is always set with OnDelete to avoid unintended disruptions + updateStrategy: "RollingUpdate" + # configuration for controlling rolling update of GPU Operands + rollingUpdate: + # maximum number of nodes to simultaneously apply pod updates on. + # can be specified either as number or percentage of nodes. Default 1. 
+ maxUnavailable: "1" + +validator: + repository: nvcr.io/nvidia/cloud-native + image: gpu-operator-validator + # If version is not specified, then default is to use chart.AppVersion + #version: "" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + args: [] + resources: {} + plugin: + env: + - name: WITH_WORKLOAD + value: "false" + +operator: + repository: nvcr.io/nvidia + image: gpu-operator + # If version is not specified, then default is to use chart.AppVersion + #version: "" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + priorityClassName: system-node-critical + defaultRuntime: docker + runtimeClass: nvidia + use_ocp_driver_toolkit: false + # cleanup CRD on chart un-install + cleanupCRD: false + # upgrade CRD on chart upgrade, requires --disable-openapi-validation flag + # to be passed during helm upgrade. + upgradeCRD: true + initContainer: + image: cuda + repository: nvcr.io/nvidia + version: 12.6.3-base-ubi9 + imagePullPolicy: IfNotPresent + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + annotations: + openshift.io/scc: restricted-readonly + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/master" + operator: In + values: [""] + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: In + values: [""] + logging: + # Zap time encoding (one of 'epoch', 'millis', 'nano', 'iso8601', 'rfc3339' or 'rfc3339nano') + timeEncoding: epoch + # Zap Level to configure the verbosity of logging. 
Can be one of 'debug', 'info', 'error', or any integer value > 0 which corresponds to custom debug levels of increasing verbosity + level: info + # Development Mode defaults(encoder=consoleEncoder,logLevel=Debug,stackTraceLevel=Warn) + # Production Mode defaults(encoder=jsonEncoder,logLevel=Info,stackTraceLevel=Error) + develMode: false + resources: + limits: + cpu: 500m + memory: 350Mi + requests: + cpu: 200m + memory: 100Mi + +mig: + strategy: single + +driver: + enabled: true + nvidiaDriverCRD: + enabled: false + deployDefaultCR: true + driverType: gpu + nodeSelector: {} + useOpenKernelModules: false + # use pre-compiled packages for NVIDIA driver installation. + # only supported for as a tech-preview feature on ubuntu22.04 kernels. + usePrecompiled: false + repository: nvcr.io/nvidia + image: driver + version: "550.127.08" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + startupProbe: + initialDelaySeconds: 60 + periodSeconds: 10 + # nvidia-smi can take longer than 30s in some cases + # ensure enough timeout is set + timeoutSeconds: 60 + failureThreshold: 120 + rdma: + enabled: false + useHostMofed: false + upgradePolicy: + # global switch for automatic upgrade feature + # if set to false all other options are ignored + autoUpgrade: true + # how many nodes can be upgraded in parallel + # 0 means no limit, all nodes will be upgraded in parallel + maxParallelUpgrades: 1 + # maximum number of nodes with the driver installed, that can be unavailable during + # the upgrade. Value can be an absolute number (ex: 5) or + # a percentage of total nodes at the start of upgrade (ex: + # 10%). Absolute number is calculated from percentage by rounding + # up. By default, a fixed value of 25% is used.' 
+ maxUnavailable: 25% + # options for waiting on pod(job) completions + waitForCompletion: + timeoutSeconds: 0 + podSelector: "" + # options for gpu pod deletion + gpuPodDeletion: + force: false + timeoutSeconds: 300 + deleteEmptyDir: false + # options for node drain (`kubectl drain`) before the driver reload + # this is required only if default GPU pod deletions done by the operator + # are not sufficient to re-install the driver + drain: + enable: false + force: false + podSelector: "" + # It's recommended to set a timeout to avoid infinite drain in case non-fatal error keeps happening on retries + timeoutSeconds: 300 + deleteEmptyDir: false + manager: + image: k8s-driver-manager + repository: nvcr.io/nvidia/cloud-native + # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4 + # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0 + version: v0.7.0 + imagePullPolicy: IfNotPresent + env: + - name: ENABLE_GPU_POD_EVICTION + value: "true" + - name: ENABLE_AUTO_DRAIN + value: "false" + - name: DRAIN_USE_FORCE + value: "false" + - name: DRAIN_POD_SELECTOR_LABEL + value: "" + - name: DRAIN_TIMEOUT_SECONDS + value: "0s" + - name: DRAIN_DELETE_EMPTYDIR_DATA + value: "false" + env: [] + resources: {} + # Private mirror repository configuration + repoConfig: + configMapName: "" + # custom ssl key/certificate configuration + certConfig: + name: "" + # vGPU licensing configuration + licensingConfig: + configMapName: "" + nlsEnabled: true + # vGPU topology daemon configuration + virtualTopology: + config: "" + # kernel module configuration for NVIDIA driver + kernelModuleConfig: + name: "" + +toolkit: + enabled: true + repository: nvcr.io/nvidia/k8s + image: container-toolkit + version: v1.17.3-ubuntu20.04 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + resources: {} + installDir: "/usr/local/nvidia" + +devicePlugin: + enabled: true + repository: nvcr.io/nvidia + image: 
k8s-device-plugin + version: v0.17.0 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + args: [] + env: + - name: PASS_DEVICE_SPECS + value: "true" + - name: FAIL_ON_INIT_ERROR + value: "true" + - name: DEVICE_LIST_STRATEGY + value: envvar + - name: DEVICE_ID_STRATEGY + value: uuid + - name: NVIDIA_VISIBLE_DEVICES + value: all + - name: NVIDIA_DRIVER_CAPABILITIES + value: all + resources: {} + # Plugin configuration + # Use "name" to either point to an existing ConfigMap or to create a new one with a list of configurations(i.e with create=true). + # Use "data" to build an integrated ConfigMap from a set of configurations as + # part of this helm chart. An example of setting "data" might be: + # config: + # name: device-plugin-config + # create: true + # data: + # default: |- + # version: v1 + # flags: + # migStrategy: none + # mig-single: |- + # version: v1 + # flags: + # migStrategy: single + # mig-mixed: |- + # version: v1 + # flags: + # migStrategy: mixed + config: + # Create a ConfigMap (default: false) + create: false + # ConfigMap name (either existing or to create a new one with create=true above) + name: "" + # Default config name within the ConfigMap + default: "" + # Data section for the ConfigMap to create (i.e only applies when create=true) + data: {} + # MPS related configuration for the plugin + mps: + # MPS root path on the host + root: "/run/nvidia/mps" + +# standalone dcgm hostengine +dcgm: + # disabled by default to use embedded nv-hostengine by exporter + enabled: false + repository: nvcr.io/nvidia/cloud-native + image: dcgm + version: 3.3.9-1-ubuntu22.04 + imagePullPolicy: IfNotPresent + args: [] + env: [] + resources: {} + +dcgmExporter: + enabled: true + repository: nvcr.io/nvidia/k8s + image: dcgm-exporter + version: 3.3.9-3.6.1-ubuntu22.04 + imagePullPolicy: IfNotPresent + env: + - name: DCGM_EXPORTER_LISTEN + value: ":9400" + - name: DCGM_EXPORTER_KUBERNETES + value: "true" + - name: DCGM_EXPORTER_COLLECTORS + value: 
"/etc/dcgm-exporter/dcp-metrics-included.csv" + resources: {} + serviceMonitor: + enabled: false + interval: 15s + honorLabels: false + additionalLabels: {} + relabelings: [] + # - source_labels: + # - __meta_kubernetes_pod_node_name + # regex: (.*) + # target_label: instance + # replacement: $1 + # action: replace + # DCGM Exporter configuration + # This block is used to configure DCGM Exporter to emit a customized list of metrics. + # Use "name" to either point to an existing ConfigMap or to create a new one with a + # list of configurations (i.e with create=true). + # When pointing to an existing ConfigMap, the ConfigMap must exist in the same namespace as the release. + # The metrics are expected to be listed under a key called `dcgm-metrics.csv`. + # Use "data" to build an integrated ConfigMap from a set of custom metrics as + # part of the chart. An example of some custom metrics are shown below. Note that + # the contents of "data" must be in CSV format and be valid DCGM Exporter metric configurations. + # config: + # name: custom-dcgm-exporter-metrics + # create: true + # data: |- + # Format + # If line starts with a '#' it is considered a comment + # DCGM FIELD, Prometheus metric type, help message + + # Clocks + # DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). + # DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). 
+gfd: + enabled: true + repository: nvcr.io/nvidia + image: k8s-device-plugin + version: v0.17.0 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + - name: GFD_SLEEP_INTERVAL + value: 60s + - name: GFD_FAIL_ON_INIT_ERROR + value: "true" + resources: {} + +migManager: + enabled: true + repository: nvcr.io/nvidia/cloud-native + image: k8s-mig-manager + version: v0.10.0-ubuntu20.04 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + - name: WITH_REBOOT + value: "false" + resources: {} + # MIG configuration + # Use "name" to either point to an existing ConfigMap or to create a new one with a list of configurations(i.e with create=true). + # Use "data" to build an integrated ConfigMap from a set of configurations as + # part of this helm chart. An example of setting "data" might be: + # config: + # name: custom-mig-parted-configs + # create: true + # data: |- + # config.yaml: |- + # version: v1 + # mig-configs: + # all-disabled: + # - devices: all + # mig-enabled: false + # custom-mig: + # - devices: [0] + # mig-enabled: false + # - devices: [1] + # mig-enabled: true + # mig-devices: + # "1g.10gb": 7 + # - devices: [2] + # mig-enabled: true + # mig-devices: + # "2g.20gb": 2 + # "3g.40gb": 1 + # - devices: [3] + # mig-enabled: true + # mig-devices: + # "3g.40gb": 1 + # "4g.40gb": 1 + config: + default: "all-disabled" + # Create a ConfigMap (default: false) + create: false + # ConfigMap name (either existing or to create a new one with create=true above) + name: "" + # Data section for the ConfigMap to create (i.e only applies when create=true) + data: {} + gpuClientsConfig: + name: "" + +nodeStatusExporter: + enabled: false + repository: nvcr.io/nvidia/cloud-native + image: gpu-operator-validator + # If version is not specified, then default is to use chart.AppVersion + #version: "" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + resources: {} + +gds: + enabled: false + repository: nvcr.io/nvidia/cloud-native + image: nvidia-fs + 
version: "2.20.5" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + args: [] + +gdrcopy: + enabled: false + repository: nvcr.io/nvidia/cloud-native + image: gdrdrv + version: "v2.4.1-2" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + args: [] + +vgpuManager: + enabled: false + repository: "" + image: vgpu-manager + version: "" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + resources: {} + driverManager: + image: k8s-driver-manager + repository: nvcr.io/nvidia/cloud-native + # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4 + # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0 + version: v0.7.0 + imagePullPolicy: IfNotPresent + env: + - name: ENABLE_GPU_POD_EVICTION + value: "false" + - name: ENABLE_AUTO_DRAIN + value: "false" + +vgpuDeviceManager: + enabled: true + repository: nvcr.io/nvidia/cloud-native + image: vgpu-device-manager + version: v0.2.8 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + config: + name: "" + default: "default" + +vfioManager: + enabled: true + repository: nvcr.io/nvidia + image: cuda + version: 12.6.3-base-ubi9 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + resources: {} + driverManager: + image: k8s-driver-manager + repository: nvcr.io/nvidia/cloud-native + # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4 + # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0 + version: v0.7.0 + imagePullPolicy: IfNotPresent + env: + - name: ENABLE_GPU_POD_EVICTION + value: "false" + - name: ENABLE_AUTO_DRAIN + value: "false" + +kataManager: + enabled: false + config: + artifactsDir: "/opt/nvidia-gpu-operator/artifacts/runtimeclasses" + runtimeClasses: + - name: kata-nvidia-gpu + nodeSelector: {} + artifacts: + url: 
nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03 + pullSecret: "" + - name: kata-nvidia-gpu-snp + nodeSelector: + "nvidia.com/cc.capable": "true" + artifacts: + url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp + pullSecret: "" + repository: nvcr.io/nvidia/cloud-native + image: k8s-kata-manager + version: v0.2.2 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + resources: {} + +sandboxDevicePlugin: + enabled: true + repository: nvcr.io/nvidia + image: kubevirt-gpu-device-plugin + version: v1.2.10 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + args: [] + env: [] + resources: {} + +ccManager: + enabled: false + defaultMode: "off" + repository: nvcr.io/nvidia/cloud-native + image: k8s-cc-manager + version: v0.1.1 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + - name: CC_CAPABLE_DEVICE_IDS + value: "0x2339,0x2331,0x2330,0x2324,0x2322,0x233d" + resources: {} + +node-feature-discovery: + enableNodeFeatureApi: true + priorityClassName: system-node-critical + gc: + enable: true + replicaCount: 1 + serviceAccount: + name: node-feature-discovery + create: false + worker: + serviceAccount: + name: node-feature-discovery + # disable creation to avoid duplicate serviceaccount creation by master spec below + create: false + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + config: + sources: + pci: + deviceClassWhitelist: + - "02" + - "0200" + - "0207" + - "0300" + - "0302" + deviceLabelFields: + - vendor + master: + serviceAccount: + name: node-feature-discovery + create: true + config: + extraLabelNs: ["nvidia.com"] + # noPublish: false + # resourceLabels: ["nvidia.com/feature-1","nvidia.com/feature-2"] + # enableTaints: false + # labelWhiteList: 
"nvidia.com/gpu" diff --git a/resources/gpu-slice/configmap.yaml b/resources/gpu-slice/configmap.yaml new file mode 100644 index 0000000..ede56ea --- /dev/null +++ b/resources/gpu-slice/configmap.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: time-slicing-config-all + namespace: gpu-operator +data: + any: |- + version: v1 + flags: + migStrategy: none + sharing: + timeSlicing: + resources: + - name: nvidia.com/gpu + replicas: 4