added gpu-operator

This commit is contained in:
nomadics9 2025-01-12 04:03:04 +03:00
parent cb672d1f0d
commit a2b2bd17c5
48 changed files with 8358 additions and 0 deletions

View file

@ -0,0 +1,22 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/

View file

@ -0,0 +1,6 @@
dependencies:
- name: node-feature-discovery
repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
version: 0.16.6
digest: sha256:e7b02cbdf9daff49892c0b74c50da2ed11e18eff2105a1b1abc9a8f2ebd8be47
generated: "2024-10-31T07:12:50.141904-07:00"

View file

@ -0,0 +1,23 @@
# Chart metadata for the NVIDIA gpu-operator chart.
apiVersion: v2
appVersion: v24.9.1
# node-feature-discovery is a conditional dependency: its `condition` entry
# ties it to the `nfd.enabled` value.
dependencies:
- condition: nfd.enabled
name: node-feature-discovery
repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
version: v0.16.6
description: NVIDIA GPU Operator creates/configures/manages GPUs atop Kubernetes
home: https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/overview.html
icon: https://assets.nvidiagrid.net/ngc/logos/GPUoperator.png
keywords:
- gpu
- cuda
- compute
- operator
- deep learning
- monitoring
- tesla
# Kubernetes version constraint that a cluster must satisfy to install this chart.
kubeVersion: '>= 1.16.0-0'
name: gpu-operator
sources:
- https://github.com/NVIDIA/gpu-operator
version: v24.9.1

View file

@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/

View file

@ -0,0 +1,14 @@
# Chart metadata for the node-feature-discovery chart (chart version 0.16.6,
# appVersion v0.16.6).
apiVersion: v2
appVersion: v0.16.6
description: 'Detects hardware features available on each node in a Kubernetes cluster,
and advertises those features using node labels. '
home: https://github.com/kubernetes-sigs/node-feature-discovery
keywords:
- feature-discovery
- feature-detection
- node-labels
name: node-feature-discovery
sources:
- https://github.com/kubernetes-sigs/node-feature-discovery
type: application
version: 0.16.6

View file

@ -0,0 +1,10 @@
# Node Feature Discovery
Node Feature Discovery (NFD) is a Kubernetes add-on for detecting hardware
features and system configuration. Detected features are advertised as node
labels. NFD provides flexible configuration and extension points for a wide
range of vendor and application specific node labeling needs.
See
[NFD documentation](https://kubernetes-sigs.github.io/node-feature-discovery/v0.16/deployment/helm.html)
for deployment instructions.

View file

@ -0,0 +1,710 @@
---
# CustomResourceDefinition for the namespaced NodeFeature resource
# (group nfd.k8s-sigs.io, served and stored as v1alpha1).
# NOTE(review): the controller-gen annotation suggests this manifest is
# generated; presumably it should be regenerated rather than hand-edited — confirm.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.14.0
name: nodefeatures.nfd.k8s-sigs.io
spec:
group: nfd.k8s-sigs.io
names:
kind: NodeFeature
listKind: NodeFeatureList
plural: nodefeatures
singular: nodefeature
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: |-
NodeFeature resource holds the features discovered for one node in the
cluster.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Specification of the NodeFeature, containing features discovered
for a node.
properties:
features:
description: Features is the full "raw" features data that has been
discovered.
properties:
attributes:
additionalProperties:
description: AttributeFeatureSet is a set of features having
string value.
properties:
elements:
additionalProperties:
type: string
description: Individual features of the feature set.
type: object
required:
- elements
type: object
description: Attributes contains all the attribute-type features
of the node.
type: object
flags:
additionalProperties:
description: FlagFeatureSet is a set of simple features only
containing names without values.
properties:
elements:
additionalProperties:
description: Nil is a dummy empty struct for protobuf
compatibility
type: object
description: Individual features of the feature set.
type: object
required:
- elements
type: object
description: Flags contains all the flag-type features of the
node.
type: object
instances:
additionalProperties:
description: InstanceFeatureSet is a set of features each of
which is an instance having multiple attributes.
properties:
elements:
description: Individual features of the feature set.
items:
description: InstanceFeature represents one instance of
a complex features, e.g. a device.
properties:
attributes:
additionalProperties:
type: string
description: Attributes of the instance feature.
type: object
required:
- attributes
type: object
type: array
required:
- elements
type: object
description: Instances contains all the instance-type features
of the node.
type: object
type: object
labels:
additionalProperties:
type: string
description: Labels is the set of node labels that are requested to
be created.
type: object
type: object
required:
- spec
type: object
served: true
storage: true
---
# CustomResourceDefinition for the namespaced NodeFeatureGroup resource
# (group nfd.k8s-sigs.io, short name "nfg"); its v1alpha1 version declares a
# status subresource.
# NOTE(review): the controller-gen annotation suggests this manifest is
# generated; presumably it should be regenerated rather than hand-edited — confirm.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.14.0
name: nodefeaturegroups.nfd.k8s-sigs.io
spec:
group: nfd.k8s-sigs.io
names:
kind: NodeFeatureGroup
listKind: NodeFeatureGroupList
plural: nodefeaturegroups
shortNames:
- nfg
singular: nodefeaturegroup
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: NodeFeatureGroup resource holds Node pools by featureGroup
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Spec defines the rules to be evaluated.
properties:
featureGroupRules:
description: List of rules to evaluate to determine nodes that belong
in this group.
items:
description: GroupRule defines a rule for nodegroup filtering.
properties:
matchAny:
description: MatchAny specifies a list of matchers one of which
must match.
items:
description: MatchAnyElem specifies one sub-matcher of MatchAny.
properties:
matchFeatures:
description: MatchFeatures specifies a set of matcher
terms all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature
set to match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
required:
- matchFeatures
type: object
type: array
matchFeatures:
description: MatchFeatures specifies a set of matcher terms
all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature set to
match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
name:
description: Name of the rule.
type: string
required:
- name
type: object
type: array
required:
- featureGroupRules
type: object
status:
description: |-
Status of the NodeFeatureGroup after the most recent evaluation of the
specification.
properties:
nodes:
description: Nodes is a list of FeatureGroupNode in the cluster that
match the featureGroupRules
items:
properties:
name:
description: Name of the node.
type: string
required:
- name
type: object
type: array
x-kubernetes-list-map-keys:
- name
x-kubernetes-list-type: map
type: object
required:
- spec
type: object
served: true
storage: true
subresources:
status: {}
---
# CustomResourceDefinition for the cluster-scoped NodeFeatureRule resource
# (group nfd.k8s-sigs.io, short name "nfr", served and stored as v1alpha1).
# NOTE(review): the controller-gen annotation suggests this manifest is
# generated; presumably it should be regenerated rather than hand-edited — confirm.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.14.0
name: nodefeaturerules.nfd.k8s-sigs.io
spec:
group: nfd.k8s-sigs.io
names:
kind: NodeFeatureRule
listKind: NodeFeatureRuleList
plural: nodefeaturerules
shortNames:
- nfr
singular: nodefeaturerule
scope: Cluster
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: |-
NodeFeatureRule resource specifies a configuration for feature-based
customization of node objects, such as node labeling.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Spec defines the rules to be evaluated.
properties:
rules:
description: Rules is a list of node customization rules.
items:
description: Rule defines a rule for node customization such as
labeling.
properties:
annotations:
additionalProperties:
type: string
description: Annotations to create if the rule matches.
type: object
extendedResources:
additionalProperties:
type: string
description: ExtendedResources to create if the rule matches.
type: object
labels:
additionalProperties:
type: string
description: Labels to create if the rule matches.
type: object
labelsTemplate:
description: |-
LabelsTemplate specifies a template to expand for dynamically generating
multiple labels. Data (after template expansion) must be keys with an
optional value (<key>[=<value>]) separated by newlines.
type: string
matchAny:
description: MatchAny specifies a list of matchers one of which
must match.
items:
description: MatchAnyElem specifies one sub-matcher of MatchAny.
properties:
matchFeatures:
description: MatchFeatures specifies a set of matcher
terms all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature
set to match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
required:
- matchFeatures
type: object
type: array
matchFeatures:
description: MatchFeatures specifies a set of matcher terms
all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature set to
match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
name:
description: Name of the rule.
type: string
taints:
description: Taints to create if the rule matches.
items:
description: |-
The node this Taint is attached to has the "effect" on
any pod that does not tolerate the Taint.
properties:
effect:
description: |-
Required. The effect of the taint on pods
that do not tolerate the taint.
Valid effects are NoSchedule, PreferNoSchedule and NoExecute.
type: string
key:
description: Required. The taint key to be applied to
a node.
type: string
timeAdded:
description: |-
TimeAdded represents the time at which the taint was added.
It is only written for NoExecute taints.
format: date-time
type: string
value:
description: The taint value corresponding to the taint
key.
type: string
required:
- effect
- key
type: object
type: array
vars:
additionalProperties:
type: string
description: |-
Vars is the variables to store if the rule matches. Variables do not
directly inflict any changes in the node object. However, they can be
referenced from other rules enabling more complex rule hierarchies,
without exposing intermediary output values as labels.
type: object
varsTemplate:
description: |-
VarsTemplate specifies a template to expand for dynamically generating
multiple variables. Data (after template expansion) must be keys with an
optional value (<key>[=<value>]) separated by newlines.
type: string
required:
- name
type: object
type: array
required:
- rules
type: object
required:
- spec
type: object
served: true
storage: true

View file

@ -0,0 +1,107 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Chart name: .Values.nameOverride when it is set, otherwise .Chart.Name.
The result is cut to 63 characters and a trailing "-" is removed.
*/}}
{{- define "node-feature-discovery.name" -}}
{{- $chartName := .Values.nameOverride | default .Chart.Name -}}
{{- $chartName | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Fully qualified app name, limited to 63 characters because some Kubernetes
name fields are capped at that length (DNS naming spec).
Resolution order:
  1. .Values.fullnameOverride, when set;
  2. the release name alone, when it already contains the chart name;
  3. "<release name>-<chart name>" otherwise.
The chart name here is .Values.nameOverride, falling back to .Chart.Name.
*/}}
{{- define "node-feature-discovery.fullname" -}}
{{- $chartName := default .Chart.Name .Values.nameOverride -}}
{{- $base := printf "%s-%s" .Release.Name $chartName -}}
{{- if .Values.fullnameOverride -}}
{{- $base = .Values.fullnameOverride -}}
{{- else if contains $chartName .Release.Name -}}
{{- $base = .Release.Name -}}
{{- end -}}
{{- $base | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Target namespace: .Values.namespaceOverride when set, otherwise the release
namespace. The override exists for multi-namespace deployments in combined charts.
*/}}
{{- define "node-feature-discovery.namespace" -}}
{{- default .Release.Namespace .Values.namespaceOverride -}}
{{- end -}}
{{/*
Value for the chart label: "<chart name>-<chart version>" with every "+"
replaced by "_", cut to 63 characters, trailing "-" removed.
*/}}
{{- define "node-feature-discovery.chart" -}}
{{- $nameAndVersion := printf "%s-%s" .Chart.Name .Chart.Version -}}
{{- $nameAndVersion | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Common labels: the chart label, the selector labels, the app version (only
when .Chart.AppVersion is set) and the managing service.
*/}}
{{- define "node-feature-discovery.labels" -}}
{{- $chartLabel := include "node-feature-discovery.chart" . -}}
helm.sh/chart: {{ $chartLabel }}
{{ include "node-feature-discovery.selectorLabels" . }}
{{- with .Chart.AppVersion }}
app.kubernetes.io/version: {{ quote . }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end -}}
{{/*
Selector labels: the chart name and the release instance.
Both values are quoted so that a release name or nameOverride that looks like
a number or boolean (e.g. "123", "true") is still rendered as a YAML string;
label values must be strings.
*/}}
{{- define "node-feature-discovery.selectorLabels" -}}
app.kubernetes.io/name: {{ include "node-feature-discovery.name" . | quote }}
app.kubernetes.io/instance: {{ .Release.Name | quote }}
{{- end -}}
{{/*
Service account name used by nfd-master.
.Values.master.serviceAccount.name wins when set; otherwise the name is the
chart fullname if the account is created by this chart, else "default".
*/}}
{{- define "node-feature-discovery.master.serviceAccountName" -}}
{{- $sa := .Values.master.serviceAccount -}}
{{- $fallback := "default" -}}
{{- if $sa.create -}}
{{- $fallback = include "node-feature-discovery.fullname" . -}}
{{- end -}}
{{- $sa.name | default $fallback -}}
{{- end -}}
{{/*
Service account name used by nfd-worker.
.Values.worker.serviceAccount.name wins when set; otherwise the name is
"<fullname>-worker" if the account is created by this chart, else "default".
*/}}
{{- define "node-feature-discovery.worker.serviceAccountName" -}}
{{- $sa := .Values.worker.serviceAccount -}}
{{- $fallback := "default" -}}
{{- if $sa.create -}}
{{- $fallback = printf "%s-worker" (include "node-feature-discovery.fullname" .) -}}
{{- end -}}
{{- $sa.name | default $fallback -}}
{{- end -}}
{{/*
Service account name used by the topology updater.
.Values.topologyUpdater.serviceAccount.name wins when set; otherwise the name
is "<fullname>-topology-updater" if the account is created by this chart,
else "default".
*/}}
{{- define "node-feature-discovery.topologyUpdater.serviceAccountName" -}}
{{- $sa := .Values.topologyUpdater.serviceAccount -}}
{{- $fallback := "default" -}}
{{- if $sa.create -}}
{{- $fallback = printf "%s-topology-updater" (include "node-feature-discovery.fullname" .) -}}
{{- end -}}
{{- $sa.name | default $fallback -}}
{{- end -}}
{{/*
Service account name used by nfd-gc.
.Values.gc.serviceAccount.name wins when set; otherwise the name is
"<fullname>-gc" if the account is created by this chart, else "default".
*/}}
{{- define "node-feature-discovery.gc.serviceAccountName" -}}
{{- $sa := .Values.gc.serviceAccount -}}
{{- $fallback := "default" -}}
{{- if $sa.create -}}
{{- $fallback = printf "%s-gc" (include "node-feature-discovery.fullname" .) -}}
{{- end -}}
{{- $sa.name | default $fallback -}}
{{- end -}}

View file

@ -0,0 +1,80 @@
{{- /*
cert-manager Certificates for the NFD components. Rendered only when
.Values.tls.certManager is set; each component's Certificate is additionally
gated on that component's `enable` value.

Every Certificate references the issuer named by
.Values.tls.certManagerCertificate.issuerName (default "nfd-ca-issuer"). The
issuer kind is .Values.tls.certManagerCertificate.issuerKind only when both
issuerName and issuerKind are set; otherwise it is "Issuer".

Each document separator lives inside its component's conditional so that a
disabled component contributes nothing to the rendered output.
*/}}
{{- if .Values.tls.certManager }}
{{- if .Values.master.enable }}
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: nfd-master-cert
  namespace: {{ include "node-feature-discovery.namespace" . }}
spec:
  secretName: nfd-master-cert
  subject:
    organizations:
    - node-feature-discovery
  commonName: nfd-master
  dnsNames:
  # must match the service name
  - {{ include "node-feature-discovery.fullname" . }}-master
  # first one is configured for use by the worker; below are for completeness
  - {{ include "node-feature-discovery.fullname" . }}-master.{{ include "node-feature-discovery.namespace" . }}.svc
  - {{ include "node-feature-discovery.fullname" . }}-master.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local
  issuerRef:
    name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }}
    {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }}
    kind: {{ .Values.tls.certManagerCertificate.issuerKind }}
    {{- else }}
    kind: Issuer
    {{- end }}
    group: cert-manager.io
{{- end }}
{{- if .Values.worker.enable }}
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: nfd-worker-cert
  namespace: {{ include "node-feature-discovery.namespace" . }}
spec:
  secretName: nfd-worker-cert
  subject:
    organizations:
    - node-feature-discovery
  commonName: nfd-worker
  dnsNames:
  - {{ include "node-feature-discovery.fullname" . }}-worker.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local
  issuerRef:
    name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }}
    {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }}
    kind: {{ .Values.tls.certManagerCertificate.issuerKind }}
    {{- else }}
    kind: Issuer
    {{- end }}
    group: cert-manager.io
{{- end }}
{{- if .Values.topologyUpdater.enable }}
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: nfd-topology-updater-cert
  namespace: {{ include "node-feature-discovery.namespace" . }}
spec:
  secretName: nfd-topology-updater-cert
  subject:
    organizations:
    - node-feature-discovery
  commonName: nfd-topology-updater
  dnsNames:
  - {{ include "node-feature-discovery.fullname" . }}-topology-updater.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local
  issuerRef:
    name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }}
    {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }}
    kind: {{ .Values.tls.certManagerCertificate.issuerKind }}
    {{- else }}
    kind: Issuer
    {{- end }}
    group: cert-manager.io
{{- end }}
{{- end }}

View file

@ -0,0 +1,42 @@
{{- if and .Values.tls.certManager (not .Values.tls.certManagerCertificate.issuerName ) }}
# Bootstraps a private CA when cert-manager is enabled and no issuer name is
# configured. See https://cert-manager.io/docs/configuration/selfsigned/#bootstrapping-ca-issuers
#   1. nfd-ca-bootstrap: a self-signed Issuer
#   2. nfd-ca-cert:      a CA Certificate issued by nfd-ca-bootstrap
#   3. nfd-ca-issuer:    a CA Issuer backed by the nfd-ca-cert secret
---
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
  namespace: {{ include "node-feature-discovery.namespace" . }}
  name: nfd-ca-bootstrap
spec:
  selfSigned: {}
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  namespace: {{ include "node-feature-discovery.namespace" . }}
  name: nfd-ca-cert
spec:
  isCA: true
  commonName: nfd-ca-cert
  secretName: nfd-ca-cert
  subject:
    organizations:
    - node-feature-discovery
  issuerRef:
    group: cert-manager.io
    kind: Issuer
    name: nfd-ca-bootstrap
---
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
  namespace: {{ include "node-feature-discovery.namespace" . }}
  name: nfd-ca-issuer
spec:
  ca:
    secretName: nfd-ca-cert
{{- end }}

View file

@ -0,0 +1,133 @@
{{- if and .Values.master.enable .Values.master.rbac.create }}
{{- /*
ClusterRole named after the chart fullname; rendered when both
master.enable and master.rbac.create are set.
*/}}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "node-feature-discovery.fullname" . }}
  labels:
    {{- include "node-feature-discovery.labels" . | nindent 4 }}
rules:
- apiGroups: [""]
  resources: ["nodes", "nodes/status"]
  verbs: ["get", "patch", "update", "list"]
- apiGroups: ["nfd.k8s-sigs.io"]
  resources: ["nodefeatures", "nodefeaturerules", "nodefeaturegroups"]
  verbs: ["get", "list", "watch"]
- apiGroups: ["nfd.k8s-sigs.io"]
  resources: ["nodefeaturegroups/status"]
  verbs: ["patch", "update"]
# Any lease may be created; get/update is limited to the single named lease.
- apiGroups: ["coordination.k8s.io"]
  resources: ["leases"]
  verbs: ["create"]
- apiGroups: ["coordination.k8s.io"]
  resources: ["leases"]
  resourceNames: ["nfd-master.nfd.kubernetes.io"]
  verbs: ["get", "update"]
{{- end }}
{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.rbac.create }}
{{- /*
ClusterRole "<fullname>-topology-updater"; rendered when both
topologyUpdater.enable and topologyUpdater.rbac.create are set.
*/}}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "node-feature-discovery.fullname" . }}-topology-updater
  labels:
    {{- include "node-feature-discovery.labels" . | nindent 4 }}
rules:
- apiGroups: [""]
  resources: ["nodes"]
  verbs: ["get", "list"]
- apiGroups: [""]
  resources: ["namespaces"]
  verbs: ["get"]
- apiGroups: [""]
  resources: ["nodes/proxy"]
  verbs: ["get"]
- apiGroups: [""]
  resources: ["pods"]
  verbs: ["get"]
- apiGroups: ["topology.node.k8s.io"]
  resources: ["noderesourcetopologies"]
  verbs: ["create", "get", "update"]
{{- end }}
{{- if and .Values.gc.enable .Values.gc.rbac.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }}
{{- /*
ClusterRole "<fullname>-gc"; rendered when gc.enable and gc.rbac.create are
set and either the NodeFeature API (featureGates.NodeFeatureAPI together with
enableNodeFeatureApi) or the topology updater is enabled.
*/}}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "node-feature-discovery.fullname" . }}-gc
  labels:
    {{- include "node-feature-discovery.labels" . | nindent 4 }}
rules:
- apiGroups: [""]
  resources: ["nodes"]
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources: ["nodes/proxy"]
  verbs: ["get"]
- apiGroups: ["topology.node.k8s.io"]
  resources: ["noderesourcetopologies"]
  verbs: ["delete", "list"]
- apiGroups: ["nfd.k8s-sigs.io"]
  resources: ["nodefeatures"]
  verbs: ["delete", "list"]
{{- end }}

View file

@ -0,0 +1,52 @@
{{- if and .Values.master.enable .Values.master.rbac.create }}
{{- /*
Binds the ClusterRole named after the chart fullname to the nfd-master
service account in the chart's namespace.
*/}}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    {{- include "node-feature-discovery.labels" . | nindent 4 }}
  name: {{ include "node-feature-discovery.fullname" . }}
subjects:
- kind: ServiceAccount
  namespace: {{ include "node-feature-discovery.namespace" . }}
  name: {{ include "node-feature-discovery.master.serviceAccountName" . }}
roleRef:
  kind: ClusterRole
  name: {{ include "node-feature-discovery.fullname" . }}
  apiGroup: rbac.authorization.k8s.io
{{- end }}
{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.rbac.create }}
{{- /*
Binds the ClusterRole "<fullname>-topology-updater" to the topology updater
service account in the chart's namespace.
*/}}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    {{- include "node-feature-discovery.labels" . | nindent 4 }}
  name: {{ include "node-feature-discovery.fullname" . }}-topology-updater
subjects:
- kind: ServiceAccount
  namespace: {{ include "node-feature-discovery.namespace" . }}
  name: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . }}
roleRef:
  kind: ClusterRole
  name: {{ include "node-feature-discovery.fullname" . }}-topology-updater
  apiGroup: rbac.authorization.k8s.io
{{- end }}
{{- if and .Values.gc.enable .Values.gc.rbac.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }}
{{- /*
Binds the ClusterRole "<fullname>-gc" to the nfd-gc service account in the
chart's namespace. Rendered under the same condition as that ClusterRole.
*/}}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    {{- include "node-feature-discovery.labels" . | nindent 4 }}
  name: {{ include "node-feature-discovery.fullname" . }}-gc
subjects:
- kind: ServiceAccount
  namespace: {{ include "node-feature-discovery.namespace" . }}
  name: {{ include "node-feature-discovery.gc.serviceAccountName" . }}
roleRef:
  kind: ClusterRole
  name: {{ include "node-feature-discovery.fullname" . }}-gc
  apiGroup: rbac.authorization.k8s.io
{{- end }}

View file

@ -0,0 +1,152 @@
{{- /*
Deployment for nfd-master. Rendered only when master.enable is set.

Argument assembly:
  - "-instance" is passed only when master.instance is non-empty.
  - "-port" is passed only when the NodeFeature API is NOT enabled
    (featureGates.NodeFeatureAPI together with enableNodeFeatureApi);
    otherwise "-enable-leader-election" is passed when replicaCount > 1.
  - "-crd-controller" uses master.crdController when it is set; when it is
    unset the flag value is whether master.instance is empty.
  - One "-feature-gates=<key>=<value>" flag is emitted per featureGates entry.
  - The TLS flags and the nfd-master-cert secret volume are added only when
    tls.enable is set.
The "<fullname>-master-conf" ConfigMap is always mounted at
/etc/kubernetes/node-feature-discovery.
*/ -}}
{{- if .Values.master.enable }}
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "node-feature-discovery.fullname" . }}-master
namespace: {{ include "node-feature-discovery.namespace" . }}
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
role: master
{{- with .Values.master.deploymentAnnotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
replicas: {{ .Values.master.replicaCount }}
revisionHistoryLimit: {{ .Values.master.revisionHistoryLimit }}
selector:
matchLabels:
{{- include "node-feature-discovery.selectorLabels" . | nindent 6 }}
role: master
template:
metadata:
labels:
{{- include "node-feature-discovery.selectorLabels" . | nindent 8 }}
role: master
{{- with .Values.master.annotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
spec:
{{- with .Values.priorityClassName }}
priorityClassName: {{ . }}
{{- end }}
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "node-feature-discovery.master.serviceAccountName" . }}
enableServiceLinks: false
securityContext:
{{- toYaml .Values.master.podSecurityContext | nindent 8 }}
hostNetwork: {{ .Values.master.hostNetwork }}
containers:
- name: master
securityContext:
{{- toYaml .Values.master.securityContext | nindent 12 }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
livenessProbe:
{{- toYaml .Values.master.livenessProbe | nindent 12 }}
readinessProbe:
{{- toYaml .Values.master.readinessProbe | nindent 12 }}
ports:
- containerPort: {{ .Values.master.port | default "8080" }}
name: grpc
- containerPort: {{ .Values.master.metricsPort | default "8081" }}
name: metrics
- containerPort: {{ .Values.master.healthPort | default "8082" }}
name: health
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
{{- with .Values.master.extraEnvs }}
{{- toYaml . | nindent 8 }}
{{- end}}
command:
- "nfd-master"
resources:
{{- toYaml .Values.master.resources | nindent 12 }}
args:
{{- if .Values.master.instance | empty | not }}
- "-instance={{ .Values.master.instance }}"
{{- end }}
{{- if not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) }}
- "-port={{ .Values.master.port | default "8080" }}"
{{- else if gt (int .Values.master.replicaCount) 1 }}
- "-enable-leader-election"
{{- end }}
{{- if .Values.master.extraLabelNs | empty | not }}
- "-extra-label-ns={{- join "," .Values.master.extraLabelNs }}"
{{- end }}
{{- if .Values.master.denyLabelNs | empty | not }}
- "-deny-label-ns={{- join "," .Values.master.denyLabelNs }}"
{{- end }}
{{- if .Values.master.resourceLabels | empty | not }}
- "-resource-labels={{- join "," .Values.master.resourceLabels }}"
{{- end }}
{{- if .Values.master.enableTaints }}
- "-enable-taints"
{{- end }}
{{- if .Values.master.crdController | kindIs "invalid" | not }}
- "-crd-controller={{ .Values.master.crdController }}"
{{- else }}
## By default, disable crd controller for other than the default instances
- "-crd-controller={{ .Values.master.instance | empty }}"
{{- end }}
{{- if .Values.master.featureRulesController | kindIs "invalid" | not }}
- "-featurerules-controller={{ .Values.master.featureRulesController }}"
{{- end }}
{{- if .Values.master.resyncPeriod }}
- "-resync-period={{ .Values.master.resyncPeriod }}"
{{- end }}
{{- if .Values.master.nfdApiParallelism | empty | not }}
- "-nfd-api-parallelism={{ .Values.master.nfdApiParallelism }}"
{{- end }}
{{- if .Values.tls.enable }}
- "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt"
- "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key"
- "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt"
{{- end }}
# Go over featureGates and add the feature-gate flag
{{- range $key, $value := .Values.featureGates }}
- "-feature-gates={{ $key }}={{ $value }}"
{{- end }}
- "-metrics={{ .Values.master.metricsPort | default "8081" }}"
- "-grpc-health={{ .Values.master.healthPort | default "8082" }}"
volumeMounts:
{{- if .Values.tls.enable }}
- name: nfd-master-cert
mountPath: "/etc/kubernetes/node-feature-discovery/certs"
readOnly: true
{{- end }}
- name: nfd-master-conf
mountPath: "/etc/kubernetes/node-feature-discovery"
readOnly: true
volumes:
{{- if .Values.tls.enable }}
- name: nfd-master-cert
secret:
secretName: nfd-master-cert
{{- end }}
- name: nfd-master-conf
configMap:
name: {{ include "node-feature-discovery.fullname" . }}-master-conf
items:
- key: nfd-master.conf
path: nfd-master.conf
{{- with .Values.master.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.master.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.master.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}

View file

@ -0,0 +1,85 @@
{{- /*
Deployment for nfd-gc.
Rendered only when gc.enable is set AND either the NodeFeature API is enabled
(featureGates.NodeFeatureAPI together with enableNodeFeatureApi) or
topologyUpdater.enable is set.

The metrics port (gc.metricsPort, default 8081) is used both for the declared
containerPort and for the "-metrics" flag so the two cannot drift apart.
*/ -}}
{{- if and .Values.gc.enable (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) -}}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "node-feature-discovery.fullname" . }}-gc
  namespace: {{ include "node-feature-discovery.namespace" . }}
  labels:
    {{- include "node-feature-discovery.labels" . | nindent 4 }}
    role: gc
  {{- with .Values.gc.deploymentAnnotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  replicas: {{ .Values.gc.replicaCount | default 1 }}
  revisionHistoryLimit: {{ .Values.gc.revisionHistoryLimit }}
  selector:
    matchLabels:
      {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }}
      role: gc
  template:
    metadata:
      labels:
        {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }}
        role: gc
      {{- with .Values.gc.annotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
    spec:
      serviceAccountName: {{ include "node-feature-discovery.gc.serviceAccountName" . }}
      dnsPolicy: ClusterFirstWithHostNet
      {{- with .Values.priorityClassName }}
      priorityClassName: {{ . }}
      {{- end }}
      {{- with .Values.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.gc.podSecurityContext | nindent 8 }}
      hostNetwork: {{ .Values.gc.hostNetwork }}
      containers:
      - name: gc
        image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
        imagePullPolicy: "{{ .Values.image.pullPolicy }}"
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        {{- /* User-supplied env entries are appended to the list above. */}}
        {{- with .Values.gc.extraEnvs }}
        {{- toYaml . | nindent 8 }}
        {{- end}}
        command:
        - "nfd-gc"
        args:
        {{- if .Values.gc.interval | empty | not }}
        - "-gc-interval={{ .Values.gc.interval }}"
        {{- end }}
        {{- /* Keep the listening port in sync with the containerPort below. */}}
        - "-metrics={{ .Values.gc.metricsPort | default "8081" }}"
        resources:
          {{- toYaml .Values.gc.resources | nindent 12 }}
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: [ "ALL" ]
          readOnlyRootFilesystem: true
          runAsNonRoot: true
        ports:
        - name: metrics
          containerPort: {{ .Values.gc.metricsPort | default "8081" }}
      {{- with .Values.gc.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.gc.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.gc.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
{{- end }}

View file

@ -0,0 +1,12 @@
{{- /*
ConfigMap "<fullname>-master-conf".
Stores master.config, serialised with toYaml, under the key nfd-master.conf.
Rendered only when master.enable is set.
*/ -}}
{{- if .Values.master.enable }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "node-feature-discovery.fullname" . }}-master-conf
namespace: {{ include "node-feature-discovery.namespace" . }}
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
data:
nfd-master.conf: |-
{{- .Values.master.config | toYaml | nindent 4 }}
{{- end }}

View file

@ -0,0 +1,12 @@
{{- /*
ConfigMap "<fullname>-topology-updater-conf".
Stores topologyUpdater.config, serialised with toYaml, under the key
nfd-topology-updater.conf. Rendered only when topologyUpdater.enable is set.
*/ -}}
{{- if .Values.topologyUpdater.enable -}}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "node-feature-discovery.fullname" . }}-topology-updater-conf
namespace: {{ include "node-feature-discovery.namespace" . }}
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
data:
nfd-topology-updater.conf: |-
{{- .Values.topologyUpdater.config | toYaml | nindent 4 }}
{{- end }}

View file

@ -0,0 +1,12 @@
{{- /*
ConfigMap "<fullname>-worker-conf".
Stores worker.config, serialised with toYaml, under the key nfd-worker.conf.
Rendered only when worker.enable is set.
*/ -}}
{{- if .Values.worker.enable }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "node-feature-discovery.fullname" . }}-worker-conf
namespace: {{ include "node-feature-discovery.namespace" . }}
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
data:
nfd-worker.conf: |-
{{- .Values.worker.config | toYaml | nindent 4 }}
{{- end }}

View file

@ -0,0 +1,94 @@
# ServiceAccount "<fullname>-prune" used by the prune Job.
# Annotated as a Helm post-delete hook; the hook-delete-policy removes it
# before a new hook run is created and after the hook succeeds.
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "node-feature-discovery.fullname" . }}-prune
namespace: {{ include "node-feature-discovery.namespace" . }}
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": post-delete
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
---
# ClusterRole "<fullname>-prune" (Helm post-delete hook).
# Grants get/patch/update/list on core-group nodes and nodes/status.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "node-feature-discovery.fullname" . }}-prune
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": post-delete
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/status
verbs:
- get
- patch
- update
- list
---
# ClusterRoleBinding "<fullname>-prune" (Helm post-delete hook).
# Binds the "<fullname>-prune" ClusterRole to the "<fullname>-prune"
# ServiceAccount in the chart namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ include "node-feature-discovery.fullname" . }}-prune
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": post-delete
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: {{ include "node-feature-discovery.fullname" . }}-prune
subjects:
- kind: ServiceAccount
name: {{ include "node-feature-discovery.fullname" . }}-prune
namespace: {{ include "node-feature-discovery.namespace" . }}
---
# Job "<fullname>-prune" (Helm post-delete hook).
# Runs "nfd-master -prune" once (restartPolicy: Never) under the
# "<fullname>-prune" ServiceAccount, forwarding "-instance" when
# master.instance is set. It reuses the master's securityContext,
# nodeSelector, affinity and tolerations, and the chart-wide
# imagePullSecrets so the image can be pulled from a private registry.
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ include "node-feature-discovery.fullname" . }}-prune
  namespace: {{ include "node-feature-discovery.namespace" . }}
  labels:
    {{- include "node-feature-discovery.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": post-delete
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
  template:
    metadata:
      labels:
        {{- include "node-feature-discovery.labels" . | nindent 8 }}
        role: prune
    spec:
      serviceAccountName: {{ include "node-feature-discovery.fullname" . }}-prune
      {{- /* Same pull secrets as the other workloads that use this image. */}}
      {{- with .Values.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: nfd-master
          securityContext:
            {{- toYaml .Values.master.securityContext | nindent 12 }}
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          command:
            - "nfd-master"
          args:
            - "-prune"
            {{- if .Values.master.instance | empty | not }}
            - "-instance={{ .Values.master.instance }}"
            {{- end }}
      restartPolicy: Never
      {{- with .Values.master.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.master.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.master.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}

View file

@ -0,0 +1,26 @@
{{- /*
PodMonitor (monitoring.coreos.com/v1) for the chart's pods.
Rendered only when prometheus.enable is set. Scrapes the "metrics" port at
/metrics over http every prometheus.scrapeInterval, in the chart namespace,
selecting pods by the app.kubernetes.io/instance and app.kubernetes.io/name
labels. Entries from prometheus.labels are added to the PodMonitor's labels.
*/ -}}
{{- if .Values.prometheus.enable }}
# Prometheus Monitor Service (Metrics)
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: {{ include "node-feature-discovery.fullname" . }}
  labels:
    {{- include "node-feature-discovery.selectorLabels" . | nindent 4 }}
    {{- with .Values.prometheus.labels }}
    {{- /* Left-trimmed so no whitespace-only line is rendered. */}}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  podMetricsEndpoints:
    - honorLabels: true
      interval: {{ .Values.prometheus.scrapeInterval }}
      path: /metrics
      port: metrics
      scheme: http
  namespaceSelector:
    matchNames:
      - {{ include "node-feature-discovery.namespace" . }}
  selector:
    matchExpressions:
      - {key: app.kubernetes.io/instance, operator: In, values: ["{{ .Release.Name }}"]}
      - {key: app.kubernetes.io/name, operator: In, values: ["{{ include "node-feature-discovery.name" . }}"]}
{{- end }}

View file

@ -0,0 +1,24 @@
{{- /*
Namespaced Role "<fullname>-worker".
Grants create/get/update on nodefeatures (nfd.k8s-sigs.io) and get on core
pods. Rendered only when both worker.enable and worker.rbac.create are set.
*/ -}}
{{- if and .Values.worker.enable .Values.worker.rbac.create }}
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: {{ include "node-feature-discovery.fullname" . }}-worker
namespace: {{ include "node-feature-discovery.namespace" . }}
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
rules:
- apiGroups:
- nfd.k8s-sigs.io
resources:
- nodefeatures
verbs:
- create
- get
- update
- apiGroups:
- ""
resources:
- pods
verbs:
- get
{{- end }}

View file

@ -0,0 +1,18 @@
{{- /*
RoleBinding "<fullname>-worker".
Binds the "<fullname>-worker" Role to the worker ServiceAccount in the chart
namespace. Rendered only when both worker.enable and worker.rbac.create are
set.
*/ -}}
{{- if and .Values.worker.enable .Values.worker.rbac.create }}
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: {{ include "node-feature-discovery.fullname" . }}-worker
namespace: {{ include "node-feature-discovery.namespace" . }}
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: {{ include "node-feature-discovery.fullname" . }}-worker
subjects:
- kind: ServiceAccount
name: {{ include "node-feature-discovery.worker.serviceAccountName" . }}
namespace: {{ include "node-feature-discovery.namespace" . }}
{{- end }}

View file

@ -0,0 +1,20 @@
{{- /*
Service "<fullname>-master" exposing the master's named "grpc" port over TCP
on master.service.port (default 8080).
Rendered only when master.enable is set and the NodeFeature API is NOT
enabled (featureGates.NodeFeatureAPI together with enableNodeFeatureApi).
*/ -}}
{{- if and (not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi)) .Values.master.enable }}
apiVersion: v1
kind: Service
metadata:
name: {{ include "node-feature-discovery.fullname" . }}-master
namespace: {{ include "node-feature-discovery.namespace" . }}
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
role: master
spec:
type: {{ .Values.master.service.type }}
ports:
- port: {{ .Values.master.service.port | default "8080" }}
targetPort: grpc
protocol: TCP
name: grpc
selector:
{{- include "node-feature-discovery.selectorLabels" . | nindent 4 }}
role: master
{{- end}}

View file

@ -0,0 +1,58 @@
{{- /*
ServiceAccount for nfd-master, with optional annotations from
master.serviceAccount.annotations. Rendered only when both master.enable and
master.serviceAccount.create are set.
*/ -}}
{{- if and .Values.master.enable .Values.master.serviceAccount.create }}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "node-feature-discovery.master.serviceAccountName" . }}
namespace: {{ include "node-feature-discovery.namespace" . }}
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
{{- with .Values.master.serviceAccount.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}
{{- /*
ServiceAccount for nfd-topology-updater, with optional annotations from
topologyUpdater.serviceAccount.annotations. Rendered only when both
topologyUpdater.enable and topologyUpdater.serviceAccount.create are set.
*/ -}}
{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.serviceAccount.create }}
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . }}
namespace: {{ include "node-feature-discovery.namespace" . }}
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
{{- with .Values.topologyUpdater.serviceAccount.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}
{{- /*
ServiceAccount for nfd-gc, with optional annotations from
gc.serviceAccount.annotations. Rendered only when gc.enable and
gc.serviceAccount.create are set AND either the NodeFeature API is enabled
(featureGates.NodeFeatureAPI together with enableNodeFeatureApi) or
topologyUpdater.enable is set.
*/ -}}
{{- if and .Values.gc.enable .Values.gc.serviceAccount.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }}
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "node-feature-discovery.gc.serviceAccountName" . }}
namespace: {{ include "node-feature-discovery.namespace" . }}
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
{{- with .Values.gc.serviceAccount.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}
{{- /*
ServiceAccount for nfd-worker, with optional annotations from
worker.serviceAccount.annotations. Rendered only when both worker.enable and
worker.serviceAccount.create are set.
*/ -}}
{{- if and .Values.worker.enable .Values.worker.serviceAccount.create }}
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "node-feature-discovery.worker.serviceAccountName" . }}
namespace: {{ include "node-feature-discovery.namespace" . }}
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
{{- with .Values.worker.serviceAccount.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}

View file

@ -0,0 +1,278 @@
{{- /*
CustomResourceDefinition noderesourcetopologies.topology.node.k8s.io
(kind NodeResourceTopology, short name node-res-topo, cluster-scoped).
Two versions are defined: v1alpha1 (served, not the storage version) and
v1alpha2 (served, storage version). v1alpha2 adds top-level attributes and
no longer lists topologyPolicies as required.
Rendered only when both topologyUpdater.enable and
topologyUpdater.createCRDs are set.
*/ -}}
{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.createCRDs -}}
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
api-approved.kubernetes.io: https://github.com/kubernetes/enhancements/pull/1870
controller-gen.kubebuilder.io/version: v0.11.2
creationTimestamp: null
name: noderesourcetopologies.topology.node.k8s.io
spec:
group: topology.node.k8s.io
names:
kind: NodeResourceTopology
listKind: NodeResourceTopologyList
plural: noderesourcetopologies
shortNames:
- node-res-topo
singular: noderesourcetopology
scope: Cluster
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: NodeResourceTopology describes node resources and their topology.
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
metadata:
type: object
topologyPolicies:
items:
type: string
type: array
zones:
description: ZoneList contains an array of Zone objects.
items:
description: Zone represents a resource topology zone, e.g. socket,
node, die or core.
properties:
attributes:
description: AttributeList contains an array of AttributeInfo objects.
items:
description: AttributeInfo contains one attribute of a Zone.
properties:
name:
type: string
value:
type: string
required:
- name
- value
type: object
type: array
costs:
description: CostList contains an array of CostInfo objects.
items:
description: CostInfo describes the cost (or distance) between
two Zones.
properties:
name:
type: string
value:
format: int64
type: integer
required:
- name
- value
type: object
type: array
name:
type: string
parent:
type: string
resources:
description: ResourceInfoList contains an array of ResourceInfo
objects.
items:
description: ResourceInfo contains information about one resource
type.
properties:
allocatable:
anyOf:
- type: integer
- type: string
description: Allocatable quantity of the resource, corresponding
to allocatable in node status, i.e. total amount of this
resource available to be used by pods.
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
available:
anyOf:
- type: integer
- type: string
description: Available is the amount of this resource currently
available for new (to be scheduled) pods, i.e. Allocatable
minus the resources reserved by currently running pods.
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
capacity:
anyOf:
- type: integer
- type: string
description: Capacity of the resource, corresponding to capacity
in node status, i.e. total amount of this resource that
the node has.
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
name:
description: Name of the resource.
type: string
required:
- allocatable
- available
- capacity
- name
type: object
type: array
type:
type: string
required:
- name
- type
type: object
type: array
required:
- topologyPolicies
- zones
type: object
served: true
storage: false
- name: v1alpha2
schema:
openAPIV3Schema:
description: NodeResourceTopology describes node resources and their topology.
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
attributes:
description: AttributeList contains an array of AttributeInfo objects.
items:
description: AttributeInfo contains one attribute of a Zone.
properties:
name:
type: string
value:
type: string
required:
- name
- value
type: object
type: array
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
metadata:
type: object
topologyPolicies:
description: 'DEPRECATED (to be removed in v1beta1): use top level attributes
if needed'
items:
type: string
type: array
zones:
description: ZoneList contains an array of Zone objects.
items:
description: Zone represents a resource topology zone, e.g. socket,
node, die or core.
properties:
attributes:
description: AttributeList contains an array of AttributeInfo objects.
items:
description: AttributeInfo contains one attribute of a Zone.
properties:
name:
type: string
value:
type: string
required:
- name
- value
type: object
type: array
costs:
description: CostList contains an array of CostInfo objects.
items:
description: CostInfo describes the cost (or distance) between
two Zones.
properties:
name:
type: string
value:
format: int64
type: integer
required:
- name
- value
type: object
type: array
name:
type: string
parent:
type: string
resources:
description: ResourceInfoList contains an array of ResourceInfo
objects.
items:
description: ResourceInfo contains information about one resource
type.
properties:
allocatable:
anyOf:
- type: integer
- type: string
description: Allocatable quantity of the resource, corresponding
to allocatable in node status, i.e. total amount of this
resource available to be used by pods.
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
available:
anyOf:
- type: integer
- type: string
description: Available is the amount of this resource currently
available for new (to be scheduled) pods, i.e. Allocatable
minus the resources reserved by currently running pods.
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
capacity:
anyOf:
- type: integer
- type: string
description: Capacity of the resource, corresponding to capacity
in node status, i.e. total amount of this resource that
the node has.
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
name:
description: Name of the resource.
type: string
required:
- allocatable
- available
- capacity
- name
type: object
type: array
type:
type: string
required:
- name
- type
type: object
type: array
required:
- zones
type: object
served: true
storage: true
status:
acceptedNames:
kind: ""
plural: ""
conditions: []
storedVersions: []
{{- end }}

View file

@ -0,0 +1,171 @@
{{- /*
DaemonSet for nfd-topology-updater. Rendered only when
topologyUpdater.enable is set.

Argument assembly:
  - "-sleep-interval" uses topologyUpdater.updateInterval, falling back to 3s.
  - "-watch-namespace" uses topologyUpdater.watchNamespace, falling back to *.
  - "-pods-fingerprint=false" is passed only when
    topologyUpdater.podSetFingerprint is falsy.
  - "-kubelet-config-uri" is passed (and the host file mounted at
    /host-var/kubelet-config) only when topologyUpdater.kubeletConfigPath is
    non-empty.
  - "-kubelet-state-dir=" (empty) is passed when
    topologyUpdater.kubeletStateDir is empty; otherwise that host directory is
    mounted read-only at /host-var/lib/kubelet.
  - The TLS flags and the nfd-topology-updater-cert secret volume are added
    only when tls.enable is set.
Host mounts: /sys and the kubelet pod-resources socket
(topologyUpdater.kubeletPodResourcesSockPath, falling back to
/var/lib/kubelet/pod-resources/kubelet.sock).
*/ -}}
{{- if .Values.topologyUpdater.enable -}}
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: {{ include "node-feature-discovery.fullname" . }}-topology-updater
namespace: {{ include "node-feature-discovery.namespace" . }}
labels:
{{- include "node-feature-discovery.labels" . | nindent 4 }}
role: topology-updater
{{- with .Values.topologyUpdater.daemonsetAnnotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
revisionHistoryLimit: {{ .Values.topologyUpdater.revisionHistoryLimit }}
selector:
matchLabels:
{{- include "node-feature-discovery.selectorLabels" . | nindent 6 }}
role: topology-updater
template:
metadata:
labels:
{{- include "node-feature-discovery.selectorLabels" . | nindent 8 }}
role: topology-updater
{{- with .Values.topologyUpdater.annotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
spec:
serviceAccountName: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . }}
dnsPolicy: ClusterFirstWithHostNet
{{- with .Values.priorityClassName }}
priorityClassName: {{ . }}
{{- end }}
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
securityContext:
{{- toYaml .Values.topologyUpdater.podSecurityContext | nindent 8 }}
hostNetwork: {{ .Values.topologyUpdater.hostNetwork }}
containers:
- name: topology-updater
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: "{{ .Values.image.pullPolicy }}"
livenessProbe:
{{- toYaml .Values.topologyUpdater.livenessProbe | nindent 10 }}
readinessProbe:
{{- toYaml .Values.topologyUpdater.readinessProbe | nindent 10 }}
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: NODE_ADDRESS
valueFrom:
fieldRef:
fieldPath: status.hostIP
{{- with .Values.topologyUpdater.extraEnvs }}
{{- toYaml . | nindent 8 }}
{{- end}}
command:
- "nfd-topology-updater"
args:
- "-podresources-socket=/host-var/lib/kubelet-podresources/kubelet.sock"
{{- if .Values.topologyUpdater.updateInterval | empty | not }}
- "-sleep-interval={{ .Values.topologyUpdater.updateInterval }}"
{{- else }}
- "-sleep-interval=3s"
{{- end }}
{{- if .Values.topologyUpdater.watchNamespace | empty | not }}
- "-watch-namespace={{ .Values.topologyUpdater.watchNamespace }}"
{{- else }}
- "-watch-namespace=*"
{{- end }}
{{- if .Values.tls.enable }}
- "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt"
- "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key"
- "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt"
{{- end }}
{{- if not .Values.topologyUpdater.podSetFingerprint }}
- "-pods-fingerprint=false"
{{- end }}
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
- "-kubelet-config-uri=file:///host-var/kubelet-config"
{{- end }}
{{- if .Values.topologyUpdater.kubeletStateDir | empty }}
# Disable kubelet state tracking by giving an empty path
- "-kubelet-state-dir="
{{- end }}
- -metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}}
- "-grpc-health={{ .Values.topologyUpdater.healthPort | default "8082" }}"
ports:
- containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}}
name: metrics
- containerPort: {{ .Values.topologyUpdater.healthPort | default "8082" }}
name: health
volumeMounts:
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
- name: kubelet-config
mountPath: /host-var/kubelet-config
{{- end }}
- name: kubelet-podresources-sock
mountPath: /host-var/lib/kubelet-podresources/kubelet.sock
- name: host-sys
mountPath: /host-sys
{{- if .Values.topologyUpdater.kubeletStateDir | empty | not }}
- name: kubelet-state-files
mountPath: /host-var/lib/kubelet
readOnly: true
{{- end }}
{{- if .Values.tls.enable }}
- name: nfd-topology-updater-cert
mountPath: "/etc/kubernetes/node-feature-discovery/certs"
readOnly: true
{{- end }}
- name: nfd-topology-updater-conf
mountPath: "/etc/kubernetes/node-feature-discovery"
readOnly: true
resources:
{{- toYaml .Values.topologyUpdater.resources | nindent 12 }}
securityContext:
{{- toYaml .Values.topologyUpdater.securityContext | nindent 12 }}
volumes:
- name: host-sys
hostPath:
path: "/sys"
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
- name: kubelet-config
hostPath:
path: {{ .Values.topologyUpdater.kubeletConfigPath }}
{{- end }}
- name: kubelet-podresources-sock
hostPath:
{{- if .Values.topologyUpdater.kubeletPodResourcesSockPath | empty | not }}
path: {{ .Values.topologyUpdater.kubeletPodResourcesSockPath }}
{{- else }}
path: /var/lib/kubelet/pod-resources/kubelet.sock
{{- end }}
{{- if .Values.topologyUpdater.kubeletStateDir | empty | not }}
- name: kubelet-state-files
hostPath:
path: {{ .Values.topologyUpdater.kubeletStateDir }}
{{- end }}
- name: nfd-topology-updater-conf
configMap:
name: {{ include "node-feature-discovery.fullname" . }}-topology-updater-conf
items:
- key: nfd-topology-updater.conf
path: nfd-topology-updater.conf
{{- if .Values.tls.enable }}
- name: nfd-topology-updater-cert
secret:
secretName: nfd-topology-updater-cert
{{- end }}
{{- with .Values.topologyUpdater.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.topologyUpdater.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.topologyUpdater.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}

View file

@ -0,0 +1,186 @@
{{- /*
DaemonSet for nfd-worker. Rendered only when worker.enable is set.

priorityClassName is emitted exactly once: worker.priorityClassName takes
precedence and the chart-wide priorityClassName is the fallback, so setting
both never produces a duplicate key in the pod spec.

Argument assembly:
  - "-server" is passed only when the NodeFeature API is NOT enabled
    (featureGates.NodeFeatureAPI together with enableNodeFeatureApi); it
    targets "<fullname>-master" on master.service.port, with the same 8080
    fallback the master Service template uses.
  - One "-feature-gates=<key>=<value>" flag is emitted per featureGates entry.
  - The TLS flags and the nfd-worker-cert secret volume are added only when
    tls.enable is set.
Host paths are mounted read-only; /usr/src is mounted only when
worker.mountUsrSrc is set.
*/ -}}
{{- if .Values.worker.enable }}
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ include "node-feature-discovery.fullname" . }}-worker
  namespace: {{ include "node-feature-discovery.namespace" . }}
  labels:
    {{- include "node-feature-discovery.labels" . | nindent 4 }}
    role: worker
  {{- with .Values.worker.daemonsetAnnotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  revisionHistoryLimit: {{ .Values.worker.revisionHistoryLimit }}
  selector:
    matchLabels:
      {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }}
      role: worker
  template:
    metadata:
      labels:
        {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }}
        role: worker
      {{- with .Values.worker.annotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
    spec:
      dnsPolicy: ClusterFirstWithHostNet
      {{- /* Worker-specific value wins; chart-wide value is the fallback. */}}
      {{- with (.Values.worker.priorityClassName | default .Values.priorityClassName) }}
      priorityClassName: {{ . | quote }}
      {{- end }}
      {{- with .Values.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "node-feature-discovery.worker.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.worker.podSecurityContext | nindent 8 }}
      hostNetwork: {{ .Values.worker.hostNetwork }}
      containers:
      - name: worker
        securityContext:
          {{- toYaml .Values.worker.securityContext | nindent 12 }}
        image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
        imagePullPolicy: {{ .Values.image.pullPolicy }}
        livenessProbe:
          {{- toYaml .Values.worker.livenessProbe | nindent 12 }}
        readinessProbe:
          {{- toYaml .Values.worker.readinessProbe | nindent 12 }}
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_UID
          valueFrom:
            fieldRef:
              fieldPath: metadata.uid
        {{- /* User-supplied env entries are appended to the list above. */}}
        {{- with .Values.worker.extraEnvs }}
        {{- toYaml . | nindent 8 }}
        {{- end}}
        resources:
          {{- toYaml .Values.worker.resources | nindent 12 }}
        command:
        - "nfd-worker"
        args:
        {{- if not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) }}
        - "-server={{ include "node-feature-discovery.fullname" . }}-master:{{ .Values.master.service.port | default "8080" }}"
        {{- end }}
        {{- if .Values.tls.enable }}
        - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt"
        - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key"
        - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt"
        {{- end }}
        # Go over featureGate and add the feature-gate flag
        {{- range $key, $value := .Values.featureGates }}
        - "-feature-gates={{ $key }}={{ $value }}"
        {{- end }}
        - "-metrics={{ .Values.worker.metricsPort | default "8081"}}"
        - "-grpc-health={{ .Values.worker.healthPort | default "8082" }}"
        ports:
        - containerPort: {{ .Values.worker.metricsPort | default "8081"}}
          name: metrics
        - containerPort: {{ .Values.worker.healthPort | default "8082" }}
          name: health
        volumeMounts:
        - name: host-boot
          mountPath: "/host-boot"
          readOnly: true
        - name: host-os-release
          mountPath: "/host-etc/os-release"
          readOnly: true
        - name: host-sys
          mountPath: "/host-sys"
          readOnly: true
        - name: host-usr-lib
          mountPath: "/host-usr/lib"
          readOnly: true
        - name: host-lib
          mountPath: "/host-lib"
          readOnly: true
        - name: host-proc-swaps
          mountPath: "/host-proc/swaps"
          readOnly: true
        {{- if .Values.worker.mountUsrSrc }}
        - name: host-usr-src
          mountPath: "/host-usr/src"
          readOnly: true
        {{- end }}
        - name: source-d
          mountPath: "/etc/kubernetes/node-feature-discovery/source.d/"
          readOnly: true
        - name: features-d
          mountPath: "/etc/kubernetes/node-feature-discovery/features.d/"
          readOnly: true
        - name: nfd-worker-conf
          mountPath: "/etc/kubernetes/node-feature-discovery"
          readOnly: true
        {{- if .Values.tls.enable }}
        - name: nfd-worker-cert
          mountPath: "/etc/kubernetes/node-feature-discovery/certs"
          readOnly: true
        {{- end }}
      volumes:
      - name: host-boot
        hostPath:
          path: "/boot"
      - name: host-os-release
        hostPath:
          path: "/etc/os-release"
      - name: host-sys
        hostPath:
          path: "/sys"
      - name: host-usr-lib
        hostPath:
          path: "/usr/lib"
      - name: host-lib
        hostPath:
          path: "/lib"
      - name: host-proc-swaps
        hostPath:
          path: "/proc/swaps"
      {{- if .Values.worker.mountUsrSrc }}
      - name: host-usr-src
        hostPath:
          path: "/usr/src"
      {{- end }}
      - name: source-d
        hostPath:
          path: "/etc/kubernetes/node-feature-discovery/source.d/"
      - name: features-d
        hostPath:
          path: "/etc/kubernetes/node-feature-discovery/features.d/"
      - name: nfd-worker-conf
        configMap:
          name: {{ include "node-feature-discovery.fullname" . }}-worker-conf
          items:
          - key: nfd-worker.conf
            path: nfd-worker.conf
      {{- if .Values.tls.enable }}
      - name: nfd-worker-cert
        secret:
          secretName: nfd-worker-cert
      {{- end }}
      {{- with .Values.worker.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.worker.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.worker.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
{{- end }}

View file

@ -0,0 +1,593 @@
# Global chart settings. Nested keys are indented under their parent so that
# e.g. `image.repository` and `featureGates.NodeFeatureAPI` resolve correctly
# instead of collapsing into top-level keys.
image:
  repository: registry.k8s.io/nfd/node-feature-discovery
  # This should be set to 'IfNotPresent' for released version
  pullPolicy: IfNotPresent
  # tag, if defined will use the given image tag, else Chart.AppVersion will be used
  # tag
imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""
namespaceOverride: ""
enableNodeFeatureApi: true
featureGates:
  NodeFeatureAPI: true
  NodeFeatureGroupAPI: false
priorityClassName: ""
# Settings read by the chart templates as `.Values.master.*`.
# Everything below is nested under `master:`; the commented lines between the
# DO-NOT-REMOVE markers are a sample of the keys accepted under `config`.
master:
  enable: true
  extraEnvs: []
  hostNetwork: false
  config: ### <NFD-MASTER-CONF-START-DO-NOT-REMOVE>
    # noPublish: false
    # autoDefaultNs: true
    # extraLabelNs: ["added.ns.io","added.kubernetes.io"]
    # denyLabelNs: ["denied.ns.io","denied.kubernetes.io"]
    # resourceLabels: ["vendor-1.com/feature-1","vendor-2.io/feature-2"]
    # enableTaints: false
    # labelWhiteList: "foo"
    # resyncPeriod: "2h"
    # klog:
    #    addDirHeader: false
    #    alsologtostderr: false
    #    logBacktraceAt:
    #    logtostderr: true
    #    skipHeaders: false
    #    stderrthreshold: 2
    #    v: 0
    #    vmodule:
    ##   NOTE: the following options are not dynamically run-time configurable
    ##         and require a nfd-master restart to take effect after being changed
    #    logDir:
    #    logFile:
    #    logFileMaxSize: 1800
    #    skipLogHeaders: false
    # leaderElection:
    #   leaseDuration: 15s
    #   # this value has to be lower than leaseDuration and greater than retryPeriod*1.2
    #   renewDeadline: 10s
    #   # this value has to be greater than 0
    #   retryPeriod: 2s
    # nfdApiParallelism: 10
  ### <NFD-MASTER-CONF-END-DO-NOT-REMOVE>
  # The TCP port that nfd-master listens for incoming requests. Default: 8080
  # Deprecated: this parameter is related to the deprecated gRPC API and will
  # be removed with it in a future release
  port: 8080
  metricsPort: 8081
  healthPort: 8082
  # The three keys below are intentionally empty (null) siblings, not a nested chain.
  instance:
  featureApi:
  resyncPeriod:
  denyLabelNs: []
  extraLabelNs: []
  resourceLabels: []
  enableTaints: false
  crdController: null
  featureRulesController: null
  nfdApiParallelism: null
  deploymentAnnotations: {}
  replicaCount: 1
  podSecurityContext: {}
    # fsGroup: 2000
  securityContext:
    allowPrivilegeEscalation: false
    capabilities:
      drop: [ "ALL" ]
    readOnlyRootFilesystem: true
    runAsNonRoot: true
    # runAsUser: 1000
  serviceAccount:
    # Specifies whether a service account should be created
    create: true
    # Annotations to add to the service account
    annotations: {}
    # The name of the service account to use.
    # If not set and create is true, a name is generated using the fullname template
    name:
  # specify how many old ReplicaSets for the Deployment to retain.
  revisionHistoryLimit:
  rbac:
    create: true
  service:
    type: ClusterIP
    port: 8080
  resources:
    limits:
      memory: 4Gi
    requests:
      cpu: 100m
      # You may want to use the same value for `requests.memory` and `limits.memory`. The “requests” value affects scheduling to accommodate pods on nodes.
      # If there is a large difference between “requests” and “limits” and nodes experience memory pressure, the kernel may invoke
      # the OOM Killer, even if the memory does not exceed the “limits” threshold. This can cause unexpected pod evictions. Memory
      # cannot be compressed and once allocated to a pod, it can only be reclaimed by killing the pod.
      # Natan Yellin 22/09/2022 https://home.robusta.dev/blog/kubernetes-memory-limit
      memory: 128Mi
  nodeSelector: {}
  tolerations:
    - key: "node-role.kubernetes.io/master"
      operator: "Equal"
      value: ""
      effect: "NoSchedule"
    - key: "node-role.kubernetes.io/control-plane"
      operator: "Equal"
      value: ""
      effect: "NoSchedule"
  annotations: {}
  affinity:
    nodeAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 1
          preference:
            matchExpressions:
              - key: "node-role.kubernetes.io/master"
                operator: In
                values: [""]
        - weight: 1
          preference:
            matchExpressions:
              - key: "node-role.kubernetes.io/control-plane"
                operator: In
                values: [""]
  livenessProbe:
    grpc:
      port: 8082
    initialDelaySeconds: 10
    # failureThreshold: 3
    # periodSeconds: 10
  readinessProbe:
    grpc:
      port: 8082
    initialDelaySeconds: 5
    failureThreshold: 10
    # periodSeconds: 10
# Settings read by the chart templates as `.Values.worker.*`. The worker
# DaemonSet template consumes mountUsrSrc, nodeSelector, affinity, tolerations
# and priorityClassName from here.
# The commented lines between the DO-NOT-REMOVE markers are a sample worker
# configuration; uncomment and adapt the parts you need under `config`.
worker:
  enable: true
  extraEnvs: []
  hostNetwork: false
  config: ### <NFD-WORKER-CONF-START-DO-NOT-REMOVE>
    #core:
    #  labelWhiteList:
    #  noPublish: false
    #  sleepInterval: 60s
    #  featureSources: [all]
    #  labelSources: [all]
    #  klog:
    #    addDirHeader: false
    #    alsologtostderr: false
    #    logBacktraceAt:
    #    logtostderr: true
    #    skipHeaders: false
    #    stderrthreshold: 2
    #    v: 0
    #    vmodule:
    ##   NOTE: the following options are not dynamically run-time configurable
    ##         and require a nfd-worker restart to take effect after being changed
    #    logDir:
    #    logFile:
    #    logFileMaxSize: 1800
    #    skipLogHeaders: false
    #sources:
    #  cpu:
    #    cpuid:
    ##     NOTE: whitelist has priority over blacklist
    #      attributeBlacklist:
    #        - "AVX10"
    #        - "BMI1"
    #        - "BMI2"
    #        - "CLMUL"
    #        - "CMOV"
    #        - "CX16"
    #        - "ERMS"
    #        - "F16C"
    #        - "HTT"
    #        - "LZCNT"
    #        - "MMX"
    #        - "MMXEXT"
    #        - "NX"
    #        - "POPCNT"
    #        - "RDRAND"
    #        - "RDSEED"
    #        - "RDTSCP"
    #        - "SGX"
    #        - "SSE"
    #        - "SSE2"
    #        - "SSE3"
    #        - "SSE4"
    #        - "SSE42"
    #        - "SSSE3"
    #        - "TDX_GUEST"
    #      attributeWhitelist:
    #  kernel:
    #    kconfigFile: "/path/to/kconfig"
    #    configOpts:
    #      - "NO_HZ"
    #      - "X86"
    #      - "DMI"
    #  pci:
    #    deviceClassWhitelist:
    #      - "0200"
    #      - "03"
    #      - "12"
    #    deviceLabelFields:
    #      - "class"
    #      - "vendor"
    #      - "device"
    #      - "subsystem_vendor"
    #      - "subsystem_device"
    #  usb:
    #    deviceClassWhitelist:
    #      - "0e"
    #      - "ef"
    #      - "fe"
    #      - "ff"
    #    deviceLabelFields:
    #      - "class"
    #      - "vendor"
    #      - "device"
    #  local:
    #    hooksEnabled: false
    #  custom:
    #    # The following feature demonstrates the capabilities of the matchFeatures
    #    - name: "my custom rule"
    #      labels:
    #        "vendor.io/my-ng-feature": "true"
    #      # matchFeatures implements a logical AND over all matcher terms in the
    #      # list (i.e. all of the terms, or per-feature matchers, must match)
    #      matchFeatures:
    #        - feature: cpu.cpuid
    #          matchExpressions:
    #            AVX512F: {op: Exists}
    #        - feature: cpu.cstate
    #          matchExpressions:
    #            enabled: {op: IsTrue}
    #        - feature: cpu.pstate
    #          matchExpressions:
    #            no_turbo: {op: IsFalse}
    #            scaling_governor: {op: In, value: ["performance"]}
    #        - feature: cpu.rdt
    #          matchExpressions:
    #            RDTL3CA: {op: Exists}
    #        - feature: cpu.sst
    #          matchExpressions:
    #            bf.enabled: {op: IsTrue}
    #        - feature: cpu.topology
    #          matchExpressions:
    #            hardware_multithreading: {op: IsFalse}
    #
    #        - feature: kernel.config
    #          matchExpressions:
    #            X86: {op: Exists}
    #            LSM: {op: InRegexp, value: ["apparmor"]}
    #        - feature: kernel.loadedmodule
    #          matchExpressions:
    #            e1000e: {op: Exists}
    #        - feature: kernel.selinux
    #          matchExpressions:
    #            enabled: {op: IsFalse}
    #        - feature: kernel.version
    #          matchExpressions:
    #            major: {op: In, value: ["5"]}
    #            minor: {op: Gt, value: ["10"]}
    #
    #        - feature: storage.block
    #          matchExpressions:
    #            rotational: {op: In, value: ["0"]}
    #            dax: {op: In, value: ["0"]}
    #
    #        - feature: network.device
    #          matchExpressions:
    #            operstate: {op: In, value: ["up"]}
    #            speed: {op: Gt, value: ["100"]}
    #
    #        - feature: memory.numa
    #          matchExpressions:
    #            node_count: {op: Gt, value: ["2"]}
    #        - feature: memory.nv
    #          matchExpressions:
    #            devtype: {op: In, value: ["nd_dax"]}
    #            mode: {op: In, value: ["memory"]}
    #
    #        - feature: system.osrelease
    #          matchExpressions:
    #            ID: {op: In, value: ["fedora", "centos"]}
    #        - feature: system.name
    #          matchExpressions:
    #            nodename: {op: InRegexp, value: ["^worker-X"]}
    #
    #        - feature: local.label
    #          matchExpressions:
    #            custom-feature-knob: {op: Gt, value: ["100"]}
    #
    #    # The following feature demonstrates the capabilities of the matchAny
    #    - name: "my matchAny rule"
    #      labels:
    #        "vendor.io/my-ng-feature-2": "my-value"
    #      # matchAny implements a logical OR over all elements (sub-matchers) in
    #      # the list (i.e. at least one feature matcher must match)
    #      matchAny:
    #        - matchFeatures:
    #            - feature: kernel.loadedmodule
    #              matchExpressions:
    #                driver-module-X: {op: Exists}
    #            - feature: pci.device
    #              matchExpressions:
    #                vendor: {op: In, value: ["8086"]}
    #                class: {op: In, value: ["0200"]}
    #        - matchFeatures:
    #            - feature: kernel.loadedmodule
    #              matchExpressions:
    #                driver-module-Y: {op: Exists}
    #            - feature: usb.device
    #              matchExpressions:
    #                vendor: {op: In, value: ["8086"]}
    #                class: {op: In, value: ["02"]}
    #
    #    - name: "avx wildcard rule"
    #      labels:
    #        "my-avx-feature": "true"
    #      matchFeatures:
    #        - feature: cpu.cpuid
    #          matchName: {op: InRegexp, value: ["^AVX512"]}
    #
    #    # The following features demonstrate label templating capabilities
    #    - name: "my template rule"
    #      labelsTemplate: |
    #        {{ range .system.osrelease }}vendor.io/my-system-feature.{{ .Name }}={{ .Value }}
    #        {{ end }}
    #      matchFeatures:
    #        - feature: system.osrelease
    #          matchExpressions:
    #            ID: {op: InRegexp, value: ["^open.*"]}
    #            VERSION_ID.major: {op: In, value: ["13", "15"]}
    #
    #    - name: "my template rule 2"
    #      labelsTemplate: |
    #        {{ range .pci.device }}vendor.io/my-pci-device.{{ .class }}-{{ .device }}=with-cpuid
    #        {{ end }}
    #      matchFeatures:
    #        - feature: pci.device
    #          matchExpressions:
    #            class: {op: InRegexp, value: ["^06"]}
    #            vendor: ["8086"]
    #        - feature: cpu.cpuid
    #          matchExpressions:
    #            AVX: {op: Exists}
    #
    #    # The following examples demonstrate vars field and back-referencing
    #    # previous labels and vars
    #    - name: "my dummy kernel rule"
    #      labels:
    #        "vendor.io/my.kernel.feature": "true"
    #      matchFeatures:
    #        - feature: kernel.version
    #          matchExpressions:
    #            major: {op: Gt, value: ["2"]}
    #
    #    - name: "my dummy rule with no labels"
    #      vars:
    #        "my.dummy.var": "1"
    #      matchFeatures:
    #        - feature: cpu.cpuid
    #          matchExpressions: {}
    #
    #    - name: "my rule using backrefs"
    #      labels:
    #        "vendor.io/my.backref.feature": "true"
    #      matchFeatures:
    #        - feature: rule.matched
    #          matchExpressions:
    #            vendor.io/my.kernel.feature: {op: IsTrue}
    #            my.dummy.var: {op: Gt, value: ["0"]}
    #
    #    - name: "kconfig template rule"
    #      labelsTemplate: |
    #        {{ range .kernel.config }}kconfig-{{ .Name }}={{ .Value }}
    #        {{ end }}
    #      matchFeatures:
    #        - feature: kernel.config
    #          matchName: {op: In, value: ["SWAP", "X86", "ARM"]}
  ### <NFD-WORKER-CONF-END-DO-NOT-REMOVE>
  metricsPort: 8081
  healthPort: 8082
  daemonsetAnnotations: {}
  podSecurityContext: {}
    # fsGroup: 2000
  securityContext:
    allowPrivilegeEscalation: false
    capabilities:
      drop: [ "ALL" ]
    readOnlyRootFilesystem: true
    runAsNonRoot: true
    # runAsUser: 1000
  livenessProbe:
    grpc:
      port: 8082
    initialDelaySeconds: 10
    # failureThreshold: 3
    # periodSeconds: 10
  readinessProbe:
    grpc:
      port: 8082
    initialDelaySeconds: 5
    failureThreshold: 10
    # periodSeconds: 10
  serviceAccount:
    # Specifies whether a service account should be created.
    # We create this by default to make it easier for downstream users to apply PodSecurityPolicies.
    create: true
    # Annotations to add to the service account
    annotations: {}
    # The name of the service account to use.
    # If not set and create is true, a name is generated using the fullname template
    name:
  # specify how many old ControllerRevisions for the DaemonSet to retain.
  revisionHistoryLimit:
  rbac:
    create: true
  # Allow users to mount the hostPath /usr/src, useful for RHCOS on s390x
  # Does not work on systems without /usr/src AND a read-only /usr, such as Talos
  mountUsrSrc: false
  resources:
    limits:
      memory: 512Mi
    requests:
      cpu: 5m
      memory: 64Mi
  nodeSelector: {}
  tolerations: []
  annotations: {}
  affinity: {}
  priorityClassName: ""
# Settings nested under `topologyUpdater:` (disabled by default via `enable: false`).
topologyUpdater:
  config: ### <NFD-TOPOLOGY-UPDATER-CONF-START-DO-NOT-REMOVE>
    ## key = node name, value = list of resources to be excluded.
    ## use * to exclude from all nodes.
    ## an example of what the exclude list should look like
    ## (the wildcard key is quoted because a bare * is a YAML alias indicator)
    #excludeList:
    #  node1: [cpu]
    #  node2: [memory, example/deviceA]
    #  '*': [hugepages-2Mi]
  ### <NFD-TOPOLOGY-UPDATER-CONF-END-DO-NOT-REMOVE>
  enable: false
  createCRDs: false
  extraEnvs: []
  hostNetwork: false
  serviceAccount:
    create: true
    annotations: {}
    name:
  # specify how many old ControllerRevisions for the DaemonSet to retain.
  revisionHistoryLimit:
  rbac:
    create: true
  metricsPort: 8081
  healthPort: 8082
  kubeletConfigPath:
  kubeletPodResourcesSockPath:
  updateInterval: 60s
  watchNamespace: "*"
  kubeletStateDir: /var/lib/kubelet
  podSecurityContext: {}
  securityContext:
    allowPrivilegeEscalation: false
    capabilities:
      drop: [ "ALL" ]
    readOnlyRootFilesystem: true
    runAsUser: 0
  livenessProbe:
    grpc:
      port: 8082
    initialDelaySeconds: 10
    # failureThreshold: 3
    # periodSeconds: 10
  readinessProbe:
    grpc:
      port: 8082
    initialDelaySeconds: 5
    failureThreshold: 10
    # periodSeconds: 10
  resources:
    limits:
      memory: 60Mi
    requests:
      cpu: 50m
      memory: 40Mi
  nodeSelector: {}
  tolerations: []
  annotations: {}
  daemonsetAnnotations: {}
  affinity: {}
  podSetFingerprint: true
# Settings nested under `gc:`; `interval` is the period value passed to that
# component (presumably the garbage-collection interval — confirm against the
# gc Deployment template).
gc:
  enable: true
  extraEnvs: []
  hostNetwork: false
  replicaCount: 1
  serviceAccount:
    create: true
    annotations: {}
    name:
  rbac:
    create: true
  interval: 1h
  podSecurityContext: {}
  resources:
    limits:
      memory: 1Gi
    requests:
      cpu: 10m
      memory: 128Mi
  metricsPort: 8081
  nodeSelector: {}
  tolerations: []
  annotations: {}
  deploymentAnnotations: {}
  affinity: {}
  # specify how many old ReplicaSets for the Deployment to retain.
  revisionHistoryLimit:
# Optionally use encryption for worker <--> master comms
# TODO: verify hostname is not yet supported
#
# If you do not enable certManager (and have it installed) you will
# need to manually, or otherwise, provision the TLS certs as secrets
#
# `tls.enable` gates the `nfd-worker-cert` secret volume and its mount at
# /etc/kubernetes/node-feature-discovery/certs in the worker DaemonSet.
tls:
  enable: false
  certManager: false
  certManagerCertificate:
    issuerKind:
    issuerName:
# Prometheus integration settings (off by default); all keys are nested
# under `prometheus:`.
prometheus:
  enable: false
  scrapeInterval: 10s
  labels: {}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,797 @@
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.4
name: nvidiadrivers.nvidia.com
spec:
group: nvidia.com
names:
kind: NVIDIADriver
listKind: NVIDIADriverList
plural: nvidiadrivers
shortNames:
- nvd
- nvdriver
- nvdrivers
singular: nvidiadriver
scope: Cluster
versions:
- additionalPrinterColumns:
- jsonPath: .status.state
name: Status
type: string
- jsonPath: .metadata.creationTimestamp
name: Age
type: string
name: v1alpha1
schema:
openAPIV3Schema:
description: NVIDIADriver is the Schema for the nvidiadrivers API
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
properties:
annotations:
additionalProperties:
type: string
description: |-
Optional: Annotations is an unstructured key value map stored with a resource that may be
set by external tools to store and retrieve arbitrary metadata. They are not
queryable and should be preserved when modifying objects.
type: object
args:
description: 'Optional: List of arguments'
items:
type: string
type: array
certConfig:
description: 'Optional: Custom certificates configuration for NVIDIA
Driver container'
properties:
name:
type: string
type: object
driverType:
default: gpu
description: DriverType defines NVIDIA driver type
enum:
- gpu
- vgpu
- vgpu-host-manager
type: string
x-kubernetes-validations:
- message: driverType is an immutable field. Please create a new NvidiaDriver
resource instead when you want to change this setting.
rule: self == oldSelf
env:
description: 'Optional: List of environment variables'
items:
description: EnvVar represents an environment variable present in
a Container.
properties:
name:
description: Name of the environment variable.
type: string
value:
description: Value of the environment variable.
type: string
required:
- name
type: object
type: array
gdrcopy:
description: GDRCopy defines the spec for GDRCopy driver
properties:
args:
description: 'Optional: List of arguments'
items:
type: string
type: array
enabled:
description: Enabled indicates if GDRCopy is enabled through GPU
operator
type: boolean
env:
description: 'Optional: List of environment variables'
items:
description: EnvVar represents an environment variable present
in a Container.
properties:
name:
description: Name of the environment variable.
type: string
value:
description: Value of the environment variable.
type: string
required:
- name
type: object
type: array
image:
description: GDRCopy driver image name
pattern: '[a-zA-Z0-9\-]+'
type: string
imagePullPolicy:
description: Image pull policy
type: string
imagePullSecrets:
description: Image pull secrets
items:
type: string
type: array
repository:
description: GDRCopy diver image repository
type: string
version:
description: GDRCopy driver image tag
type: string
type: object
gds:
description: GPUDirectStorage defines the spec for GDS driver
properties:
args:
description: 'Optional: List of arguments'
items:
type: string
type: array
enabled:
description: Enabled indicates if GPUDirect Storage is enabled
through GPU operator
type: boolean
env:
description: 'Optional: List of environment variables'
items:
description: EnvVar represents an environment variable present
in a Container.
properties:
name:
description: Name of the environment variable.
type: string
value:
description: Value of the environment variable.
type: string
required:
- name
type: object
type: array
image:
description: NVIDIA GPUDirect Storage Driver image name
pattern: '[a-zA-Z0-9\-]+'
type: string
imagePullPolicy:
description: Image pull policy
type: string
imagePullSecrets:
description: Image pull secrets
items:
type: string
type: array
repository:
description: NVIDIA GPUDirect Storage Driver image repository
type: string
version:
description: NVIDIA GPUDirect Storage Driver image tag
type: string
type: object
image:
default: nvcr.io/nvidia/driver
description: NVIDIA Driver container image name
type: string
imagePullPolicy:
description: Image pull policy
type: string
imagePullSecrets:
description: Image pull secrets
items:
type: string
type: array
kernelModuleConfig:
description: 'Optional: Kernel module configuration parameters for
the NVIDIA Driver'
properties:
name:
type: string
type: object
labels:
additionalProperties:
type: string
description: |-
Optional: Map of string keys and values that can be used to organize and categorize
(scope and select) objects. May match selectors of replication controllers
and services.
type: object
licensingConfig:
description: 'Optional: Licensing configuration for NVIDIA vGPU licensing'
properties:
name:
type: string
nlsEnabled:
description: NLSEnabled indicates if NVIDIA Licensing System is
used for licensing.
type: boolean
type: object
livenessProbe:
description: NVIDIA Driver container liveness probe settings
properties:
failureThreshold:
description: |-
Minimum consecutive failures for the probe to be considered failed after having succeeded.
Defaults to 3. Minimum value is 1.
format: int32
minimum: 1
type: integer
initialDelaySeconds:
description: |-
Number of seconds after the container has started before liveness probes are initiated.
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes
format: int32
type: integer
periodSeconds:
description: |-
How often (in seconds) to perform the probe.
Default to 10 seconds. Minimum value is 1.
format: int32
minimum: 1
type: integer
successThreshold:
description: |-
Minimum consecutive successes for the probe to be considered successful after having failed.
Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1.
format: int32
minimum: 1
type: integer
timeoutSeconds:
description: |-
Number of seconds after which the probe times out.
Defaults to 1 second. Minimum value is 1.
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes
format: int32
minimum: 1
type: integer
type: object
manager:
description: Manager represents configuration for NVIDIA Driver Manager
initContainer
properties:
env:
description: 'Optional: List of environment variables'
items:
description: EnvVar represents an environment variable present
in a Container.
properties:
name:
description: Name of the environment variable.
type: string
value:
description: Value of the environment variable.
type: string
required:
- name
type: object
type: array
image:
description: Image represents NVIDIA Driver Manager image name
pattern: '[a-zA-Z0-9\-]+'
type: string
imagePullPolicy:
description: Image pull policy
type: string
imagePullSecrets:
description: Image pull secrets
items:
type: string
type: array
repository:
description: Repository represents Driver Managerrepository path
type: string
version:
description: Version represents NVIDIA Driver Manager image tag(version)
type: string
type: object
nodeAffinity:
description: Affinity specifies node affinity rules for driver pods
properties:
preferredDuringSchedulingIgnoredDuringExecution:
description: |-
The scheduler will prefer to schedule pods to nodes that satisfy
the affinity expressions specified by this field, but it may choose
a node that violates one or more of the expressions. The node that is
most preferred is the one with the greatest sum of weights, i.e.
for each node that meets all of the scheduling requirements (resource
request, requiredDuringScheduling affinity expressions, etc.),
compute a sum by iterating through the elements of this field and adding
"weight" to the sum if the node matches the corresponding matchExpressions; the
node(s) with the highest sum are the most preferred.
items:
description: |-
An empty preferred scheduling term matches all objects with implicit weight 0
(i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).
properties:
preference:
description: A node selector term, associated with the corresponding
weight.
properties:
matchExpressions:
description: A list of node selector requirements by
node's labels.
items:
description: |-
A node selector requirement is a selector that contains values, a key, and an operator
that relates the key and values.
properties:
key:
description: The label key that the selector applies
to.
type: string
operator:
description: |-
Represents a key's relationship to a set of values.
Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.
type: string
values:
description: |-
An array of string values. If the operator is In or NotIn,
the values array must be non-empty. If the operator is Exists or DoesNotExist,
the values array must be empty. If the operator is Gt or Lt, the values
array must have a single element, which will be interpreted as an integer.
This array is replaced during a strategic merge patch.
items:
type: string
type: array
x-kubernetes-list-type: atomic
required:
- key
- operator
type: object
type: array
x-kubernetes-list-type: atomic
matchFields:
description: A list of node selector requirements by
node's fields.
items:
description: |-
A node selector requirement is a selector that contains values, a key, and an operator
that relates the key and values.
properties:
key:
description: The label key that the selector applies
to.
type: string
operator:
description: |-
Represents a key's relationship to a set of values.
Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.
type: string
values:
description: |-
An array of string values. If the operator is In or NotIn,
the values array must be non-empty. If the operator is Exists or DoesNotExist,
the values array must be empty. If the operator is Gt or Lt, the values
array must have a single element, which will be interpreted as an integer.
This array is replaced during a strategic merge patch.
items:
type: string
type: array
x-kubernetes-list-type: atomic
required:
- key
- operator
type: object
type: array
x-kubernetes-list-type: atomic
type: object
x-kubernetes-map-type: atomic
weight:
description: Weight associated with matching the corresponding
nodeSelectorTerm, in the range 1-100.
format: int32
type: integer
required:
- preference
- weight
type: object
type: array
x-kubernetes-list-type: atomic
requiredDuringSchedulingIgnoredDuringExecution:
description: |-
If the affinity requirements specified by this field are not met at
scheduling time, the pod will not be scheduled onto the node.
If the affinity requirements specified by this field cease to be met
at some point during pod execution (e.g. due to an update), the system
may or may not try to eventually evict the pod from its node.
properties:
nodeSelectorTerms:
description: Required. A list of node selector terms. The
terms are ORed.
items:
description: |-
A null or empty node selector term matches no objects. The requirements of
them are ANDed.
The TopologySelectorTerm type implements a subset of the NodeSelectorTerm.
properties:
matchExpressions:
description: A list of node selector requirements by
node's labels.
items:
description: |-
A node selector requirement is a selector that contains values, a key, and an operator
that relates the key and values.
properties:
key:
description: The label key that the selector applies
to.
type: string
operator:
description: |-
Represents a key's relationship to a set of values.
Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.
type: string
values:
description: |-
An array of string values. If the operator is In or NotIn,
the values array must be non-empty. If the operator is Exists or DoesNotExist,
the values array must be empty. If the operator is Gt or Lt, the values
array must have a single element, which will be interpreted as an integer.
This array is replaced during a strategic merge patch.
items:
type: string
type: array
x-kubernetes-list-type: atomic
required:
- key
- operator
type: object
type: array
x-kubernetes-list-type: atomic
matchFields:
description: A list of node selector requirements by
node's fields.
items:
description: |-
A node selector requirement is a selector that contains values, a key, and an operator
that relates the key and values.
properties:
key:
description: The label key that the selector applies
to.
type: string
operator:
description: |-
Represents a key's relationship to a set of values.
Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.
type: string
values:
description: |-
An array of string values. If the operator is In or NotIn,
the values array must be non-empty. If the operator is Exists or DoesNotExist,
the values array must be empty. If the operator is Gt or Lt, the values
array must have a single element, which will be interpreted as an integer.
This array is replaced during a strategic merge patch.
items:
type: string
type: array
x-kubernetes-list-type: atomic
required:
- key
- operator
type: object
type: array
x-kubernetes-list-type: atomic
type: object
x-kubernetes-map-type: atomic
type: array
x-kubernetes-list-type: atomic
required:
- nodeSelectorTerms
type: object
x-kubernetes-map-type: atomic
type: object
nodeSelector:
additionalProperties:
type: string
description: NodeSelector specifies a selector for installation of
NVIDIA driver
type: object
priorityClassName:
description: 'Optional: Set priorityClassName'
type: string
rdma:
description: GPUDirectRDMA defines the spec for NVIDIA Peer Memory
driver
properties:
enabled:
description: Enabled indicates if GPUDirect RDMA is enabled through
GPU operator
type: boolean
useHostMofed:
description: UseHostMOFED indicates to use MOFED drivers directly
installed on the host to enable GPUDirect RDMA
type: boolean
type: object
readinessProbe:
description: NVIDIA Driver container readiness probe settings
properties:
failureThreshold:
description: |-
Minimum consecutive failures for the probe to be considered failed after having succeeded.
Defaults to 3. Minimum value is 1.
format: int32
minimum: 1
type: integer
initialDelaySeconds:
description: |-
Number of seconds after the container has started before liveness probes are initiated.
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes
format: int32
type: integer
periodSeconds:
description: |-
How often (in seconds) to perform the probe.
Default to 10 seconds. Minimum value is 1.
format: int32
minimum: 1
type: integer
successThreshold:
description: |-
Minimum consecutive successes for the probe to be considered successful after having failed.
Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1.
format: int32
minimum: 1
type: integer
timeoutSeconds:
description: |-
Number of seconds after which the probe times out.
Defaults to 1 second. Minimum value is 1.
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes
format: int32
minimum: 1
type: integer
type: object
repoConfig:
description: 'Optional: Custom repo configuration for NVIDIA Driver
container'
properties:
name:
type: string
type: object
repository:
description: NVIDIA Driver repository
type: string
resources:
description: 'Optional: Define resources requests and limits for each
pod'
properties:
limits:
additionalProperties:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
description: |-
Limits describes the maximum amount of compute resources allowed.
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
type: object
requests:
additionalProperties:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
description: |-
Requests describes the minimum amount of compute resources required.
If Requests is omitted for a container, it defaults to Limits if that is explicitly specified,
otherwise to an implementation-defined value. Requests cannot exceed Limits.
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
type: object
type: object
startupProbe:
description: NVIDIA Driver container startup probe settings
properties:
failureThreshold:
description: |-
Minimum consecutive failures for the probe to be considered failed after having succeeded.
Defaults to 3. Minimum value is 1.
format: int32
minimum: 1
type: integer
initialDelaySeconds:
description: |-
Number of seconds after the container has started before liveness probes are initiated.
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes
format: int32
type: integer
periodSeconds:
description: |-
How often (in seconds) to perform the probe.
Default to 10 seconds. Minimum value is 1.
format: int32
minimum: 1
type: integer
successThreshold:
description: |-
Minimum consecutive successes for the probe to be considered successful after having failed.
Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1.
format: int32
minimum: 1
type: integer
timeoutSeconds:
description: |-
Number of seconds after which the probe times out.
Defaults to 1 second. Minimum value is 1.
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes
format: int32
minimum: 1
type: integer
type: object
tolerations:
description: 'Optional: Set tolerations'
items:
description: |-
The pod this Toleration is attached to tolerates any taint that matches
the triple <key,value,effect> using the matching operator <operator>.
properties:
effect:
description: |-
Effect indicates the taint effect to match. Empty means match all taint effects.
When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.
type: string
key:
description: |-
Key is the taint key that the toleration applies to. Empty means match all taint keys.
If the key is empty, operator must be Exists; this combination means to match all values and all keys.
type: string
operator:
description: |-
Operator represents a key's relationship to the value.
Valid operators are Exists and Equal. Defaults to Equal.
Exists is equivalent to wildcard for value, so that a pod can
tolerate all taints of a particular category.
type: string
tolerationSeconds:
description: |-
TolerationSeconds represents the period of time the toleration (which must be
of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default,
it is not set, which means tolerate the taint forever (do not evict). Zero and
negative values will be treated as 0 (evict immediately) by the system.
format: int64
type: integer
value:
description: |-
Value is the taint value the toleration matches to.
If the operator is Exists, the value should be empty, otherwise just a regular string.
type: string
type: object
type: array
useOpenKernelModules:
description: UseOpenKernelModules indicates if the open GPU kernel
modules should be used
type: boolean
usePrecompiled:
description: UsePrecompiled indicates if deployment of NVIDIA Driver
using pre-compiled modules is enabled
type: boolean
x-kubernetes-validations:
- message: usePrecompiled is an immutable field. Please create a new
NvidiaDriver resource instead when you want to change this setting.
rule: self == oldSelf
version:
description: NVIDIA Driver version (or just branch for precompiled
drivers)
type: string
virtualTopologyConfig:
description: 'Optional: Virtual Topology Daemon configuration for
NVIDIA vGPU drivers'
properties:
name:
description: 'Optional: Config name representing virtual topology
daemon configuration file nvidia-topologyd.conf'
type: string
type: object
required:
- driverType
- image
type: object
status:
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
properties:
conditions:
description: Conditions is a list of conditions representing the NVIDIADriver's
current state.
items:
description: Condition contains details for one aspect of the current
state of this API Resource.
properties:
lastTransitionTime:
description: |-
lastTransitionTime is the last time the condition transitioned from one status to another.
This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
format: date-time
type: string
message:
description: |-
message is a human readable message indicating details about the transition.
This may be an empty string.
maxLength: 32768
type: string
observedGeneration:
description: |-
observedGeneration represents the .metadata.generation that the condition was set based upon.
For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
with respect to the current state of the instance.
format: int64
minimum: 0
type: integer
reason:
description: |-
reason contains a programmatic identifier indicating the reason for the condition's last transition.
Producers of specific condition types may define expected values and meanings for this field,
and whether the values are considered a guaranteed API.
The value should be a CamelCase string.
This field may not be empty.
maxLength: 1024
minLength: 1
pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
type: string
status:
description: status of the condition, one of True, False, Unknown.
enum:
- "True"
- "False"
- Unknown
type: string
type:
description: type of condition in CamelCase or in foo.example.com/CamelCase.
maxLength: 316
pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
type: string
required:
- lastTransitionTime
- message
- reason
- status
- type
type: object
type: array
namespace:
description: Namespace indicates a namespace in which the operator
and driver are installed
type: string
state:
description: |-
INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
Important: Run "make" to regenerate code after modifying this file
State indicates status of NVIDIADriver instance
enum:
- ignored
- ready
- notReady
type: string
required:
- state
type: object
type: object
served: true
storage: true
subresources:
status: {}

View file

@ -0,0 +1,80 @@
{{/* vim: set filetype=mustache: */}}
{{/*
gpu-operator.name: short chart name.
Uses .Values.nameOverride when it is set, otherwise .Chart.Name. The result is
cut to 63 characters and any trailing "-" left by the cut is removed.
*/}}
{{- define "gpu-operator.name" -}}
{{- .Values.nameOverride | default .Chart.Name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
gpu-operator.fullname: fully qualified application name.
Resolution order:
  1. .Values.fullnameOverride, when set;
  2. the release name alone, when it already contains the chart name;
  3. "<release name>-<chart name>" otherwise.
Every variant is cut to 63 characters because some Kubernetes name fields are
limited to this (by the DNS naming spec), and a trailing "-" is removed.
*/}}
{{- define "gpu-operator.fullname" -}}
{{- $chartName := .Values.nameOverride | default .Chart.Name -}}
{{- if .Values.fullnameOverride -}}
{{- trunc 63 .Values.fullnameOverride | trimSuffix "-" -}}
{{- else if contains $chartName .Release.Name -}}
{{- trunc 63 .Release.Name | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $chartName | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{/*
gpu-operator.chart: "<chart name>-<chart version>" as used by the helm.sh/chart label.
"+" is replaced with "_" before the value is cut to 63 characters and a
trailing "-" is removed.
*/}}
{{- define "gpu-operator.chart" -}}
{{- $nameAndVersion := printf "%s-%s" .Chart.Name .Chart.Version -}}
{{- $nameAndVersion | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
gpu-operator.labels: common labels.
Emits the name, chart, instance and managed-by labels, the version label when
.Chart.AppVersion is set, and finally any extra labels from
.Values.operator.labels. The output starts at column 0; callers indent it.
*/}}
{{- define "gpu-operator.labels" -}}
app.kubernetes.io/name: {{ include "gpu-operator.name" . }}
helm.sh/chart: {{ include "gpu-operator.chart" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- with .Chart.AppVersion }}
app.kubernetes.io/version: {{ quote . }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- with .Values.operator.labels }}
{{ toYaml . }}
{{- end }}
{{- end -}}
{{/*
gpu-operator.operand-labels: labels rendered under spec.daemonsets.labels of the
ClusterPolicy.
Emits the chart label, a managed-by label set to the chart name, and any extra
labels from .Values.daemonsets.labels. The output starts at column 0; callers
indent it.
*/}}
{{- define "gpu-operator.operand-labels" -}}
helm.sh/chart: {{ include "gpu-operator.chart" . }}
app.kubernetes.io/managed-by: {{ include "gpu-operator.name" . }}
{{- with .Values.daemonsets.labels }}
{{ toYaml . }}
{{- end }}
{{- end -}}
{{/*
gpu-operator.matchLabels: name, instance, version (when .Chart.AppVersion is
set) and managed-by labels, without the chart label or user-supplied labels.
NOTE(review): the version label changes with every chart appVersion; if this
helper is ever used for an immutable selector, confirm that is intended.
*/}}
{{- define "gpu-operator.matchLabels" -}}
app.kubernetes.io/name: {{ include "gpu-operator.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- with .Chart.AppVersion }}
app.kubernetes.io/version: {{ quote . }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end -}}
{{/*
gpu-operator.fullimage: operator image reference "<repository>/<image>:<tag>".
The tag is .Values.operator.version, falling back to .Chart.AppVersion when no
version is set.
*/}}
{{- define "gpu-operator.fullimage" -}}
{{- $tag := .Values.operator.version | default .Chart.AppVersion -}}
{{- .Values.operator.repository }}/{{ .Values.operator.image }}:{{ $tag -}}
{{- end }}
{{/*
driver-manager.fullimage: driver manager image reference
"<repository>/<image>:<version>", built from .Values.driver.manager.
Unlike the operator image there is no fallback for the version.
*/}}
{{- define "driver-manager.fullimage" -}}
{{- $mgr := .Values.driver.manager -}}
{{- $mgr.repository }}/{{ $mgr.image }}:{{ $mgr.version -}}
{{- end }}

View file

@ -0,0 +1,45 @@
{{- /*
Pre-delete hook Job, rendered only when .Values.operator.cleanupCRD is set.
It runs kubectl from the operator image to delete the cluster-policy
ClusterPolicy instance and then the clusterpolicies.nvidia.com CRD.
*/ -}}
{{- if .Values.operator.cleanupCRD }}
apiVersion: batch/v1
kind: Job
metadata:
  name: gpu-operator-cleanup-crd
  namespace: {{ .Release.Namespace }}
  annotations:
    "helm.sh/hook": pre-delete
    "helm.sh/hook-weight": "1"
    "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
spec:
  template:
    metadata:
      name: gpu-operator-cleanup-crd
      labels:
        {{- include "gpu-operator.labels" . | nindent 8 }}
        app.kubernetes.io/component: "gpu-operator"
    spec:
      serviceAccountName: gpu-operator
      {{- if .Values.operator.imagePullSecrets }}
      imagePullSecrets:
      {{- range .Values.operator.imagePullSecrets }}
        - name: {{ . }}
      {{- end }}
      {{- end }}
      {{- with .Values.operator.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: cleanup-crd
          image: {{ include "gpu-operator.fullimage" . }}
          imagePullPolicy: {{ .Values.operator.imagePullPolicy }}
          # The commands are joined with ";", so the container's exit status is
          # that of the last one. --ignore-not-found keeps the hook idempotent:
          # an object that is already gone no longer makes the Job fail and be
          # retried under restartPolicy OnFailure.
          command:
            - /bin/sh
            - -c
            - >
              kubectl delete clusterpolicy cluster-policy --ignore-not-found=true;
              kubectl delete crd clusterpolicies.nvidia.com --ignore-not-found=true;
      restartPolicy: OnFailure
{{- end }}

View file

@ -0,0 +1,683 @@
{{- /*
ClusterPolicy custom resource "cluster-policy".
Each spec section below mirrors the values key of the same name. Optional
fields are rendered only when the corresponding value is set; structured values
(lists and maps) are rendered with toYaml and nindent so they nest under their
key.
*/ -}}
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: cluster-policy
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
  {{- if .Values.operator.cleanupCRD }}
  # CR cleanup is handled during pre-delete hook
  # Add below annotation so that helm doesn't attempt to cleanup CR twice
  annotations:
    "helm.sh/resource-policy": keep
  {{- end }}
spec:
  hostPaths:
    rootFS: {{ .Values.hostPaths.rootFS }}
    driverInstallDir: {{ .Values.hostPaths.driverInstallDir }}
  operator:
    {{- if .Values.operator.defaultRuntime }}
    defaultRuntime: {{ .Values.operator.defaultRuntime }}
    {{- end }}
    {{- if .Values.operator.runtimeClass }}
    runtimeClass: {{ .Values.operator.runtimeClass }}
    {{- end }}
    {{- if .Values.operator.defaultGPUMode }}
    defaultGPUMode: {{ .Values.operator.defaultGPUMode }}
    {{- end }}
    {{- if .Values.operator.initContainer }}
    initContainer:
      {{- if .Values.operator.initContainer.repository }}
      repository: {{ .Values.operator.initContainer.repository }}
      {{- end }}
      {{- if .Values.operator.initContainer.image }}
      image: {{ .Values.operator.initContainer.image }}
      {{- end }}
      {{- if .Values.operator.initContainer.version }}
      version: {{ .Values.operator.initContainer.version | quote }}
      {{- end }}
      {{- if .Values.operator.initContainer.imagePullPolicy }}
      imagePullPolicy: {{ .Values.operator.initContainer.imagePullPolicy }}
      {{- end }}
      {{- if .Values.operator.initContainer.imagePullSecrets }}
      imagePullSecrets: {{ toYaml .Values.operator.initContainer.imagePullSecrets | nindent 8 }}
      {{- end }}
    {{- end }}
    {{- if .Values.operator.use_ocp_driver_toolkit }}
    use_ocp_driver_toolkit: {{ .Values.operator.use_ocp_driver_toolkit }}
    {{- end }}
  daemonsets:
    labels:
      {{- include "gpu-operator.operand-labels" . | nindent 6 }}
    {{- if .Values.daemonsets.annotations }}
    annotations: {{ toYaml .Values.daemonsets.annotations | nindent 6 }}
    {{- end }}
    {{- if .Values.daemonsets.tolerations }}
    tolerations: {{ toYaml .Values.daemonsets.tolerations | nindent 6 }}
    {{- end }}
    {{- if .Values.daemonsets.priorityClassName }}
    priorityClassName: {{ .Values.daemonsets.priorityClassName }}
    {{- end }}
    {{- if .Values.daemonsets.updateStrategy }}
    updateStrategy: {{ .Values.daemonsets.updateStrategy }}
    {{- end }}
    {{- if .Values.daemonsets.rollingUpdate }}
    rollingUpdate:
      maxUnavailable: {{ .Values.daemonsets.rollingUpdate.maxUnavailable | quote }}
    {{- end }}
  validator:
    {{- if .Values.validator.repository }}
    repository: {{ .Values.validator.repository }}
    {{- end }}
    {{- if .Values.validator.image }}
    image: {{ .Values.validator.image }}
    {{- end }}
    # Falls back to the chart appVersion when no validator version is set.
    version: {{ .Values.validator.version | default .Chart.AppVersion | quote }}
    {{- if .Values.validator.imagePullPolicy }}
    imagePullPolicy: {{ .Values.validator.imagePullPolicy }}
    {{- end }}
    {{- if .Values.validator.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.validator.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.validator.resources }}
    resources: {{ toYaml .Values.validator.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.validator.env }}
    env: {{ toYaml .Values.validator.env | nindent 6 }}
    {{- end }}
    {{- if .Values.validator.args }}
    args: {{ toYaml .Values.validator.args | nindent 6 }}
    {{- end }}
    {{- if .Values.validator.plugin }}
    plugin:
      {{- if .Values.validator.plugin.env }}
      env: {{ toYaml .Values.validator.plugin.env | nindent 8 }}
      {{- end }}
    {{- end }}
    {{- if .Values.validator.cuda }}
    cuda:
      {{- if .Values.validator.cuda.env }}
      env: {{ toYaml .Values.validator.cuda.env | nindent 8 }}
      {{- end }}
    {{- end }}
    {{- if .Values.validator.driver }}
    driver:
      {{- if .Values.validator.driver.env }}
      env: {{ toYaml .Values.validator.driver.env | nindent 8 }}
      {{- end }}
    {{- end }}
    {{- if .Values.validator.toolkit }}
    toolkit:
      {{- if .Values.validator.toolkit.env }}
      env: {{ toYaml .Values.validator.toolkit.env | nindent 8 }}
      {{- end }}
    {{- end }}
    {{- if .Values.validator.vfioPCI }}
    vfioPCI:
      {{- if .Values.validator.vfioPCI.env }}
      env: {{ toYaml .Values.validator.vfioPCI.env | nindent 8 }}
      {{- end }}
    {{- end }}
    {{- if .Values.validator.vgpuManager }}
    vgpuManager:
      {{- if .Values.validator.vgpuManager.env }}
      env: {{ toYaml .Values.validator.vgpuManager.env | nindent 8 }}
      {{- end }}
    {{- end }}
    {{- if .Values.validator.vgpuDevices }}
    vgpuDevices:
      {{- if .Values.validator.vgpuDevices.env }}
      env: {{ toYaml .Values.validator.vgpuDevices.env | nindent 8 }}
      {{- end }}
    {{- end }}
  mig:
    {{- if .Values.mig.strategy }}
    strategy: {{ .Values.mig.strategy }}
    {{- end }}
  psa:
    enabled: {{ .Values.psa.enabled }}
  cdi:
    enabled: {{ .Values.cdi.enabled }}
    default: {{ .Values.cdi.default }}
  driver:
    enabled: {{ .Values.driver.enabled }}
    useNvidiaDriverCRD: {{ .Values.driver.nvidiaDriverCRD.enabled }}
    useOpenKernelModules: {{ .Values.driver.useOpenKernelModules }}
    usePrecompiled: {{ .Values.driver.usePrecompiled }}
    {{- if .Values.driver.repository }}
    repository: {{ .Values.driver.repository }}
    {{- end }}
    {{- if .Values.driver.image }}
    image: {{ .Values.driver.image }}
    {{- end }}
    {{- if .Values.driver.version }}
    version: {{ .Values.driver.version | quote }}
    {{- end }}
    {{- if .Values.driver.imagePullPolicy }}
    imagePullPolicy: {{ .Values.driver.imagePullPolicy }}
    {{- end }}
    {{- if .Values.driver.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.driver.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.driver.startupProbe }}
    startupProbe: {{ toYaml .Values.driver.startupProbe | nindent 6 }}
    {{- end }}
    {{- if .Values.driver.livenessProbe }}
    livenessProbe: {{ toYaml .Values.driver.livenessProbe | nindent 6 }}
    {{- end }}
    {{- if .Values.driver.readinessProbe }}
    readinessProbe: {{ toYaml .Values.driver.readinessProbe | nindent 6 }}
    {{- end }}
    rdma:
      enabled: {{ .Values.driver.rdma.enabled }}
      useHostMofed: {{ .Values.driver.rdma.useHostMofed }}
    manager:
      {{- if .Values.driver.manager.repository }}
      repository: {{ .Values.driver.manager.repository }}
      {{- end }}
      {{- if .Values.driver.manager.image }}
      image: {{ .Values.driver.manager.image }}
      {{- end }}
      {{- if .Values.driver.manager.version }}
      version: {{ .Values.driver.manager.version | quote }}
      {{- end }}
      {{- if .Values.driver.manager.imagePullPolicy }}
      imagePullPolicy: {{ .Values.driver.manager.imagePullPolicy }}
      {{- end }}
      {{- if .Values.driver.manager.env }}
      env: {{ toYaml .Values.driver.manager.env | nindent 8 }}
      {{- end }}
    {{- if .Values.driver.repoConfig }}
    repoConfig: {{ toYaml .Values.driver.repoConfig | nindent 6 }}
    {{- end }}
    {{- if .Values.driver.certConfig }}
    certConfig: {{ toYaml .Values.driver.certConfig | nindent 6 }}
    {{- end }}
    {{- if .Values.driver.licensingConfig }}
    licensingConfig: {{ toYaml .Values.driver.licensingConfig | nindent 6 }}
    {{- end }}
    {{- if .Values.driver.virtualTopology }}
    virtualTopology: {{ toYaml .Values.driver.virtualTopology | nindent 6 }}
    {{- end }}
    {{- if .Values.driver.kernelModuleConfig }}
    kernelModuleConfig: {{ toYaml .Values.driver.kernelModuleConfig | nindent 6 }}
    {{- end }}
    {{- if .Values.driver.resources }}
    resources: {{ toYaml .Values.driver.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.driver.env }}
    env: {{ toYaml .Values.driver.env | nindent 6 }}
    {{- end }}
    {{- if .Values.driver.args }}
    args: {{ toYaml .Values.driver.args | nindent 6 }}
    {{- end }}
    {{- if .Values.driver.upgradePolicy }}
    upgradePolicy:
      autoUpgrade: {{ .Values.driver.upgradePolicy.autoUpgrade | default false }}
      maxParallelUpgrades: {{ .Values.driver.upgradePolicy.maxParallelUpgrades | default 0 }}
      maxUnavailable: {{ .Values.driver.upgradePolicy.maxUnavailable | default "25%" }}
      waitForCompletion:
        timeoutSeconds: {{ .Values.driver.upgradePolicy.waitForCompletion.timeoutSeconds }}
        {{- if .Values.driver.upgradePolicy.waitForCompletion.podSelector }}
        # Quoted so that selectors starting with "!" or containing ": " stay strings.
        podSelector: {{ .Values.driver.upgradePolicy.waitForCompletion.podSelector | quote }}
        {{- end }}
      # Rendered as podDeletion, but read from the gpuPodDeletion values key.
      podDeletion:
        force: {{ .Values.driver.upgradePolicy.gpuPodDeletion.force | default false }}
        timeoutSeconds: {{ .Values.driver.upgradePolicy.gpuPodDeletion.timeoutSeconds }}
        deleteEmptyDir: {{ .Values.driver.upgradePolicy.gpuPodDeletion.deleteEmptyDir | default false }}
      drain:
        enable: {{ .Values.driver.upgradePolicy.drain.enable | default false }}
        force: {{ .Values.driver.upgradePolicy.drain.force | default false }}
        {{- if .Values.driver.upgradePolicy.drain.podSelector }}
        podSelector: {{ .Values.driver.upgradePolicy.drain.podSelector | quote }}
        {{- end }}
        timeoutSeconds: {{ .Values.driver.upgradePolicy.drain.timeoutSeconds }}
        deleteEmptyDir: {{ .Values.driver.upgradePolicy.drain.deleteEmptyDir | default false }}
    {{- end }}
  vgpuManager:
    enabled: {{ .Values.vgpuManager.enabled }}
    {{- if .Values.vgpuManager.repository }}
    repository: {{ .Values.vgpuManager.repository }}
    {{- end }}
    {{- if .Values.vgpuManager.image }}
    image: {{ .Values.vgpuManager.image }}
    {{- end }}
    {{- if .Values.vgpuManager.version }}
    version: {{ .Values.vgpuManager.version | quote }}
    {{- end }}
    {{- if .Values.vgpuManager.imagePullPolicy }}
    imagePullPolicy: {{ .Values.vgpuManager.imagePullPolicy }}
    {{- end }}
    {{- if .Values.vgpuManager.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.vgpuManager.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.vgpuManager.resources }}
    resources: {{ toYaml .Values.vgpuManager.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.vgpuManager.env }}
    env: {{ toYaml .Values.vgpuManager.env | nindent 6 }}
    {{- end }}
    {{- if .Values.vgpuManager.args }}
    args: {{ toYaml .Values.vgpuManager.args | nindent 6 }}
    {{- end }}
    driverManager:
      {{- if .Values.vgpuManager.driverManager.repository }}
      repository: {{ .Values.vgpuManager.driverManager.repository }}
      {{- end }}
      {{- if .Values.vgpuManager.driverManager.image }}
      image: {{ .Values.vgpuManager.driverManager.image }}
      {{- end }}
      {{- if .Values.vgpuManager.driverManager.version }}
      version: {{ .Values.vgpuManager.driverManager.version | quote }}
      {{- end }}
      {{- if .Values.vgpuManager.driverManager.imagePullPolicy }}
      imagePullPolicy: {{ .Values.vgpuManager.driverManager.imagePullPolicy }}
      {{- end }}
      {{- if .Values.vgpuManager.driverManager.env }}
      env: {{ toYaml .Values.vgpuManager.driverManager.env | nindent 8 }}
      {{- end }}
  kataManager:
    enabled: {{ .Values.kataManager.enabled }}
    config: {{ toYaml .Values.kataManager.config | nindent 6 }}
    {{- if .Values.kataManager.repository }}
    repository: {{ .Values.kataManager.repository }}
    {{- end }}
    {{- if .Values.kataManager.image }}
    image: {{ .Values.kataManager.image }}
    {{- end }}
    {{- if .Values.kataManager.version }}
    version: {{ .Values.kataManager.version | quote }}
    {{- end }}
    {{- if .Values.kataManager.imagePullPolicy }}
    imagePullPolicy: {{ .Values.kataManager.imagePullPolicy }}
    {{- end }}
    {{- if .Values.kataManager.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.kataManager.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.kataManager.resources }}
    resources: {{ toYaml .Values.kataManager.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.kataManager.env }}
    env: {{ toYaml .Values.kataManager.env | nindent 6 }}
    {{- end }}
    {{- if .Values.kataManager.args }}
    args: {{ toYaml .Values.kataManager.args | nindent 6 }}
    {{- end }}
  vfioManager:
    enabled: {{ .Values.vfioManager.enabled }}
    {{- if .Values.vfioManager.repository }}
    repository: {{ .Values.vfioManager.repository }}
    {{- end }}
    {{- if .Values.vfioManager.image }}
    image: {{ .Values.vfioManager.image }}
    {{- end }}
    {{- if .Values.vfioManager.version }}
    version: {{ .Values.vfioManager.version | quote }}
    {{- end }}
    {{- if .Values.vfioManager.imagePullPolicy }}
    imagePullPolicy: {{ .Values.vfioManager.imagePullPolicy }}
    {{- end }}
    {{- if .Values.vfioManager.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.vfioManager.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.vfioManager.resources }}
    resources: {{ toYaml .Values.vfioManager.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.vfioManager.env }}
    env: {{ toYaml .Values.vfioManager.env | nindent 6 }}
    {{- end }}
    {{- if .Values.vfioManager.args }}
    args: {{ toYaml .Values.vfioManager.args | nindent 6 }}
    {{- end }}
    driverManager:
      {{- if .Values.vfioManager.driverManager.repository }}
      repository: {{ .Values.vfioManager.driverManager.repository }}
      {{- end }}
      {{- if .Values.vfioManager.driverManager.image }}
      image: {{ .Values.vfioManager.driverManager.image }}
      {{- end }}
      {{- if .Values.vfioManager.driverManager.version }}
      version: {{ .Values.vfioManager.driverManager.version | quote }}
      {{- end }}
      {{- if .Values.vfioManager.driverManager.imagePullPolicy }}
      imagePullPolicy: {{ .Values.vfioManager.driverManager.imagePullPolicy }}
      {{- end }}
      {{- if .Values.vfioManager.driverManager.env }}
      env: {{ toYaml .Values.vfioManager.driverManager.env | nindent 8 }}
      {{- end }}
  vgpuDeviceManager:
    enabled: {{ .Values.vgpuDeviceManager.enabled }}
    {{- if .Values.vgpuDeviceManager.repository }}
    repository: {{ .Values.vgpuDeviceManager.repository }}
    {{- end }}
    {{- if .Values.vgpuDeviceManager.image }}
    image: {{ .Values.vgpuDeviceManager.image }}
    {{- end }}
    {{- if .Values.vgpuDeviceManager.version }}
    version: {{ .Values.vgpuDeviceManager.version | quote }}
    {{- end }}
    {{- if .Values.vgpuDeviceManager.imagePullPolicy }}
    imagePullPolicy: {{ .Values.vgpuDeviceManager.imagePullPolicy }}
    {{- end }}
    {{- if .Values.vgpuDeviceManager.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.vgpuDeviceManager.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.vgpuDeviceManager.resources }}
    resources: {{ toYaml .Values.vgpuDeviceManager.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.vgpuDeviceManager.env }}
    env: {{ toYaml .Values.vgpuDeviceManager.env | nindent 6 }}
    {{- end }}
    {{- if .Values.vgpuDeviceManager.args }}
    args: {{ toYaml .Values.vgpuDeviceManager.args | nindent 6 }}
    {{- end }}
    {{- if .Values.vgpuDeviceManager.config }}
    config: {{ toYaml .Values.vgpuDeviceManager.config | nindent 6 }}
    {{- end }}
  ccManager:
    enabled: {{ .Values.ccManager.enabled }}
    defaultMode: {{ .Values.ccManager.defaultMode | quote }}
    {{- if .Values.ccManager.repository }}
    repository: {{ .Values.ccManager.repository }}
    {{- end }}
    {{- if .Values.ccManager.image }}
    image: {{ .Values.ccManager.image }}
    {{- end }}
    {{- if .Values.ccManager.version }}
    version: {{ .Values.ccManager.version | quote }}
    {{- end }}
    {{- if .Values.ccManager.imagePullPolicy }}
    imagePullPolicy: {{ .Values.ccManager.imagePullPolicy }}
    {{- end }}
    {{- if .Values.ccManager.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.ccManager.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.ccManager.resources }}
    resources: {{ toYaml .Values.ccManager.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.ccManager.env }}
    # Must render the same ccManager value that the guard above tests.
    env: {{ toYaml .Values.ccManager.env | nindent 6 }}
    {{- end }}
    {{- if .Values.ccManager.args }}
    args: {{ toYaml .Values.ccManager.args | nindent 6 }}
    {{- end }}
  toolkit:
    enabled: {{ .Values.toolkit.enabled }}
    {{- if .Values.toolkit.repository }}
    repository: {{ .Values.toolkit.repository }}
    {{- end }}
    {{- if .Values.toolkit.image }}
    image: {{ .Values.toolkit.image }}
    {{- end }}
    {{- if .Values.toolkit.version }}
    version: {{ .Values.toolkit.version | quote }}
    {{- end }}
    {{- if .Values.toolkit.imagePullPolicy }}
    imagePullPolicy: {{ .Values.toolkit.imagePullPolicy }}
    {{- end }}
    {{- if .Values.toolkit.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.toolkit.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.toolkit.resources }}
    resources: {{ toYaml .Values.toolkit.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.toolkit.env }}
    env: {{ toYaml .Values.toolkit.env | nindent 6 }}
    {{- end }}
    {{- if .Values.toolkit.installDir }}
    installDir: {{ .Values.toolkit.installDir }}
    {{- end }}
  devicePlugin:
    enabled: {{ .Values.devicePlugin.enabled }}
    {{- if .Values.devicePlugin.repository }}
    repository: {{ .Values.devicePlugin.repository }}
    {{- end }}
    {{- if .Values.devicePlugin.image }}
    image: {{ .Values.devicePlugin.image }}
    {{- end }}
    {{- if .Values.devicePlugin.version }}
    version: {{ .Values.devicePlugin.version | quote }}
    {{- end }}
    {{- if .Values.devicePlugin.imagePullPolicy }}
    imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy }}
    {{- end }}
    {{- if .Values.devicePlugin.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.devicePlugin.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.devicePlugin.resources }}
    resources: {{ toYaml .Values.devicePlugin.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.devicePlugin.env }}
    env: {{ toYaml .Values.devicePlugin.env | nindent 6 }}
    {{- end }}
    {{- if .Values.devicePlugin.args }}
    args: {{ toYaml .Values.devicePlugin.args | nindent 6 }}
    {{- end }}
    {{- if .Values.devicePlugin.config.name }}
    config:
      name: {{ .Values.devicePlugin.config.name }}
      default: {{ .Values.devicePlugin.config.default }}
    {{- end }}
  dcgm:
    enabled: {{ .Values.dcgm.enabled }}
    {{- if .Values.dcgm.repository }}
    repository: {{ .Values.dcgm.repository }}
    {{- end }}
    {{- if .Values.dcgm.image }}
    image: {{ .Values.dcgm.image }}
    {{- end }}
    {{- if .Values.dcgm.version }}
    version: {{ .Values.dcgm.version | quote }}
    {{- end }}
    {{- if .Values.dcgm.imagePullPolicy }}
    imagePullPolicy: {{ .Values.dcgm.imagePullPolicy }}
    {{- end }}
    {{- if .Values.dcgm.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.dcgm.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.dcgm.resources }}
    resources: {{ toYaml .Values.dcgm.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.dcgm.env }}
    env: {{ toYaml .Values.dcgm.env | nindent 6 }}
    {{- end }}
    {{- if .Values.dcgm.args }}
    args: {{ toYaml .Values.dcgm.args | nindent 6 }}
    {{- end }}
  dcgmExporter:
    enabled: {{ .Values.dcgmExporter.enabled }}
    {{- if .Values.dcgmExporter.repository }}
    repository: {{ .Values.dcgmExporter.repository }}
    {{- end }}
    {{- if .Values.dcgmExporter.image }}
    image: {{ .Values.dcgmExporter.image }}
    {{- end }}
    {{- if .Values.dcgmExporter.version }}
    version: {{ .Values.dcgmExporter.version | quote }}
    {{- end }}
    {{- if .Values.dcgmExporter.imagePullPolicy }}
    imagePullPolicy: {{ .Values.dcgmExporter.imagePullPolicy }}
    {{- end }}
    {{- if .Values.dcgmExporter.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.dcgmExporter.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.dcgmExporter.resources }}
    resources: {{ toYaml .Values.dcgmExporter.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.dcgmExporter.env }}
    env: {{ toYaml .Values.dcgmExporter.env | nindent 6 }}
    {{- end }}
    {{- if .Values.dcgmExporter.args }}
    args: {{ toYaml .Values.dcgmExporter.args | nindent 6 }}
    {{- end }}
    {{- if and (.Values.dcgmExporter.config) (.Values.dcgmExporter.config.name) }}
    config:
      name: {{ .Values.dcgmExporter.config.name }}
    {{- end }}
    {{- if .Values.dcgmExporter.serviceMonitor }}
    serviceMonitor: {{ toYaml .Values.dcgmExporter.serviceMonitor | nindent 6 }}
    {{- end }}
  gfd:
    enabled: {{ .Values.gfd.enabled }}
    {{- if .Values.gfd.repository }}
    repository: {{ .Values.gfd.repository }}
    {{- end }}
    {{- if .Values.gfd.image }}
    image: {{ .Values.gfd.image }}
    {{- end }}
    {{- if .Values.gfd.version }}
    version: {{ .Values.gfd.version | quote }}
    {{- end }}
    {{- if .Values.gfd.imagePullPolicy }}
    imagePullPolicy: {{ .Values.gfd.imagePullPolicy }}
    {{- end }}
    {{- if .Values.gfd.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.gfd.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.gfd.resources }}
    resources: {{ toYaml .Values.gfd.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.gfd.env }}
    env: {{ toYaml .Values.gfd.env | nindent 6 }}
    {{- end }}
    {{- if .Values.gfd.args }}
    args: {{ toYaml .Values.gfd.args | nindent 6 }}
    {{- end }}
  migManager:
    enabled: {{ .Values.migManager.enabled }}
    {{- if .Values.migManager.repository }}
    repository: {{ .Values.migManager.repository }}
    {{- end }}
    {{- if .Values.migManager.image }}
    image: {{ .Values.migManager.image }}
    {{- end }}
    {{- if .Values.migManager.version }}
    version: {{ .Values.migManager.version | quote }}
    {{- end }}
    {{- if .Values.migManager.imagePullPolicy }}
    imagePullPolicy: {{ .Values.migManager.imagePullPolicy }}
    {{- end }}
    {{- if .Values.migManager.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.migManager.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.migManager.resources }}
    resources: {{ toYaml .Values.migManager.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.migManager.env }}
    env: {{ toYaml .Values.migManager.env | nindent 6 }}
    {{- end }}
    {{- if .Values.migManager.args }}
    args: {{ toYaml .Values.migManager.args | nindent 6 }}
    {{- end }}
    {{- if .Values.migManager.config }}
    config:
      name: {{ .Values.migManager.config.name }}
      default: {{ .Values.migManager.config.default }}
    {{- end }}
    {{- if .Values.migManager.gpuClientsConfig }}
    gpuClientsConfig: {{ toYaml .Values.migManager.gpuClientsConfig | nindent 6 }}
    {{- end }}
  nodeStatusExporter:
    enabled: {{ .Values.nodeStatusExporter.enabled }}
    {{- if .Values.nodeStatusExporter.repository }}
    repository: {{ .Values.nodeStatusExporter.repository }}
    {{- end }}
    {{- if .Values.nodeStatusExporter.image }}
    image: {{ .Values.nodeStatusExporter.image }}
    {{- end }}
    # Falls back to the chart appVersion when no exporter version is set.
    version: {{ .Values.nodeStatusExporter.version | default .Chart.AppVersion | quote }}
    {{- if .Values.nodeStatusExporter.imagePullPolicy }}
    imagePullPolicy: {{ .Values.nodeStatusExporter.imagePullPolicy }}
    {{- end }}
    {{- if .Values.nodeStatusExporter.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.nodeStatusExporter.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.nodeStatusExporter.resources }}
    resources: {{ toYaml .Values.nodeStatusExporter.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.nodeStatusExporter.env }}
    env: {{ toYaml .Values.nodeStatusExporter.env | nindent 6 }}
    {{- end }}
    {{- if .Values.nodeStatusExporter.args }}
    args: {{ toYaml .Values.nodeStatusExporter.args | nindent 6 }}
    {{- end }}
  {{- if .Values.gds.enabled }}
  # The whole gds section is rendered only when gds is enabled.
  gds:
    enabled: {{ .Values.gds.enabled }}
    {{- if .Values.gds.repository }}
    repository: {{ .Values.gds.repository }}
    {{- end }}
    {{- if .Values.gds.image }}
    image: {{ .Values.gds.image }}
    {{- end }}
    version: {{ .Values.gds.version | quote }}
    {{- if .Values.gds.imagePullPolicy }}
    imagePullPolicy: {{ .Values.gds.imagePullPolicy }}
    {{- end }}
    {{- if .Values.gds.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.gds.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.gds.env }}
    env: {{ toYaml .Values.gds.env | nindent 6 }}
    {{- end }}
    {{- if .Values.gds.args }}
    args: {{ toYaml .Values.gds.args | nindent 6 }}
    {{- end }}
  {{- end }}
  {{- if .Values.gdrcopy }}
  # The gdrcopy section is rendered whenever the gdrcopy values key is non-empty.
  gdrcopy:
    enabled: {{ .Values.gdrcopy.enabled | default false }}
    {{- if .Values.gdrcopy.repository }}
    repository: {{ .Values.gdrcopy.repository }}
    {{- end }}
    {{- if .Values.gdrcopy.image }}
    image: {{ .Values.gdrcopy.image }}
    {{- end }}
    version: {{ .Values.gdrcopy.version | quote }}
    {{- if .Values.gdrcopy.imagePullPolicy }}
    imagePullPolicy: {{ .Values.gdrcopy.imagePullPolicy }}
    {{- end }}
    {{- if .Values.gdrcopy.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.gdrcopy.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.gdrcopy.env }}
    env: {{ toYaml .Values.gdrcopy.env | nindent 6 }}
    {{- end }}
    {{- if .Values.gdrcopy.args }}
    args: {{ toYaml .Values.gdrcopy.args | nindent 6 }}
    {{- end }}
  {{- end }}
  sandboxWorkloads:
    enabled: {{ .Values.sandboxWorkloads.enabled }}
    {{- if .Values.sandboxWorkloads.defaultWorkload }}
    defaultWorkload: {{ .Values.sandboxWorkloads.defaultWorkload }}
    {{- end }}
  sandboxDevicePlugin:
    {{- if .Values.sandboxDevicePlugin.enabled }}
    enabled: {{ .Values.sandboxDevicePlugin.enabled }}
    {{- end }}
    {{- if .Values.sandboxDevicePlugin.repository }}
    repository: {{ .Values.sandboxDevicePlugin.repository }}
    {{- end }}
    {{- if .Values.sandboxDevicePlugin.image }}
    image: {{ .Values.sandboxDevicePlugin.image }}
    {{- end }}
    {{- if .Values.sandboxDevicePlugin.version }}
    version: {{ .Values.sandboxDevicePlugin.version | quote }}
    {{- end }}
    {{- if .Values.sandboxDevicePlugin.imagePullPolicy }}
    imagePullPolicy: {{ .Values.sandboxDevicePlugin.imagePullPolicy }}
    {{- end }}
    {{- if .Values.sandboxDevicePlugin.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.sandboxDevicePlugin.imagePullSecrets | nindent 6 }}
    {{- end }}
    {{- if .Values.sandboxDevicePlugin.resources }}
    resources: {{ toYaml .Values.sandboxDevicePlugin.resources | nindent 6 }}
    {{- end }}
    {{- if .Values.sandboxDevicePlugin.env }}
    env: {{ toYaml .Values.sandboxDevicePlugin.env | nindent 6 }}
    {{- end }}
    {{- if .Values.sandboxDevicePlugin.args }}
    args: {{ toYaml .Values.sandboxDevicePlugin.args | nindent 6 }}
    {{- end }}

View file

@ -0,0 +1,146 @@
# ClusterRole named gpu-operator: one rule per API group / resource set.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: gpu-operator
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
rules:
  # OpenShift API groups.
  - apiGroups: ["config.openshift.io"]
    resources: ["clusterversions", "proxies"]
    verbs: ["get", "list", "watch"]
  - apiGroups: ["image.openshift.io"]
    resources: ["imagestreams"]
    verbs: ["get", "list", "watch"]
  - apiGroups: ["security.openshift.io"]
    resources: ["securitycontextconstraints"]
    verbs: ["create", "get", "list", "watch", "update", "patch", "delete", "use"]
  # Cluster-scoped RBAC objects.
  - apiGroups: ["rbac.authorization.k8s.io"]
    resources: ["clusterroles", "clusterrolebindings"]
    verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
  # Core API group.
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["get", "list", "watch", "update", "patch"]
  - apiGroups: [""]
    resources: ["namespaces"]
    verbs: ["get", "list", "create", "watch", "update", "patch"]
  - apiGroups: [""]
    resources: ["events", "pods", "pods/eviction"]
    verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
  # DaemonSets are read-only at cluster scope.
  - apiGroups: ["apps"]
    resources: ["daemonsets"]
    verbs: ["get", "list", "watch"]
  # nvidia.com custom resources, including finalizers and status subresources.
  - apiGroups: ["nvidia.com"]
    resources:
      - clusterpolicies
      - clusterpolicies/finalizers
      - clusterpolicies/status
      - nvidiadrivers
      - nvidiadrivers/finalizers
      - nvidiadrivers/status
    verbs: ["create", "get", "list", "watch", "update", "patch", "delete", "deletecollection"]
  - apiGroups: ["scheduling.k8s.io"]
    resources: ["priorityclasses"]
    verbs: ["get", "list", "watch", "create"]
  - apiGroups: ["node.k8s.io"]
    resources: ["runtimeclasses"]
    verbs: ["get", "list", "create", "update", "watch", "delete"]
  # CRD access; `delete` is added only when operator.cleanupCRD is set.
  - apiGroups: ["apiextensions.k8s.io"]
    resources: ["customresourcedefinitions"]
    verbs:
      - get
      - list
      - watch
      - update
      - patch
      - create
      {{- if .Values.operator.cleanupCRD }}
      - delete
      {{- end }}

View file

@ -0,0 +1,18 @@
# Binds the gpu-operator ClusterRole to the gpu-operator and
# node-feature-discovery ServiceAccounts in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: gpu-operator
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpu-operator
subjects:
  {{- range $account := list "gpu-operator" "node-feature-discovery" }}
  - kind: ServiceAccount
    name: {{ $account }}
    namespace: {{ $.Release.Namespace }}
  {{- end }}

View file

@ -0,0 +1,14 @@
{{- /*
ConfigMap with custom DCGM exporter metrics. It is rendered only when
dcgmExporter.config exists, its `create` flag is set and `data` is non-empty.
The outer `with` keeps the inner test from dereferencing a missing config map.
*/}}
{{- with .Values.dcgmExporter.config }}
{{- if and .create .data }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ .name }}
  namespace: {{ $.Release.Namespace }}
  labels:
    {{- include "gpu-operator.labels" $ | nindent 4 }}
data:
  dcgm-metrics.csv: |
    {{- .data | nindent 4 }}
{{- end }}
{{- end }}

View file

@ -0,0 +1,10 @@
{{- /*
ConfigMap for the MIG manager. It is rendered only when
migManager.config.create is set and migManager.config.data is non-empty.
*/}}
{{- $migConfig := .Values.migManager.config }}
{{- if and $migConfig.create $migConfig.data }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ $migConfig.name }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
data:
  {{- toYaml $migConfig.data | nindent 2 }}
{{- end }}

View file

@ -0,0 +1,107 @@
{{- /*
NodeFeatureRule object, rendered only when nfd.nodefeaturerules is set.
The rules label CPU security features (TDX, SEV), label NVIDIA PCI devices
by device id, and combine both into the nvidia.com/cc.capable label.
*/}}
{{- if .Values.nfd.nodefeaturerules }}
apiVersion: nfd.k8s-sigs.io/v1alpha1
kind: NodeFeatureRule
metadata:
  name: nvidia-nfd-nodefeaturerules
spec:
  rules:
    # --- CPU security features ---
    - name: "TDX rule"
      labels:
        "tdx.enabled": "true"
      matchFeatures:
        - feature: "cpu.security"
          matchExpressions:
            "tdx.enabled": {op: "IsTrue"}
    - name: "TDX total keys rule"
      extendedResources:
        "tdx.total_keys": "@cpu.security.tdx.total_keys"
      matchFeatures:
        - feature: "cpu.security"
          matchExpressions:
            "tdx.enabled": {op: "IsTrue"}
    - name: "SEV-SNP rule"
      labels:
        "sev.snp.enabled": "true"
      matchFeatures:
        - feature: "cpu.security"
          matchExpressions:
            "sev.snp.enabled": {op: "IsTrue"}
    - name: "SEV-ES rule"
      labels:
        "sev.es.enabled": "true"
      matchFeatures:
        - feature: "cpu.security"
          matchExpressions:
            "sev.es.enabled": {op: "IsTrue"}
    - name: "SEV system capacities"
      extendedResources:
        "sev_asids": "@cpu.security.sev.asids"
        "sev_es": "@cpu.security.sev.encrypted_state_ids"
      matchFeatures:
        - feature: "cpu.security"
          matchExpressions:
            "sev.enabled": {op: "Exists"}
    # --- NVIDIA PCI devices (vendor 10de), one rule per device id ---
    - name: "NVIDIA H100"
      labels:
        "nvidia.com/gpu.H100": "true"
        "nvidia.com/gpu.family": "hopper"
      matchFeatures:
        - feature: "pci.device"
          matchExpressions:
            "vendor": {op: "In", value: ["10de"]}
            "device": {op: "In", value: ["2339"]}
    - name: "NVIDIA H100 PCIe"
      labels:
        "nvidia.com/gpu.H100.pcie": "true"
        "nvidia.com/gpu.family": "hopper"
      matchFeatures:
        - feature: "pci.device"
          matchExpressions:
            "vendor": {op: "In", value: ["10de"]}
            "device": {op: "In", value: ["2331"]}
    - name: "NVIDIA H100 80GB HBM3"
      labels:
        "nvidia.com/gpu.H100.HBM3": "true"
        "nvidia.com/gpu.family": "hopper"
      matchFeatures:
        - feature: "pci.device"
          matchExpressions:
            "vendor": {op: "In", value: ["10de"]}
            "device": {op: "In", value: ["2330"]}
    - name: "NVIDIA H800"
      labels:
        "nvidia.com/gpu.H800": "true"
        "nvidia.com/gpu.family": "hopper"
      matchFeatures:
        - feature: "pci.device"
          matchExpressions:
            "vendor": {op: "In", value: ["10de"]}
            "device": {op: "In", value: ["2324"]}
    - name: "NVIDIA H800 PCIE"
      labels:
        "nvidia.com/gpu.H800.pcie": "true"
        "nvidia.com/gpu.family": "hopper"
      matchFeatures:
        - feature: "pci.device"
          matchExpressions:
            "vendor": {op: "In", value: ["10de"]}
            "device": {op: "In", value: ["2322"]}
    # --- Combination rule: TDX/SEV + Hopper GPU ---
    - name: "NVIDIA CC Enabled"
      labels:
        "nvidia.com/cc.capable": "true"
      matchAny:
        - matchFeatures:
            - feature: "rule.matched"
              matchExpressions:
                "nvidia.com/gpu.family": {op: "In", value: ["hopper"]}
                "sev.snp.enabled": {op: "IsTrue"}
        - matchFeatures:
            - feature: "rule.matched"
              matchExpressions:
                "nvidia.com/gpu.family": {op: "In", value: ["hopper"]}
                "tdx.enabled": {op: "IsTrue"}
{{- end }}

View file

@ -0,0 +1,119 @@
{{- /*
Default NVIDIADriver custom resource. It is rendered only when both
driver.nvidiaDriverCRD.enabled and driver.nvidiaDriverCRD.deployDefaultCR
are set. Values come from .Values.driver, .Values.daemonsets, .Values.gds
and .Values.gdrcopy; optional fields are emitted only when a value is set.
*/}}
{{- if and .Values.driver.nvidiaDriverCRD.enabled .Values.driver.nvidiaDriverCRD.deployDefaultCR }}
apiVersion: nvidia.com/v1alpha1
kind: NVIDIADriver
metadata:
  name: default
spec:
  repository: {{ .Values.driver.repository }}
  image: {{ .Values.driver.image }}
  # Quoted so a purely numeric version (for example a bare driver branch
  # such as 535) stays a YAML string instead of being parsed as a number.
  # This matches the quoting used for gds.version and gdrcopy.version below.
  version: {{ .Values.driver.version | quote }}
  useOpenKernelModules: {{ .Values.driver.useOpenKernelModules }}
  usePrecompiled: {{ .Values.driver.usePrecompiled }}
  driverType: {{ .Values.driver.nvidiaDriverCRD.driverType | default "gpu" }}
  {{- if .Values.daemonsets.annotations }}
  annotations: {{ toYaml .Values.daemonsets.annotations | nindent 6 }}
  {{- end }}
  {{- if .Values.daemonsets.labels }}
  labels: {{ toYaml .Values.daemonsets.labels | nindent 6 }}
  {{- end }}
  {{- if .Values.driver.nvidiaDriverCRD.nodeSelector }}
  nodeSelector: {{ toYaml .Values.driver.nvidiaDriverCRD.nodeSelector | nindent 6 }}
  {{- end }}
  {{- if .Values.driver.imagePullSecrets }}
  imagePullSecrets: {{ toYaml .Values.driver.imagePullSecrets | nindent 4 }}
  {{- end }}
  {{- if .Values.driver.manager }}
  manager: {{ toYaml .Values.driver.manager | nindent 4 }}
  {{- end }}
  {{- if .Values.driver.startupProbe }}
  startupProbe: {{ toYaml .Values.driver.startupProbe | nindent 4 }}
  {{- end }}
  {{- if .Values.driver.livenessProbe }}
  livenessProbe: {{ toYaml .Values.driver.livenessProbe | nindent 4 }}
  {{- end }}
  {{- if .Values.driver.readinessProbe }}
  readinessProbe: {{ toYaml .Values.driver.readinessProbe | nindent 4 }}
  {{- end }}
  rdma:
    enabled: {{ .Values.driver.rdma.enabled }}
    useHostMofed: {{ .Values.driver.rdma.useHostMofed }}
  {{- if .Values.daemonsets.tolerations }}
  tolerations: {{ toYaml .Values.daemonsets.tolerations | nindent 6 }}
  {{- end }}
  {{- if .Values.driver.repoConfig.configMapName }}
  repoConfig:
    name: {{ .Values.driver.repoConfig.configMapName }}
  {{- end }}
  {{- if .Values.driver.certConfig.name }}
  certConfig:
    name: {{ .Values.driver.certConfig.name }}
  {{- end }}
  {{- if .Values.driver.licensingConfig.configMapName }}
  licensingConfig:
    name: {{ .Values.driver.licensingConfig.configMapName }}
    # `default` treats false as empty, so piping through `default true` could
    # never render false. Fall back to true only when the value is unset.
    {{- $nlsEnabled := .Values.driver.licensingConfig.nlsEnabled }}
    nlsEnabled: {{ if kindIs "invalid" $nlsEnabled }}true{{ else }}{{ $nlsEnabled }}{{ end }}
  {{- end }}
  {{- if .Values.driver.virtualTopology.config }}
  virtualTopologyConfig:
    name: {{ .Values.driver.virtualTopology.config }}
  {{- end }}
  {{- if .Values.driver.kernelModuleConfig.name }}
  kernelModuleConfig:
    name: {{ .Values.driver.kernelModuleConfig.name }}
  {{- end }}
  {{- if .Values.driver.resources }}
  resources: {{ toYaml .Values.driver.resources | nindent 6 }}
  {{- end }}
  {{- if .Values.driver.env }}
  env: {{ toYaml .Values.driver.env | nindent 6 }}
  {{- end }}
  {{- if .Values.driver.args }}
  args: {{ toYaml .Values.driver.args | nindent 6 }}
  {{- end }}
  # The gds section is emitted only when gds.enabled is set.
  {{- if .Values.gds.enabled }}
  gds:
    enabled: {{ .Values.gds.enabled }}
    {{- if .Values.gds.repository }}
    repository: {{ .Values.gds.repository }}
    {{- end }}
    {{- if .Values.gds.image }}
    image: {{ .Values.gds.image }}
    {{- end }}
    version: {{ .Values.gds.version | quote }}
    {{- if .Values.gds.imagePullPolicy }}
    imagePullPolicy: {{ .Values.gds.imagePullPolicy }}
    {{- end }}
    {{- if .Values.gds.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.gds.imagePullSecrets | nindent 8 }}
    {{- end }}
    {{- if .Values.gds.env }}
    env: {{ toYaml .Values.gds.env | nindent 6 }}
    {{- end }}
    {{- if .Values.gds.args }}
    args: {{ toYaml .Values.gds.args | nindent 6 }}
    {{- end }}
  {{- end }}
  # The gdrcopy section is emitted whenever a gdrcopy values block exists;
  # `enabled` falls back to false when it is unset.
  {{- if .Values.gdrcopy }}
  gdrcopy:
    enabled: {{ .Values.gdrcopy.enabled | default false }}
    {{- if .Values.gdrcopy.repository }}
    repository: {{ .Values.gdrcopy.repository }}
    {{- end }}
    {{- if .Values.gdrcopy.image }}
    image: {{ .Values.gdrcopy.image }}
    {{- end }}
    version: {{ .Values.gdrcopy.version | quote }}
    {{- if .Values.gdrcopy.imagePullPolicy }}
    imagePullPolicy: {{ .Values.gdrcopy.imagePullPolicy }}
    {{- end }}
    {{- if .Values.gdrcopy.imagePullSecrets }}
    imagePullSecrets: {{ toYaml .Values.gdrcopy.imagePullSecrets | nindent 8 }}
    {{- end }}
    {{- if .Values.gdrcopy.env }}
    env: {{ toYaml .Values.gdrcopy.env | nindent 6 }}
    {{- end }}
    {{- if .Values.gdrcopy.args }}
    args: {{ toYaml .Values.gdrcopy.args | nindent 6 }}
    {{- end }}
  {{- end }}
{{- end }}

View file

@ -0,0 +1,99 @@
# Deployment of the gpu-operator controller (single replica).
# Image, logging flags, resources and scheduling come from .Values.operator.
{{- $operator := .Values.operator }}
{{- $logging := $operator.logging }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-operator
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
    nvidia.com/gpu-driver-upgrade-drain.skip: "true"
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/component: "gpu-operator"
      app: "gpu-operator"
  template:
    metadata:
      annotations:
        {{- toYaml $operator.annotations | nindent 8 }}
      labels:
        {{- include "gpu-operator.labels" . | nindent 8 }}
        app.kubernetes.io/component: "gpu-operator"
        app: "gpu-operator"
        nvidia.com/gpu-driver-upgrade-drain.skip: "true"
    spec:
      serviceAccountName: gpu-operator
      {{- with $operator.imagePullSecrets }}
      imagePullSecrets:
      {{- range . }}
        - name: {{ . }}
      {{- end }}
      {{- end }}
      {{- with $operator.priorityClassName }}
      priorityClassName: {{ . }}
      {{- end }}
      containers:
      - name: gpu-operator
        image: {{ include "gpu-operator.fullimage" . }}
        imagePullPolicy: {{ $operator.imagePullPolicy }}
        command: ["gpu-operator"]
        # develMode takes precedence: the time-encoding and log-level flags
        # are only passed when develMode is off.
        args:
        - --leader-elect
        {{- if $logging.develMode }}
        - --zap-devel
        {{- else }}
        {{- with $logging.timeEncoding }}
        - --zap-time-encoding={{ . }}
        {{- end }}
        {{- with $logging.level }}
        - --zap-log-level={{ . }}
        {{- end }}
        {{- end }}
        env:
        - name: WATCH_NAMESPACE
          value: ""
        - name: OPERATOR_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: "DRIVER_MANAGER_IMAGE"
          value: "{{ include "driver-manager.fullimage" . }}"
        ports:
        - name: metrics
          containerPort: 8080
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8081
          initialDelaySeconds: 15
          periodSeconds: 20
        readinessProbe:
          httpGet:
            path: /readyz
            port: 8081
          initialDelaySeconds: 5
          periodSeconds: 10
        {{- if $operator.resources }}
        resources:
          {{- toYaml $operator.resources | nindent 10 }}
        {{- end }}
        volumeMounts:
        - name: host-os-release
          mountPath: "/host-etc/os-release"
          readOnly: true
      volumes:
      - name: host-os-release
        hostPath:
          path: "/etc/os-release"
      {{- if $operator.nodeSelector }}
      nodeSelector:
        {{- toYaml $operator.nodeSelector | nindent 8 }}
      {{- end }}
      {{- if $operator.affinity }}
      affinity:
        {{- toYaml $operator.affinity | nindent 8 }}
      {{- end }}
      {{- if $operator.tolerations }}
      tolerations:
        {{- toYaml $operator.tolerations | nindent 8 }}
      {{- end }}

View file

@ -0,0 +1,11 @@
{{- /*
ConfigMap for the device plugin. It is rendered only when
devicePlugin.config.create is set and devicePlugin.config.data is non-empty.
*/}}
{{- $pluginConfig := .Values.devicePlugin.config }}
{{- if and $pluginConfig.create $pluginConfig.data }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ $pluginConfig.name }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
data:
  {{- toYaml $pluginConfig.data | nindent 2 }}
{{- end }}

View file

@ -0,0 +1,49 @@
{{- /*
SecurityContextConstraints object named restricted-readonly, rendered only
when platform.openshift is set. It is assigned to the gpu-operator
ServiceAccount of the release namespace through the `users` list.
*/}}
{{- if .Values.platform.openshift }}
apiVersion: security.openshift.io/v1
kind: SecurityContextConstraints
metadata:
  name: restricted-readonly
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
  annotations:
    kubernetes.io/description: restricted denies access to all host features and requires
      pods to be run with a UID, read-only root filesystem and SELinux context that are
      allocated to the namespace. This SCC is more restrictive than the default
      restrictive SCC and it is used by default for authenticated users and operators and operands.
priority: 0
# Host access: all denied.
allowHostDirVolumePlugin: false
allowHostIPC: false
allowHostNetwork: false
allowHostPID: false
allowHostPorts: false
# Privileges and capabilities.
allowPrivilegeEscalation: true
allowPrivilegedContainer: false
allowedCapabilities: []
defaultAddCapabilities: []
requiredDropCapabilities: ["KILL", "MKNOD", "SETUID", "SETGID"]
readOnlyRootFilesystem: true
# Identity strategies.
runAsUser:
  type: MustRunAsRange
seLinuxContext:
  type: MustRunAs
fsGroup:
  type: MustRunAs
supplementalGroups:
  type: RunAsAny
# Subjects that may use this SCC.
groups:
  - "system:authenticated"
users:
  - system:serviceaccount:{{ .Release.Namespace }}:gpu-operator
# Volume types allowed.
volumes: ["configMap", "downwardAPI", "emptyDir", "persistentVolumeClaim", "projected", "secret"]
{{- end }}

View file

@ -0,0 +1,84 @@
# Namespaced Role named gpu-operator: one rule per API group / resource set.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: gpu-operator
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
rules:
  # Namespaced RBAC objects.
  - apiGroups: ["rbac.authorization.k8s.io"]
    resources: ["roles", "rolebindings"]
    verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
  # Workload objects: controller revisions are read-only, daemonsets are managed.
  - apiGroups: ["apps"]
    resources: ["controllerrevisions"]
    verbs: ["get", "list", "watch"]
  - apiGroups: ["apps"]
    resources: ["daemonsets"]
    verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
  # Core API group.
  - apiGroups: [""]
    resources:
      - configmaps
      - endpoints
      - pods
      - pods/eviction
      - secrets
      - services
      - services/finalizers
      - serviceaccounts
    verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
  # Leases.
  - apiGroups: ["coordination.k8s.io"]
    resources: ["leases"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  # monitoring.coreos.com objects.
  - apiGroups: ["monitoring.coreos.com"]
    resources: ["servicemonitors", "prometheusrules"]
    verbs: ["get", "list", "create", "watch", "update", "delete"]

View file

@ -0,0 +1,15 @@
# Binds the gpu-operator Role to the gpu-operator ServiceAccount
# in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: gpu-operator
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: gpu-operator
subjects:
  - kind: ServiceAccount
    name: gpu-operator
    namespace: {{ .Release.Namespace }}

View file

@ -0,0 +1,7 @@
# ServiceAccount named gpu-operator, labelled with the chart's common labels.
kind: ServiceAccount
apiVersion: v1
metadata:
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
  name: gpu-operator

View file

@ -0,0 +1,95 @@
{{- /*
Pre-upgrade hook that re-applies the CRD manifests shipped in the operator
image. It is rendered only when operator.upgradeCRD is set. The
ServiceAccount, ClusterRole and ClusterRoleBinding use hook weight 0 and
the Job uses weight 1. All four are deleted after success and before the
next hook creation.
*/}}
{{- if .Values.operator.upgradeCRD }}
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpu-operator-upgrade-crd-hook-sa
  annotations:
    helm.sh/hook: pre-upgrade
    helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
    helm.sh/hook-weight: "0"
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: gpu-operator-upgrade-crd-hook-role
  annotations:
    helm.sh/hook: pre-upgrade
    helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
    helm.sh/hook-weight: "0"
rules:
  - apiGroups:
      - apiextensions.k8s.io
    resources:
      - customresourcedefinitions
    verbs:
      - create
      - get
      - list
      - watch
      - patch
      - update
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpu-operator-upgrade-crd-hook-binding
  annotations:
    helm.sh/hook: pre-upgrade
    helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
    helm.sh/hook-weight: "0"
subjects:
  - kind: ServiceAccount
    name: gpu-operator-upgrade-crd-hook-sa
    namespace: {{ .Release.Namespace }}
roleRef:
  kind: ClusterRole
  name: gpu-operator-upgrade-crd-hook-role
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: batch/v1
kind: Job
metadata:
  name: gpu-operator-upgrade-crd
  namespace: {{ .Release.Namespace }}
  annotations:
    "helm.sh/hook": pre-upgrade
    "helm.sh/hook-weight": "1"
    "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
spec:
  template:
    metadata:
      name: gpu-operator-upgrade-crd
      labels:
        {{- include "gpu-operator.labels" . | nindent 8 }}
        app.kubernetes.io/component: "gpu-operator"
    spec:
      serviceAccountName: gpu-operator-upgrade-crd-hook-sa
      {{- if .Values.operator.imagePullSecrets }}
      imagePullSecrets:
      {{- range .Values.operator.imagePullSecrets }}
        - name: {{ . }}
      {{- end }}
      {{- end }}
      {{- with .Values.operator.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: upgrade-crd
          image: {{ include "gpu-operator.fullimage" . }}
          imagePullPolicy: {{ .Values.operator.imagePullPolicy }}
          # `set -e` makes the shell stop at the first failing `kubectl apply`.
          # With plain `;` separators the exit status is that of the last
          # command only, so an earlier CRD failure would be reported as a
          # successful hook. The NFD CRDs are applied only when nfd.enabled is set.
          command:
            - /bin/sh
            - -c
            - >
              set -e;
              kubectl apply -f /opt/gpu-operator/nvidia.com_clusterpolicies.yaml;
              kubectl apply -f /opt/gpu-operator/nvidia.com_nvidiadrivers.yaml;
              {{- if .Values.nfd.enabled }}
              kubectl apply -f /opt/gpu-operator/nfd-api-crds.yaml;
              {{- end }}
      restartPolicy: OnFailure
{{- end }}

View file

@ -0,0 +1,15 @@
# Value overrides for the chart's toolkit and devicePlugin sections.
# toolkit.env: containerd settings passed to the toolkit as environment variables.
# The socket path is under /run/k3s, and the runtime class "nvidia" is set as default.
# NOTE(review): the config path is under /etc/containerd while the socket is the
# k3s one - confirm this path is intentional for the target cluster.
toolkit:
env:
- name: CONTAINERD_CONFIG
value: "/etc/containerd/config.toml.tmpl"
- name: CONTAINERD_SOCKET
value: "/run/k3s/containerd/containerd.sock"
- name: CONTAINERD_RUNTIME_CLASS
value: "nvidia"
- name: CONTAINERD_SET_AS_DEFAULT
value: "true"
# devicePlugin.config: names the ConfigMap time-slicing-config-all and selects
# its "any" entry as the default config within that ConfigMap.
devicePlugin:
config:
name: time-slicing-config-all
default: any

View file

@ -0,0 +1,602 @@
# Default values for gpu-operator.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
platform:
openshift: false
nfd:
enabled: true
nodefeaturerules: false
psa:
enabled: false
cdi:
enabled: false
default: false
sandboxWorkloads:
enabled: false
defaultWorkload: "container"
hostPaths:
# rootFS represents the path to the root filesystem of the host.
# This is used by components that need to interact with the host filesystem
# and as such this must be a chroot-able filesystem.
# Examples include the MIG Manager and Toolkit Container which may need to
# stop, start, or restart systemd services
rootFS: "/"
# driverInstallDir represents the root at which driver files including libraries,
# config files, and executables can be found.
driverInstallDir: "/run/nvidia/driver"
daemonsets:
labels: {}
annotations: {}
priorityClassName: system-node-critical
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
# configuration for controlling update strategy("OnDelete" or "RollingUpdate") of GPU Operands
# note that driver Daemonset is always set with OnDelete to avoid unintended disruptions
updateStrategy: "RollingUpdate"
# configuration for controlling rolling update of GPU Operands
rollingUpdate:
# maximum number of nodes to simultaneously apply pod updates on.
# can be specified either as number or percentage of nodes. Default 1.
maxUnavailable: "1"
validator:
repository: nvcr.io/nvidia/cloud-native
image: gpu-operator-validator
# If version is not specified, then default is to use chart.AppVersion
#version: ""
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
args: []
resources: {}
plugin:
env:
- name: WITH_WORKLOAD
value: "false"
operator:
repository: nvcr.io/nvidia
image: gpu-operator
# If version is not specified, then default is to use chart.AppVersion
#version: ""
imagePullPolicy: IfNotPresent
imagePullSecrets: []
priorityClassName: system-node-critical
defaultRuntime: docker
runtimeClass: nvidia
use_ocp_driver_toolkit: false
# cleanup CRD on chart un-install
cleanupCRD: false
# upgrade CRD on chart upgrade, requires --disable-openapi-validation flag
# to be passed during helm upgrade.
upgradeCRD: true
initContainer:
image: cuda
repository: nvcr.io/nvidia
version: 12.6.3-base-ubi9
imagePullPolicy: IfNotPresent
tolerations:
- key: "node-role.kubernetes.io/master"
operator: "Equal"
value: ""
effect: "NoSchedule"
- key: "node-role.kubernetes.io/control-plane"
operator: "Equal"
value: ""
effect: "NoSchedule"
annotations:
openshift.io/scc: restricted-readonly
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 1
preference:
matchExpressions:
- key: "node-role.kubernetes.io/master"
operator: In
values: [""]
- weight: 1
preference:
matchExpressions:
- key: "node-role.kubernetes.io/control-plane"
operator: In
values: [""]
logging:
# Zap time encoding (one of 'epoch', 'millis', 'nano', 'iso8601', 'rfc3339' or 'rfc3339nano')
timeEncoding: epoch
# Zap Level to configure the verbosity of logging. Can be one of 'debug', 'info', 'error', or any integer value > 0 which corresponds to custom debug levels of increasing verbosity
level: info
# Development Mode defaults(encoder=consoleEncoder,logLevel=Debug,stackTraceLevel=Warn)
# Production Mode defaults(encoder=jsonEncoder,logLevel=Info,stackTraceLevel=Error)
develMode: false
resources:
limits:
cpu: 500m
memory: 350Mi
requests:
cpu: 200m
memory: 100Mi
mig:
strategy: single
driver:
enabled: true
nvidiaDriverCRD:
enabled: false
deployDefaultCR: true
driverType: gpu
nodeSelector: {}
useOpenKernelModules: false
# use pre-compiled packages for NVIDIA driver installation.
# only supported as a tech-preview feature on ubuntu22.04 kernels.
usePrecompiled: false
repository: nvcr.io/nvidia
image: driver
version: "550.127.08"
imagePullPolicy: IfNotPresent
imagePullSecrets: []
startupProbe:
initialDelaySeconds: 60
periodSeconds: 10
# nvidia-smi can take longer than 30s in some cases
# ensure enough timeout is set
timeoutSeconds: 60
failureThreshold: 120
rdma:
enabled: false
useHostMofed: false
upgradePolicy:
# global switch for automatic upgrade feature
# if set to false all other options are ignored
autoUpgrade: true
# how many nodes can be upgraded in parallel
# 0 means no limit, all nodes will be upgraded in parallel
maxParallelUpgrades: 1
# maximum number of nodes with the driver installed, that can be unavailable during
# the upgrade. Value can be an absolute number (ex: 5) or
# a percentage of total nodes at the start of upgrade (ex:
# 10%). Absolute number is calculated from percentage by rounding
# up. By default, a fixed value of 25% is used.
maxUnavailable: 25%
# options for waiting on pod(job) completions
waitForCompletion:
timeoutSeconds: 0
podSelector: ""
# options for gpu pod deletion
gpuPodDeletion:
force: false
timeoutSeconds: 300
deleteEmptyDir: false
# options for node drain (`kubectl drain`) before the driver reload
# this is required only if default GPU pod deletions done by the operator
# are not sufficient to re-install the driver
drain:
enable: false
force: false
podSelector: ""
# It's recommended to set a timeout to avoid infinite drain in case non-fatal error keeps happening on retries
timeoutSeconds: 300
deleteEmptyDir: false
manager:
image: k8s-driver-manager
repository: nvcr.io/nvidia/cloud-native
# When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4
# to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0
version: v0.7.0
imagePullPolicy: IfNotPresent
env:
- name: ENABLE_GPU_POD_EVICTION
value: "true"
- name: ENABLE_AUTO_DRAIN
value: "false"
- name: DRAIN_USE_FORCE
value: "false"
- name: DRAIN_POD_SELECTOR_LABEL
value: ""
- name: DRAIN_TIMEOUT_SECONDS
value: "0s"
- name: DRAIN_DELETE_EMPTYDIR_DATA
value: "false"
env: []
resources: {}
# Private mirror repository configuration
repoConfig:
configMapName: ""
# custom ssl key/certificate configuration
certConfig:
name: ""
# vGPU licensing configuration
licensingConfig:
configMapName: ""
nlsEnabled: true
# vGPU topology daemon configuration
virtualTopology:
config: ""
# kernel module configuration for NVIDIA driver
kernelModuleConfig:
name: ""
toolkit:
enabled: true
repository: nvcr.io/nvidia/k8s
image: container-toolkit
version: v1.17.3-ubuntu20.04
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
resources: {}
installDir: "/usr/local/nvidia"
devicePlugin:
enabled: true
repository: nvcr.io/nvidia
image: k8s-device-plugin
version: v0.17.0
imagePullPolicy: IfNotPresent
imagePullSecrets: []
args: []
env:
- name: PASS_DEVICE_SPECS
value: "true"
- name: FAIL_ON_INIT_ERROR
value: "true"
- name: DEVICE_LIST_STRATEGY
value: envvar
- name: DEVICE_ID_STRATEGY
value: uuid
- name: NVIDIA_VISIBLE_DEVICES
value: all
- name: NVIDIA_DRIVER_CAPABILITIES
value: all
resources: {}
# Plugin configuration
# Use "name" to either point to an existing ConfigMap or to create a new one with a list of configurations(i.e with create=true).
# Use "data" to build an integrated ConfigMap from a set of configurations as
# part of this helm chart. An example of setting "data" might be:
# config:
# name: device-plugin-config
# create: true
# data:
# default: |-
# version: v1
# flags:
# migStrategy: none
# mig-single: |-
# version: v1
# flags:
# migStrategy: single
# mig-mixed: |-
# version: v1
# flags:
# migStrategy: mixed
config:
# Create a ConfigMap (default: false)
create: false
# ConfigMap name (either existing or to create a new one with create=true above)
name: ""
# Default config name within the ConfigMap
default: ""
# Data section for the ConfigMap to create (i.e only applies when create=true)
data: {}
# MPS related configuration for the plugin
mps:
# MPS root path on the host
root: "/run/nvidia/mps"
# standalone dcgm hostengine
dcgm:
# disabled by default to use embedded nv-hostengine by exporter
enabled: false
repository: nvcr.io/nvidia/cloud-native
image: dcgm
version: 3.3.9-1-ubuntu22.04
imagePullPolicy: IfNotPresent
args: []
env: []
resources: {}
dcgmExporter:
enabled: true
repository: nvcr.io/nvidia/k8s
image: dcgm-exporter
version: 3.3.9-3.6.1-ubuntu22.04
imagePullPolicy: IfNotPresent
env:
- name: DCGM_EXPORTER_LISTEN
value: ":9400"
- name: DCGM_EXPORTER_KUBERNETES
value: "true"
- name: DCGM_EXPORTER_COLLECTORS
value: "/etc/dcgm-exporter/dcp-metrics-included.csv"
resources: {}
serviceMonitor:
enabled: false
interval: 15s
honorLabels: false
additionalLabels: {}
relabelings: []
# - source_labels:
# - __meta_kubernetes_pod_node_name
# regex: (.*)
# target_label: instance
# replacement: $1
# action: replace
# DCGM Exporter configuration
# This block is used to configure DCGM Exporter to emit a customized list of metrics.
# Use "name" to either point to an existing ConfigMap or to create a new one with a
# list of configurations (i.e with create=true).
# When pointing to an existing ConfigMap, the ConfigMap must exist in the same namespace as the release.
# The metrics are expected to be listed under a key called `dcgm-metrics.csv`.
# Use "data" to build an integrated ConfigMap from a set of custom metrics as
# part of the chart. An example of some custom metrics are shown below. Note that
# the contents of "data" must be in CSV format and be valid DCGM Exporter metric configurations.
# config:
# name: custom-dcgm-exporter-metrics
# create: true
# data: |-
# Format
# If line starts with a '#' it is considered a comment
# DCGM FIELD, Prometheus metric type, help message
# Clocks
# DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
# DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
gfd:
enabled: true
repository: nvcr.io/nvidia
image: k8s-device-plugin
version: v0.17.0
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env:
- name: GFD_SLEEP_INTERVAL
value: 60s
- name: GFD_FAIL_ON_INIT_ERROR
value: "true"
resources: {}
migManager:
enabled: true
repository: nvcr.io/nvidia/cloud-native
image: k8s-mig-manager
version: v0.10.0-ubuntu20.04
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env:
- name: WITH_REBOOT
value: "false"
resources: {}
# MIG configuration
# Use "name" to either point to an existing ConfigMap or to create a new one with a list of configurations(i.e with create=true).
# Use "data" to build an integrated ConfigMap from a set of configurations as
# part of this helm chart. An example of setting "data" might be:
# config:
# name: custom-mig-parted-configs
# create: true
# data:
# config.yaml: |-
# version: v1
# mig-configs:
# all-disabled:
# - devices: all
# mig-enabled: false
# custom-mig:
# - devices: [0]
# mig-enabled: false
# - devices: [1]
# mig-enabled: true
# mig-devices:
# "1g.10gb": 7
# - devices: [2]
# mig-enabled: true
# mig-devices:
# "2g.20gb": 2
# "3g.40gb": 1
# - devices: [3]
# mig-enabled: true
# mig-devices:
# "3g.40gb": 1
# "4g.40gb": 1
config:
default: "all-disabled"
# Create a ConfigMap (default: false)
create: false
# ConfigMap name (either existing or to create a new one with create=true above)
name: ""
# Data section for the ConfigMap to create (i.e only applies when create=true)
data: {}
gpuClientsConfig:
name: ""
nodeStatusExporter:
enabled: false
repository: nvcr.io/nvidia/cloud-native
image: gpu-operator-validator
# If version is not specified, then default is to use chart.AppVersion
#version: ""
imagePullPolicy: IfNotPresent
imagePullSecrets: []
resources: {}
gds:
enabled: false
repository: nvcr.io/nvidia/cloud-native
image: nvidia-fs
version: "2.20.5"
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
args: []
gdrcopy:
enabled: false
repository: nvcr.io/nvidia/cloud-native
image: gdrdrv
version: "v2.4.1-2"
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
args: []
vgpuManager:
enabled: false
repository: ""
image: vgpu-manager
version: ""
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
resources: {}
driverManager:
image: k8s-driver-manager
repository: nvcr.io/nvidia/cloud-native
# When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4
# to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0
version: v0.7.0
imagePullPolicy: IfNotPresent
env:
- name: ENABLE_GPU_POD_EVICTION
value: "false"
- name: ENABLE_AUTO_DRAIN
value: "false"
vgpuDeviceManager:
enabled: true
repository: nvcr.io/nvidia/cloud-native
image: vgpu-device-manager
version: v0.2.8
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
config:
name: ""
default: "default"
vfioManager:
enabled: true
repository: nvcr.io/nvidia
image: cuda
version: 12.6.3-base-ubi9
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
resources: {}
driverManager:
image: k8s-driver-manager
repository: nvcr.io/nvidia/cloud-native
# When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4
# to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0
version: v0.7.0
imagePullPolicy: IfNotPresent
env:
- name: ENABLE_GPU_POD_EVICTION
value: "false"
- name: ENABLE_AUTO_DRAIN
value: "false"
kataManager:
enabled: false
config:
artifactsDir: "/opt/nvidia-gpu-operator/artifacts/runtimeclasses"
runtimeClasses:
- name: kata-nvidia-gpu
nodeSelector: {}
artifacts:
url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03
pullSecret: ""
- name: kata-nvidia-gpu-snp
nodeSelector:
"nvidia.com/cc.capable": "true"
artifacts:
url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp
pullSecret: ""
repository: nvcr.io/nvidia/cloud-native
image: k8s-kata-manager
version: v0.2.2
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
resources: {}
sandboxDevicePlugin:
enabled: true
repository: nvcr.io/nvidia
image: kubevirt-gpu-device-plugin
version: v1.2.10
imagePullPolicy: IfNotPresent
imagePullSecrets: []
args: []
env: []
resources: {}
ccManager:
enabled: false
defaultMode: "off"
repository: nvcr.io/nvidia/cloud-native
image: k8s-cc-manager
version: v0.1.1
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env:
- name: CC_CAPABLE_DEVICE_IDS
value: "0x2339,0x2331,0x2330,0x2324,0x2322,0x233d"
resources: {}
node-feature-discovery:
enableNodeFeatureApi: true
priorityClassName: system-node-critical
gc:
enable: true
replicaCount: 1
serviceAccount:
name: node-feature-discovery
create: false
worker:
serviceAccount:
name: node-feature-discovery
# disable creation to avoid duplicate serviceaccount creation by master spec below
create: false
tolerations:
- key: "node-role.kubernetes.io/master"
operator: "Equal"
value: ""
effect: "NoSchedule"
- key: "node-role.kubernetes.io/control-plane"
operator: "Equal"
value: ""
effect: "NoSchedule"
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
config:
sources:
pci:
deviceClassWhitelist:
- "02"
- "0200"
- "0207"
- "0300"
- "0302"
deviceLabelFields:
- vendor
master:
serviceAccount:
name: node-feature-discovery
create: true
config:
extraLabelNs: ["nvidia.com"]
# noPublish: false
# resourceLabels: ["nvidia.com/feature-1","nvidia.com/feature-2"]
# enableTaints: false
# labelWhiteList: "nvidia.com/gpu"

View file

@ -0,0 +1,15 @@
# ConfigMap named time-slicing-config-all in the gpu-operator namespace.
# The namespace is hard-coded here rather than taken from a release value.
apiVersion: v1
kind: ConfigMap
metadata:
name: time-slicing-config-all
namespace: gpu-operator
# One entry, keyed "any". Its value is an embedded config document that sets
# migStrategy to none and lists nvidia.com/gpu with replicas: 4 under
# sharing.timeSlicing.
data:
any: |-
version: v1
flags:
migStrategy: none
sharing:
timeSlicing:
resources:
- name: nvidia.com/gpu
replicas: 4