From f96b8d75843ec053d072f64add4e862b70282a07 Mon Sep 17 00:00:00 2001 From: Davanum Srinivas Date: Sat, 22 Feb 2025 11:14:16 -0500 Subject: [PATCH] GPUs != Neuron Devices Signed-off-by: Davanum Srinivas --- go.mod | 5 ++--- go.sum | 12 ++++++++---- .../instance_selector/instance_selector_test.go | 1 + pkg/apis/eksctl.io/v1alpha5/types.go | 3 +++ pkg/apis/eksctl.io/v1alpha5/validation.go | 8 ++++++++ pkg/ctl/cmdutils/configfile.go | 3 +++ pkg/ctl/cmdutils/nodegroup_flags.go | 1 + pkg/ctl/create/cluster.go | 2 +- pkg/ctl/create/nodegroup.go | 2 +- pkg/ctl/get/nodegroup.go | 2 +- pkg/ctl/scale/nodegroup.go | 2 +- pkg/ctl/update/nodegroup.go | 2 +- pkg/ctl/upgrade/nodegroup.go | 2 +- pkg/eks/fakes/fake_instance_selector.go | 2 +- pkg/eks/nodegroup_service.go | 10 ++++++++-- userdocs/src/usage/instance-selector.md | 2 +- 16 files changed, 42 insertions(+), 17 deletions(-) diff --git a/go.mod b/go.mod index 4e83c2ee3e..bfba8977c1 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ go 1.24.0 require ( github.com/Masterminds/semver/v3 v3.3.1 - github.com/aws/amazon-ec2-instance-selector/v2 v2.4.2-0.20231216170552-14d4dfcbaadf + github.com/aws/amazon-ec2-instance-selector/v3 v3.1.1-0.20250224180552-36eea73b44c2 github.com/aws/aws-sdk-go v1.55.6 github.com/aws/aws-sdk-go-v2 v1.36.2 github.com/aws/aws-sdk-go-v2/config v1.29.7 @@ -253,7 +253,6 @@ require ( github.com/hexops/gotextdiff v1.0.3 // indirect github.com/huandu/xstrings v1.5.0 // indirect github.com/iancoleman/strcase v0.3.0 // indirect - github.com/imdario/mergo v0.3.16 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/jessevdk/go-flags v1.6.1 // indirect github.com/jgautheron/goconst v1.7.1 // indirect @@ -311,7 +310,7 @@ require ( github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect github.com/muesli/cancelreader v0.2.2 // indirect github.com/muesli/reflow v0.3.0 // indirect - github.com/muesli/termenv v0.15.2 // indirect + github.com/muesli/termenv v0.16.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/nakabonne/nestif v0.3.1 // indirect diff --git a/go.sum b/go.sum index 6b06b42ee8..a43f549447 100644 --- a/go.sum +++ b/go.sum @@ -102,8 +102,10 @@ github.com/ashanbrown/makezero v1.2.0 h1:/2Lp1bypdmK9wDIq7uWBlDF1iMUpIIS4A+pF6C9 github.com/ashanbrown/makezero v1.2.0/go.mod h1:dxlPhHbDMC6N6xICzFBSK+4njQDdK8euNO0qjQMtGY4= github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4= github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI= -github.com/aws/amazon-ec2-instance-selector/v2 v2.4.2-0.20231216170552-14d4dfcbaadf h1:1zems5/6/Fs+1dFsjTZ+oSogVHkfGl1VWuttRXYGx+0= -github.com/aws/amazon-ec2-instance-selector/v2 v2.4.2-0.20231216170552-14d4dfcbaadf/go.mod h1:zsxolOKwtNEvoOPScJy5+Bu8F72LZy7pqVJNhP8tqVE= +github.com/aws/amazon-ec2-instance-selector/v3 v3.1.0 h1:NtSErNyyzyMzV3RXD3HGMTYUHD+XcaHbAMQHFaoU5Y4= +github.com/aws/amazon-ec2-instance-selector/v3 v3.1.0/go.mod h1:S8Yga4m3aMYvvCDWE4DA72hywLmvY/yknG45QiW0l/M= +github.com/aws/amazon-ec2-instance-selector/v3 v3.1.1-0.20250224180552-36eea73b44c2 h1:vy7b8q6Cwn3j3HgzRBT7N99POtT1g6SuXlID9CI1yp8= +github.com/aws/amazon-ec2-instance-selector/v3 v3.1.1-0.20250224180552-36eea73b44c2/go.mod h1:RU/lVVsYHNN7Bwr2UmCw5z2aWPcNIHADY49bj082oYM= github.com/aws/aws-sdk-go v1.55.6 h1:cSg4pvZ3m8dgYcgqB97MrcdjUmZ1BeMYKUxMMB89IPk= github.com/aws/aws-sdk-go v1.55.6/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= github.com/aws/aws-sdk-go-v2 v1.36.2 h1:Ub6I4lq/71+tPb/atswvToaLGVMxKZvjYDVOWEExOcU= @@ -555,8 +557,6 @@ github.com/iancoleman/strcase v0.3.0/go.mod h1:iwCmte+B7n89clKwxIoIXy/HfoL7AsD47 github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/ianlancetaylor/demangle v0.0.0-20240312041847-bd984b5ce465/go.mod h1:gx7rwoVhcfuVKG5uya9Hs3Sxj7EIvldVofAWIUtGouw= github.com/imdario/mergo v0.3.9/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= -github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= -github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jessevdk/go-flags v1.6.1 h1:Cvu5U8UGrLay1rZfv/zP7iLpSHGUZ/Ou68T0iX1bBK4= @@ -728,6 +728,8 @@ github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s= github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8= github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo= github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8= +github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc= +github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= @@ -907,6 +909,8 @@ github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6g github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= github.com/sahilm/fuzzy v0.1.1 h1:ceu5RHF8DGgoi+/dR5PsECjCDH1BE3Fnmpo7aVXOdRA= github.com/sahilm/fuzzy v0.1.1/go.mod h1:VFvziUEIMCrT6A6tw2RFIXPXXmzXbOsSHF0DOI8ZK9Y= +github.com/samber/lo v1.47.0 h1:z7RynLwP5nbyRscyvcD043DWYoOcYRv3mV8lBeqOCLc= +github.com/samber/lo v1.47.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU= github.com/sanathkr/go-yaml v0.0.0-20170819195128-ed9d249f429b h1:jUK33OXuZP/l6babJtnLo1qsGvq6G9so9KMflGAm4YA= github.com/sanathkr/go-yaml v0.0.0-20170819195128-ed9d249f429b/go.mod h1:8458kAagoME2+LN5//WxE71ysZ3B7r22fdgb7qVmXSY= github.com/sanathkr/yaml v0.0.0-20170819201035-0056894fa522 h1:fOCp11H0yuyAt2wqlbJtbyPzSgaxHTv8uN1pMpkG1t8= diff --git a/integration/tests/instance_selector/instance_selector_test.go b/integration/tests/instance_selector/instance_selector_test.go index dd763461e4..49b6d66e1b 100644 --- a/integration/tests/instance_selector/instance_selector_test.go +++ b/integration/tests/instance_selector/instance_selector_test.go @@ -62,6 +62,7 @@ var _ = Describe("(Integration) [Instance Selector test]", func() { }, "--instance-selector-vcpus=8", "--instance-selector-memory=32", "--instance-selector-gpus=0", + "--instance-selector-neuron-devices=0", ), Entry("with vCPUs and memory", nil, "--instance-selector-vcpus=8", diff --git a/pkg/apis/eksctl.io/v1alpha5/types.go b/pkg/apis/eksctl.io/v1alpha5/types.go index 716c7fa164..9b7a91129c 100644 --- a/pkg/apis/eksctl.io/v1alpha5/types.go +++ b/pkg/apis/eksctl.io/v1alpha5/types.go @@ -1999,6 +1999,9 @@ type InstanceSelector struct { // GPUs specifies the number of GPUs. // It can be set to 0 to select non-GPU instance types. GPUs *int `json:"gpus,omitempty"` + // NeuronDevices specifies the number of Neuron device Accelerators. + // It can be set to 0 to select non-Accelerator instance types. + NeuronDevices *int32 `json:"neuron_devices,omitempty"` // CPU Architecture of the EC2 instance type. // Valid variants are: // `"x86_64"` diff --git a/pkg/apis/eksctl.io/v1alpha5/validation.go b/pkg/apis/eksctl.io/v1alpha5/validation.go index 0dcf40f0cc..9003eea506 100644 --- a/pkg/apis/eksctl.io/v1alpha5/validation.go +++ b/pkg/apis/eksctl.io/v1alpha5/validation.go @@ -62,6 +62,10 @@ var ( GPUDriversWarning = func(amiFamily string) string { return fmt.Sprintf("%s does not ship with NVIDIA GPU drivers installed, hence won't support running GPU-accelerated workloads out of the box", amiFamily) } + + NeuronDeviceDriversWarning = func(amiFamily string) string { + return fmt.Sprintf("%s does not ship with Neuron Devices drivers installed, hence won't support running inference-accelerated workloads out of the box", amiFamily) + } ) var ( @@ -736,6 +740,10 @@ func validateNodeGroupBase(np NodePool, path string, controlPlaneOnOutposts bool (ng.InstanceSelector.GPUs == nil || *ng.InstanceSelector.GPUs != 0) { logger.Warning("instance selector may/will select GPU instance types, " + GPUDriversWarning(ng.AMIFamily)) } + if ng.InstanceSelector != nil && !ng.InstanceSelector.IsZero() && + (ng.InstanceSelector.NeuronDevices == nil || *ng.InstanceSelector.NeuronDevices != 0) { + logger.Warning("instance selector may/will select Neuron Device instance types, " + NeuronDeviceDriversWarning(ng.AMIFamily)) + } } if ng.AMIFamily != NodeImageFamilyAmazonLinux2 && diff --git a/pkg/ctl/cmdutils/configfile.go b/pkg/ctl/cmdutils/configfile.go index d0b082e8aa..1371c81caa 100644 --- a/pkg/ctl/cmdutils/configfile.go +++ b/pkg/ctl/cmdutils/configfile.go @@ -674,6 +674,9 @@ func normalizeBaseNodeGroup(np api.NodePool, cmd *cobra.Command) { if !flags.Changed("instance-selector-gpus") { ng.InstanceSelector.GPUs = nil } + if !flags.Changed("instance-selector-neuron-devices") { + ng.InstanceSelector.NeuronDevices = nil + } if !flags.Changed("enable-ssm") { ng.SSH.EnableSSM = nil } diff --git a/pkg/ctl/cmdutils/nodegroup_flags.go b/pkg/ctl/cmdutils/nodegroup_flags.go index 50f4c751ea..d3d4013fe0 100644 --- a/pkg/ctl/cmdutils/nodegroup_flags.go +++ b/pkg/ctl/cmdutils/nodegroup_flags.go @@ -75,6 +75,7 @@ func AddInstanceSelectorOptions(flagSetGroup *NamedFlagSetGroup, ng *api.NodeGro fs.StringVar(&ng.InstanceSelector.Memory, "instance-selector-memory", "", "4 or 4GiB") fs.StringVar(&ng.InstanceSelector.CPUArchitecture, "instance-selector-cpu-architecture", "", "x86_64, or arm64") ng.InstanceSelector.GPUs = fs.Int("instance-selector-gpus", 0, "an integer value") + ng.InstanceSelector.NeuronDevices = fs.Int32("instance-selector-neuron-devices", 0, "an integer value") }) } diff --git a/pkg/ctl/create/cluster.go b/pkg/ctl/create/cluster.go index b66007762c..56303e1e11 100644 --- a/pkg/ctl/create/cluster.go +++ b/pkg/ctl/create/cluster.go @@ -20,7 +20,7 @@ import ( kubeclient "k8s.io/client-go/kubernetes" clientcmdlatest "k8s.io/client-go/tools/clientcmd/api/latest" - "github.com/aws/amazon-ec2-instance-selector/v2/pkg/selector" + "github.com/aws/amazon-ec2-instance-selector/v3/pkg/selector" "github.com/weaveworks/eksctl/pkg/accessentry" accessentryactions "github.com/weaveworks/eksctl/pkg/actions/accessentry" "github.com/weaveworks/eksctl/pkg/actions/addon" diff --git a/pkg/ctl/create/nodegroup.go b/pkg/ctl/create/nodegroup.go index e0669774f5..91ce6f9ff0 100644 --- a/pkg/ctl/create/nodegroup.go +++ b/pkg/ctl/create/nodegroup.go @@ -5,7 +5,7 @@ import ( "fmt" "io" - "github.com/aws/amazon-ec2-instance-selector/v2/pkg/selector" + "github.com/aws/amazon-ec2-instance-selector/v3/pkg/selector" "github.com/kris-nova/logger" "github.com/pkg/errors" diff --git a/pkg/ctl/get/nodegroup.go b/pkg/ctl/get/nodegroup.go index 4991391af7..566413f8ef 100644 --- a/pkg/ctl/get/nodegroup.go +++ b/pkg/ctl/get/nodegroup.go @@ -8,7 +8,7 @@ import ( "strconv" "time" - "github.com/aws/amazon-ec2-instance-selector/v2/pkg/selector" + "github.com/aws/amazon-ec2-instance-selector/v3/pkg/selector" "github.com/kris-nova/logger" "github.com/spf13/cobra" "github.com/spf13/pflag" diff --git a/pkg/ctl/scale/nodegroup.go b/pkg/ctl/scale/nodegroup.go index 1126480b57..1eb71dd312 100644 --- a/pkg/ctl/scale/nodegroup.go +++ b/pkg/ctl/scale/nodegroup.go @@ -3,7 +3,7 @@ package scale import ( "context" - "github.com/aws/amazon-ec2-instance-selector/v2/pkg/selector" + "github.com/aws/amazon-ec2-instance-selector/v3/pkg/selector" "github.com/spf13/cobra" "github.com/spf13/pflag" diff --git a/pkg/ctl/update/nodegroup.go b/pkg/ctl/update/nodegroup.go index e4e75f4a04..1ffe77dfb5 100644 --- a/pkg/ctl/update/nodegroup.go +++ b/pkg/ctl/update/nodegroup.go @@ -3,7 +3,7 @@ package update import ( "context" - "github.com/aws/amazon-ec2-instance-selector/v2/pkg/selector" + "github.com/aws/amazon-ec2-instance-selector/v3/pkg/selector" "github.com/lithammer/dedent" "github.com/spf13/cobra" diff --git a/pkg/ctl/upgrade/nodegroup.go b/pkg/ctl/upgrade/nodegroup.go index 7bf566cf11..0b85d6e984 100644 --- a/pkg/ctl/upgrade/nodegroup.go +++ b/pkg/ctl/upgrade/nodegroup.go @@ -4,7 +4,7 @@ import ( "context" "time" - "github.com/aws/amazon-ec2-instance-selector/v2/pkg/selector" + "github.com/aws/amazon-ec2-instance-selector/v3/pkg/selector" "github.com/spf13/cobra" "github.com/spf13/pflag" diff --git a/pkg/eks/fakes/fake_instance_selector.go b/pkg/eks/fakes/fake_instance_selector.go index c0fa903bb8..fae581e298 100644 --- a/pkg/eks/fakes/fake_instance_selector.go +++ b/pkg/eks/fakes/fake_instance_selector.go @@ -5,7 +5,7 @@ import ( "context" "sync" - "github.com/aws/amazon-ec2-instance-selector/v2/pkg/selector" + "github.com/aws/amazon-ec2-instance-selector/v3/pkg/selector" "github.com/weaveworks/eksctl/pkg/eks" ) diff --git a/pkg/eks/nodegroup_service.go b/pkg/eks/nodegroup_service.go index b088680893..ff7cf4cd63 100644 --- a/pkg/eks/nodegroup_service.go +++ b/pkg/eks/nodegroup_service.go @@ -7,8 +7,8 @@ import ( "regexp" "strings" - "github.com/aws/amazon-ec2-instance-selector/v2/pkg/bytequantity" - "github.com/aws/amazon-ec2-instance-selector/v2/pkg/selector" + "github.com/aws/amazon-ec2-instance-selector/v3/pkg/bytequantity" + "github.com/aws/amazon-ec2-instance-selector/v3/pkg/selector" "github.com/aws/aws-sdk-go-v2/aws" "github.com/kris-nova/logger" "github.com/pkg/errors" @@ -203,6 +203,12 @@ func (n *NodeGroupService) expandInstanceSelector(ins *api.InstanceSelector, azs if ins.GPUs != nil { filters.GpusRange = makeRange(*ins.GPUs) } + if ins.NeuronDevices != nil { + filters.InferenceAcceleratorsRange = &selector.Int32RangeFilter{ + LowerBound: *ins.NeuronDevices, + UpperBound: *ins.NeuronDevices, + } + } cpuArch := ins.CPUArchitecture if cpuArch == "" { cpuArch = defaultCPUArch diff --git a/userdocs/src/usage/instance-selector.md b/userdocs/src/usage/instance-selector.md index 0ba096d10e..5f7036f859 100644 --- a/userdocs/src/usage/instance-selector.md +++ b/userdocs/src/usage/instance-selector.md @@ -60,7 +60,7 @@ $ eksctl create cluster -f instance-selector-cluster.yaml The following instance selector CLI options are supported by `eksctl create cluster` and `eksctl create nodegroup`: -`--instance-selector-vcpus`, `--instance-selector-memory`, `--instance-selector-gpus` and `instance-selector-cpu-architecture` +`--instance-selector-vcpus`, `--instance-selector-memory`, `--instance-selector-gpus`, `--instance-selector-neuron-devices` and `instance-selector-cpu-architecture` ???+ note By default, GPU instance types are not filtered out. If you wish to do so (e.g. for cost effectiveness, when your applications don't particularly benefit from GPU-accelerated workloads), please explicitly set `gpus: 0` (via config file) or `--instance-selector-gpus=0` (via CLI flag).