Skip to content

Commit 233336b

Browse files
authored
fix: Fix GPU instance filtering when --instance-selector-gpus=0 (#8506)
Fix GPU instance filtering when --instance-selector-gpus=0 - Add workaround for AWS API bug where g6f instances incorrectly report GPU count as 0 - Filter out GPU instances using eksctl's local instance types map when GPUs=0 is specified The AWS DescribeInstanceTypes API incorrectly reports GPU count as 0 for g6f family instances (g6f.large, g6f.xlarge, g6f.2xlarge, g6f.4xlarge) even though they have 1 L4 GPU each. This caused the instance selector to include these GPU instances when users specified --instance-selector-gpus=0 (expecting only non-GPU instances). This fix adds an additional filtering step that uses eksctl's accurate local instance types map to remove GPU instances when GPUs=0 is specified, while preserving existing behavior for all other cases.
1 parent ac6854c commit 233336b

2 files changed

Lines changed: 114 additions & 0 deletions

File tree

pkg/eks/nodegroup_service.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"github.com/weaveworks/eksctl/pkg/cfn/manager"
2121
"github.com/weaveworks/eksctl/pkg/outposts"
2222
"github.com/weaveworks/eksctl/pkg/ssh"
23+
"github.com/weaveworks/eksctl/pkg/utils/instance"
2324
"github.com/weaveworks/eksctl/pkg/utils/tasks"
2425
)
2526

@@ -240,6 +241,22 @@ func (n *NodeGroupService) expandInstanceSelector(ins *api.InstanceSelector, azs
240241
return nil, errors.New("instance selector criteria matched no instances; consider broadening your criteria so that more instance types are returned")
241242
}
242243

244+
// Workaround for AWS API bug: filter out GPU instances when GPUs=0 is specified
245+
// AWS incorrectly reports GPU count as 0 for some GPU instances (e.g., g6f family)
246+
if ins.GPUs != nil && *ins.GPUs == 0 {
247+
filteredTypes := make([]string, 0, len(instanceTypes))
248+
for _, instanceType := range instanceTypes {
249+
if !instance.IsGPUInstanceType(instanceType) {
250+
filteredTypes = append(filteredTypes, instanceType)
251+
}
252+
}
253+
instanceTypes = filteredTypes
254+
255+
if len(instanceTypes) == 0 {
256+
return nil, errors.New("instance selector criteria matched no non-GPU instances; consider broadening your criteria")
257+
}
258+
}
259+
243260
return instanceTypes, nil
244261
}
245262

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
package eks
2+
3+
import (
4+
"context"
5+
"testing"
6+
7+
"github.com/aws/amazon-ec2-instance-selector/v3/pkg/selector"
8+
"github.com/stretchr/testify/assert"
9+
10+
api "github.com/weaveworks/eksctl/pkg/apis/eksctl.io/v1alpha5"
11+
)
12+
13+
// MockInstanceSelector for testing
14+
type MockInstanceSelector struct {
15+
returnInstances []string
16+
returnError error
17+
}
18+
19+
func (m *MockInstanceSelector) Filter(ctx context.Context, filters selector.Filters) ([]string, error) {
20+
return m.returnInstances, m.returnError
21+
}
22+
23+
func TestExpandInstanceSelector_GPUFiltering(t *testing.T) {
24+
tests := []struct {
25+
name string
26+
instanceSelector *api.InstanceSelector
27+
mockReturnInstances []string
28+
expectedInstances []string
29+
expectError bool
30+
}{
31+
{
32+
name: "filters out GPU instances when GPUs=0",
33+
instanceSelector: &api.InstanceSelector{
34+
VCPUs: 8,
35+
GPUs: newIntPtr(0),
36+
},
37+
mockReturnInstances: []string{"m5.2xlarge", "g6f.2xlarge", "c5.2xlarge", "g4dn.xlarge"},
38+
expectedInstances: []string{"m5.2xlarge", "c5.2xlarge"},
39+
expectError: false,
40+
},
41+
{
42+
name: "includes GPU instances when GPUs=1",
43+
instanceSelector: &api.InstanceSelector{
44+
VCPUs: 8,
45+
GPUs: newIntPtr(1),
46+
},
47+
mockReturnInstances: []string{"m5.2xlarge", "g6f.2xlarge", "c5.2xlarge", "g4dn.xlarge"},
48+
expectedInstances: []string{"m5.2xlarge", "g6f.2xlarge", "c5.2xlarge", "g4dn.xlarge"},
49+
expectError: false,
50+
},
51+
{
52+
name: "no filtering when GPUs is nil",
53+
instanceSelector: &api.InstanceSelector{
54+
VCPUs: 8,
55+
},
56+
mockReturnInstances: []string{"m5.2xlarge", "g6f.2xlarge", "c5.2xlarge"},
57+
expectedInstances: []string{"m5.2xlarge", "g6f.2xlarge", "c5.2xlarge"},
58+
expectError: false,
59+
},
60+
{
61+
name: "error when all instances are filtered out",
62+
instanceSelector: &api.InstanceSelector{
63+
VCPUs: 8,
64+
GPUs: newIntPtr(0),
65+
},
66+
mockReturnInstances: []string{"g6f.2xlarge", "g4dn.xlarge"},
67+
expectedInstances: nil,
68+
expectError: true,
69+
},
70+
}
71+
72+
for _, tt := range tests {
73+
t.Run(tt.name, func(t *testing.T) {
74+
mockSelector := &MockInstanceSelector{
75+
returnInstances: tt.mockReturnInstances,
76+
}
77+
78+
service := &NodeGroupService{
79+
instanceSelector: mockSelector,
80+
}
81+
82+
result, err := service.expandInstanceSelector(tt.instanceSelector, []string{"us-west-2a"})
83+
84+
if tt.expectError {
85+
assert.Error(t, err)
86+
assert.Nil(t, result)
87+
} else {
88+
assert.NoError(t, err)
89+
assert.Equal(t, tt.expectedInstances, result)
90+
}
91+
})
92+
}
93+
}
94+
95+
func newIntPtr(i int) *int {
96+
return &i
97+
}

0 commit comments

Comments
 (0)