Skip to content

Commit cf5d952

Browse files
committed
refactor(nodeadm): PCIe detection for GPU instance types
1 parent 28f65e2 commit cf5d952

File tree

6 files changed

+132
-104
lines changed

6 files changed

+132
-104
lines changed

nodeadm/internal/containerd/config.go

+17-8
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,15 @@ type containerdTemplateVars struct {
3030
RuntimeBinaryName string
3131
}
3232

33-
func writeContainerdConfig(cfg *api.NodeConfig) error {
34-
if err := writeBaseRuntimeSpec(cfg); err != nil {
35-
return err
36-
}
33+
type configurator interface {
34+
// Transform mutates a given containerd template configuration.
35+
Transform(*containerdTemplateVars)
36+
37+
// Matches determines whether the configurator is relevant to the current instance.
38+
Matches(*api.NodeConfig) bool
39+
}
3740

41+
func writeContainerdConfig(cfg *api.NodeConfig) error {
3842
containerdConfig, err := generateContainerdConfig(cfg)
3943
if err != nil {
4044
return err
@@ -59,12 +63,17 @@ func writeContainerdConfig(cfg *api.NodeConfig) error {
5963
}
6064

6165
func generateContainerdConfig(cfg *api.NodeConfig) ([]byte, error) {
62-
instanceOptions := applyInstanceTypeMixins(cfg.Status.Instance.Type)
63-
6466
configVars := containerdTemplateVars{
6567
SandboxImage: cfg.Status.Defaults.SandboxImage,
66-
RuntimeBinaryName: instanceOptions.RuntimeBinaryName,
67-
RuntimeName: instanceOptions.RuntimeName,
68+
RuntimeBinaryName: "/usr/sbin/runc",
69+
RuntimeName: "runc",
70+
}
71+
for _, configurator := range []configurator{
72+
NewNvidiaModifier(),
73+
} {
74+
if configurator.Matches(cfg) {
75+
configurator.Transform(&configVars)
76+
}
6877
}
6978
var buf bytes.Buffer
7079
if err := containerdConfigTemplate.Execute(&buf, configVars); err != nil {

nodeadm/internal/containerd/daemon.go

+3
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ func NewContainerdDaemon(daemonManager daemon.DaemonManager) daemon.Daemon {
2020
}
2121

2222
func (cd *containerd) Configure(c *api.NodeConfig) error {
23+
if err := writeBaseRuntimeSpec(c); err != nil {
24+
return err
25+
}
2326
return writeContainerdConfig(c)
2427
}
2528

nodeadm/internal/containerd/nvidia.go

+65
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package containerd
2+
3+
import (
4+
"os"
5+
"slices"
6+
"strings"
7+
8+
"github.com/awslabs/amazon-eks-ami/nodeadm/internal/api"
9+
"go.uber.org/zap"
10+
)
11+
12+
type nvidiaModifier struct {
13+
pcieDevicesPath string
14+
}
15+
16+
func NewNvidiaModifier() *nvidiaModifier {
17+
return &nvidiaModifier{
18+
pcieDevicesPath: "/proc/bus/pci/devices",
19+
}
20+
}
21+
22+
func (m *nvidiaModifier) Matches(cfg *api.NodeConfig) bool {
23+
return m.matchesInstanceType(cfg.Status.Instance.Type) || m.matchesPCIeVendor()
24+
}
25+
26+
func (*nvidiaModifier) Transform(ctrdTemplate *containerdTemplateVars) {
27+
zap.L().Info("Configuring NVIDIA runtime..")
28+
ctrdTemplate.RuntimeName = "nvidia"
29+
ctrdTemplate.RuntimeBinaryName = "/usr/bin/nvidia-container-runtime"
30+
}
31+
32+
var nvidiaInstanceFamilies = []string{
33+
"p3", "p3dn",
34+
"p4d", "p4de",
35+
"p5", "p5e", "p5en",
36+
"g4", "g4dn",
37+
"g5", "g5g",
38+
"g6", "g6e",
39+
}
40+
41+
// TODO: deprecate to avoid manual instance type tracking.
42+
func (*nvidiaModifier) matchesInstanceType(instanceType string) bool {
43+
family := strings.Split(instanceType, ".")[0]
44+
return slices.Contains(nvidiaInstanceFamilies, family)
45+
}
46+
47+
func (m *nvidiaModifier) matchesPCIeVendor() bool {
48+
devices, err := os.ReadFile(m.pcieDevicesPath)
49+
if err != nil {
50+
zap.L().Error("Failed to read PCIe devices", zap.Error(err))
51+
return false
52+
}
53+
// The contents of '/proc/bus/pci/devices' look like the following, where
54+
// the last column contains the name of the bound kernel driver, if any.
55+
//
56+
// For example:
57+
//
58+
// 0018 1d0f1111 0 c1000008 0 0 0 0 0 c0002 400000 0 0 0 0 0 20000
59+
// 0020 1d0f8061 b c1508000 0 0 0 0 0 0 4000 0 0 0 0 0 0 nvme
60+
// 0028 1d0fec20 0 c1504000 0 c1400008 0 0 0 0 4000 0 100000 0 0 0 0 ena
61+
// 00f0 10de1eb8 a c0000000 44000000c 0 45000000c 0 0 0 1000000 10000000 0 2000000 0 0 0 nvidia
62+
// 00f8 1d0fcd01 0 c1500000 0 c150c008 0 0 0 0 4000 0 2000 0 0 0 0 nvme
63+
// 0030 1d0fec20 0 c1510000 0 c1600008 0 0 0 0 4000 0 100000 0 0 0 0 ena
64+
return strings.Contains(string(devices), "nvidia")
65+
}
+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
package containerd
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
"testing"
7+
8+
"github.com/awslabs/amazon-eks-ami/nodeadm/internal/api"
9+
"github.com/stretchr/testify/assert"
10+
)
11+
12+
func TestNvidiaConfigurator(t *testing.T) {
13+
14+
t.Run("IsNvidiaUsingInstanceType", func(t *testing.T) {
15+
configurator := nvidiaModifier{}
16+
template := containerdTemplateVars{}
17+
assert.True(t, configurator.Matches(nvidiaInstanceTypeNodeConfig("p5.48xlarge")))
18+
configurator.Transform(&template)
19+
assert.Equal(t, "nvidia", template.RuntimeName)
20+
assert.Equal(t, "/usr/bin/nvidia-container-runtime", template.RuntimeBinaryName)
21+
})
22+
23+
t.Run("IsNvidiaUsingPCIe", func(t *testing.T) {
24+
configurator := nvidiaModifier{pcieDevicesPath: filepath.Join(t.TempDir(), "pcie-devices")}
25+
os.WriteFile(configurator.pcieDevicesPath, []byte("nvidia"), 0777)
26+
template := containerdTemplateVars{}
27+
assert.True(t, configurator.Matches(nvidiaInstanceTypeNodeConfig("xxx.xxxxx")))
28+
configurator.Transform(&template)
29+
assert.Equal(t, "nvidia", template.RuntimeName)
30+
assert.Equal(t, "/usr/bin/nvidia-container-runtime", template.RuntimeBinaryName)
31+
})
32+
33+
t.Run("IsNotNvidia", func(t *testing.T) {
34+
configurator := nvidiaModifier{}
35+
assert.False(t, configurator.Matches(nvidiaInstanceTypeNodeConfig("m5.large")))
36+
})
37+
}
38+
39+
func nvidiaInstanceTypeNodeConfig(instanceType string) *api.NodeConfig {
40+
return &api.NodeConfig{
41+
Status: api.NodeConfigStatus{
42+
Instance: api.InstanceDetails{
43+
Type: instanceType,
44+
},
45+
},
46+
}
47+
}

nodeadm/internal/containerd/runtime_config.go

-65
This file was deleted.

nodeadm/internal/containerd/runtime_config_test.go

-31
This file was deleted.

0 commit comments

Comments
 (0)