mirror of
https://github.com/kemko/nomad.git
synced 2026-01-04 01:15:43 +03:00
nvidia: support disabling the nvidia plugin (#8353)
This commit is contained in:
@@ -336,7 +336,11 @@ START:
|
||||
|
||||
// Start fingerprinting
|
||||
fingerprintCh, err := devicePlugin.Fingerprint(i.ctx)
|
||||
if err != nil {
|
||||
if err == device.ErrPluginDisabled {
|
||||
i.logger.Info("fingerprinting failed: plugin is not enabled")
|
||||
i.handleFingerprintError()
|
||||
return
|
||||
} else if err != nil {
|
||||
i.logger.Error("fingerprinting failed", "error", err)
|
||||
i.handleFingerprintError()
|
||||
return
|
||||
|
||||
@@ -28,9 +28,7 @@ const (
|
||||
// notAvailable value is returned to nomad server in case some properties were
|
||||
// undetected by nvml driver
|
||||
notAvailable = "N/A"
|
||||
)
|
||||
|
||||
const (
|
||||
// Nvidia-container-runtime environment variable names
|
||||
NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
|
||||
)
|
||||
@@ -59,6 +57,10 @@ var (
|
||||
|
||||
// configSpec is the specification of the plugin's configuration
|
||||
configSpec = hclspec.NewObject(map[string]*hclspec.Spec{
|
||||
"enabled": hclspec.NewDefault(
|
||||
hclspec.NewAttr("enabled", "bool", false),
|
||||
hclspec.NewLiteral("true"),
|
||||
),
|
||||
"ignored_gpu_ids": hclspec.NewDefault(
|
||||
hclspec.NewAttr("ignored_gpu_ids", "list(string)", false),
|
||||
hclspec.NewLiteral("[]"),
|
||||
@@ -72,12 +74,16 @@ var (
|
||||
|
||||
// Config contains configuration information for the plugin.
|
||||
type Config struct {
|
||||
Enabled bool `codec:"enabled"`
|
||||
IgnoredGPUIDs []string `codec:"ignored_gpu_ids"`
|
||||
FingerprintPeriod string `codec:"fingerprint_period"`
|
||||
}
|
||||
|
||||
// NvidiaDevice contains all plugin specific data
|
||||
type NvidiaDevice struct {
|
||||
// enabled indicates whether the plugin should be enabled
|
||||
enabled bool
|
||||
|
||||
// nvmlClient is used to get data from nvidia
|
||||
nvmlClient nvml.NvmlClient
|
||||
|
||||
@@ -133,6 +139,8 @@ func (d *NvidiaDevice) SetConfig(cfg *base.Config) error {
|
||||
}
|
||||
}
|
||||
|
||||
d.enabled = config.Enabled
|
||||
|
||||
for _, ignoredGPUId := range config.IgnoredGPUIDs {
|
||||
d.ignoredGPUIDs[ignoredGPUId] = struct{}{}
|
||||
}
|
||||
@@ -149,6 +157,10 @@ func (d *NvidiaDevice) SetConfig(cfg *base.Config) error {
|
||||
// Fingerprint streams detected devices. If device changes are detected or the
|
||||
// devices health changes, messages will be emitted.
|
||||
func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) {
|
||||
if !d.enabled {
|
||||
return nil, device.ErrPluginDisabled
|
||||
}
|
||||
|
||||
outCh := make(chan *device.FingerprintResponse)
|
||||
go d.fingerprint(ctx, outCh)
|
||||
return outCh, nil
|
||||
@@ -169,6 +181,10 @@ func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation
|
||||
if len(deviceIDs) == 0 {
|
||||
return &device.ContainerReservation{}, nil
|
||||
}
|
||||
if !d.enabled {
|
||||
return nil, device.ErrPluginDisabled
|
||||
}
|
||||
|
||||
// Due to the asynchronous nature of NvidiaPlugin, there is a possibility
|
||||
// of race condition
|
||||
//
|
||||
@@ -202,6 +218,10 @@ func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation
|
||||
|
||||
// Stats streams statistics for the detected devices.
|
||||
func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) {
|
||||
if !d.enabled {
|
||||
return nil, device.ErrPluginDisabled
|
||||
}
|
||||
|
||||
outCh := make(chan *device.StatsResponse)
|
||||
go d.stats(ctx, outCh, interval)
|
||||
return outCh, nil
|
||||
|
||||
@@ -26,7 +26,7 @@ func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) {
|
||||
}
|
||||
|
||||
func TestReserve(t *testing.T) {
|
||||
for _, testCase := range []struct {
|
||||
cases := []struct {
|
||||
Name string
|
||||
ExpectedReservation *device.ContainerReservation
|
||||
ExpectedError error
|
||||
@@ -47,7 +47,8 @@ func TestReserve(t *testing.T) {
|
||||
"UUID3",
|
||||
},
|
||||
Device: &NvidiaDevice{
|
||||
logger: hclog.NewNullLogger(),
|
||||
logger: hclog.NewNullLogger(),
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -66,7 +67,8 @@ func TestReserve(t *testing.T) {
|
||||
devices: map[string]struct{}{
|
||||
"UUID3": {},
|
||||
},
|
||||
logger: hclog.NewNullLogger(),
|
||||
logger: hclog.NewNullLogger(),
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -88,7 +90,8 @@ func TestReserve(t *testing.T) {
|
||||
"UUID2": {},
|
||||
"UUID3": {},
|
||||
},
|
||||
logger: hclog.NewNullLogger(),
|
||||
logger: hclog.NewNullLogger(),
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -102,13 +105,36 @@ func TestReserve(t *testing.T) {
|
||||
"UUID2": {},
|
||||
"UUID3": {},
|
||||
},
|
||||
logger: hclog.NewNullLogger(),
|
||||
logger: hclog.NewNullLogger(),
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
} {
|
||||
actualReservation, actualError := testCase.Device.Reserve(testCase.RequestedIDs)
|
||||
req := require.New(t)
|
||||
req.Equal(testCase.ExpectedReservation, actualReservation)
|
||||
req.Equal(testCase.ExpectedError, actualError)
|
||||
{
|
||||
Name: "Device is disabled",
|
||||
ExpectedReservation: nil,
|
||||
ExpectedError: device.ErrPluginDisabled,
|
||||
RequestedIDs: []string{
|
||||
"UUID1",
|
||||
"UUID2",
|
||||
"UUID3",
|
||||
},
|
||||
Device: &NvidiaDevice{
|
||||
devices: map[string]struct{}{
|
||||
"UUID1": {},
|
||||
"UUID2": {},
|
||||
"UUID3": {},
|
||||
},
|
||||
logger: hclog.NewNullLogger(),
|
||||
enabled: false,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run(c.Name, func(t *testing.T) {
|
||||
actualReservation, actualError := c.Device.Reserve(c.RequestedIDs)
|
||||
require.Equal(t, c.ExpectedReservation, actualReservation)
|
||||
require.Equal(t, c.ExpectedError, actualError)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,6 +15,11 @@ const (
|
||||
DeviceTypeGPU = "gpu"
|
||||
)
|
||||
|
||||
var (
|
||||
// ErrPluginDisabled indicates that the device plugin is disabled
|
||||
ErrPluginDisabled = fmt.Errorf("device is not enabled")
|
||||
)
|
||||
|
||||
// DevicePlugin is the interface for a plugin that can expose detected devices
|
||||
// to Nomad and inform it how to mount them.
|
||||
type DevicePlugin interface {
|
||||
|
||||
Reference in New Issue
Block a user