mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 18:35:44 +03:00
nvidia package restructure + build non-linux
This commit is contained in:
@@ -3,8 +3,8 @@ package main
|
||||
import (
|
||||
log "github.com/hashicorp/go-hclog"
|
||||
|
||||
"github.com/hashicorp/nomad/devices/gpu/nvidia"
|
||||
"github.com/hashicorp/nomad/plugins"
|
||||
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia"
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -9,9 +9,9 @@ import (
|
||||
|
||||
log "github.com/hashicorp/go-hclog"
|
||||
|
||||
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
|
||||
"github.com/hashicorp/nomad/plugins/base"
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
|
||||
"github.com/hashicorp/nomad/plugins/shared/hclspec"
|
||||
)
|
||||
|
||||
@@ -73,9 +73,9 @@ type NvidiaDevice struct {
|
||||
// nvmlClient is used to get data from nvidia
|
||||
nvmlClient nvml.NvmlClient
|
||||
|
||||
// nvmlClientInitializationError holds an error retrieved during
|
||||
// initErr holds an error retrieved during
|
||||
// nvmlClient initialization
|
||||
nvmlClientInitializationError error
|
||||
initErr error
|
||||
|
||||
// ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad
|
||||
ignoredGPUIDs map[string]struct{}
|
||||
@@ -96,17 +96,17 @@ type NvidiaDevice struct {
|
||||
|
||||
// NewNvidiaDevice returns a new nvidia device plugin.
|
||||
func NewNvidiaDevice(log log.Logger) *NvidiaDevice {
|
||||
nvmlClient, nvmlClientInitializationError := nvml.NewNvmlClient()
|
||||
nvmlClient, err := nvml.NewNvmlClient()
|
||||
logger := log.Named(pluginName)
|
||||
if nvmlClientInitializationError != nil {
|
||||
logger.Error("unable to initialize Nvidia driver", "error", nvmlClientInitializationError)
|
||||
if err != nil && err.Error() != nvml.UnavailableLib.Error() {
|
||||
logger.Error("unable to initialize Nvidia driver", "reason", err)
|
||||
}
|
||||
return &NvidiaDevice{
|
||||
logger: logger,
|
||||
devices: make(map[string]struct{}),
|
||||
ignoredGPUIDs: make(map[string]struct{}),
|
||||
nvmlClient: nvmlClient,
|
||||
nvmlClientInitializationError: nvmlClientInitializationError,
|
||||
logger: logger,
|
||||
devices: make(map[string]struct{}),
|
||||
ignoredGPUIDs: make(map[string]struct{}),
|
||||
nvmlClient: nvmlClient,
|
||||
initErr: err,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,9 +3,8 @@ package nvidia
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
|
||||
|
||||
hclog "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
@@ -5,8 +5,8 @@ import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -26,11 +26,16 @@ const (
|
||||
func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) {
|
||||
defer close(devices)
|
||||
|
||||
if d.nvmlClientInitializationError != nil {
|
||||
d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.nvmlClientInitializationError)
|
||||
// write empty fingerprint response to let server know that there are
|
||||
// no working Nvidia GPU units
|
||||
devices <- device.NewFingerprint()
|
||||
if d.initErr != nil {
|
||||
if d.initErr.Error() != nvml.UnavailableLib.Error() {
|
||||
d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.initErr)
|
||||
devices <- device.NewFingerprintError(d.initErr)
|
||||
} else {
|
||||
// write empty fingerprint response to let server know that there are
|
||||
// no working Nvidia GPU units
|
||||
devices <- device.NewFingerprint()
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
@@ -51,7 +56,6 @@ func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.F
|
||||
// writeFingerprintToChannel makes nvml call and writes response to channel
|
||||
func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) {
|
||||
fingerprintData, err := d.nvmlClient.GetFingerprintData()
|
||||
|
||||
if err != nil {
|
||||
d.logger.Error("failed to get fingerprint nvidia devices", "error", err)
|
||||
devices <- device.NewFingerprintError(err)
|
||||
@@ -7,9 +7,9 @@ import (
|
||||
"testing"
|
||||
|
||||
hclog "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
@@ -197,8 +197,10 @@ func TestIgnoreFingerprintedDevices(t *testing.T) {
|
||||
ExpectedResult: nil,
|
||||
},
|
||||
} {
|
||||
actualResult := ignoreFingerprintedDevices(testCase.DeviceData, testCase.IgnoredGPUIds)
|
||||
require.New(t).Equal(testCase.ExpectedResult, actualResult)
|
||||
t.Run(testCase.Name, func(t *testing.T) {
|
||||
actualResult := ignoreFingerprintedDevices(testCase.DeviceData, testCase.IgnoredGPUIds)
|
||||
require.New(t).Equal(testCase.ExpectedResult, actualResult)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -342,12 +344,14 @@ func TestCheckFingerprintUpdates(t *testing.T) {
|
||||
DeviceMapAfterMethodCall: map[string]struct{}{},
|
||||
},
|
||||
} {
|
||||
actualResult := testCase.Device.fingerprintChanged(testCase.AllDevices)
|
||||
req := require.New(t)
|
||||
// check that function returns valid "updated / not updated" state
|
||||
req.Equal(testCase.ExpectedResult, actualResult)
|
||||
// check that function properly updates devices map
|
||||
req.Equal(testCase.Device.devices, testCase.DeviceMapAfterMethodCall)
|
||||
t.Run(testCase.Name, func(t *testing.T) {
|
||||
actualResult := testCase.Device.fingerprintChanged(testCase.AllDevices)
|
||||
req := require.New(t)
|
||||
// check that function returns valid "updated / not updated" state
|
||||
req.Equal(testCase.ExpectedResult, actualResult)
|
||||
// check that function properly updates devices map
|
||||
req.Equal(testCase.Device.devices, testCase.DeviceMapAfterMethodCall)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -554,8 +558,10 @@ func TestAttributesFromFingerprintDeviceData(t *testing.T) {
|
||||
},
|
||||
},
|
||||
} {
|
||||
actualResult := attributesFromFingerprintDeviceData(testCase.FingerprintDeviceData)
|
||||
require.New(t).Equal(testCase.ExpectedResult, actualResult)
|
||||
t.Run(testCase.Name, func(t *testing.T) {
|
||||
actualResult := attributesFromFingerprintDeviceData(testCase.FingerprintDeviceData)
|
||||
require.New(t).Equal(testCase.ExpectedResult, actualResult)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -715,8 +721,10 @@ func TestDeviceGroupFromFingerprintData(t *testing.T) {
|
||||
ExpectedResult: nil,
|
||||
},
|
||||
} {
|
||||
actualResult := deviceGroupFromFingerprintData(testCase.GroupName, testCase.Devices, testCase.CommonAttributes)
|
||||
require.New(t).Equal(testCase.ExpectedResult, actualResult)
|
||||
t.Run(testCase.Name, func(t *testing.T) {
|
||||
actualResult := deviceGroupFromFingerprintData(testCase.GroupName, testCase.Devices, testCase.CommonAttributes)
|
||||
require.New(t).Equal(testCase.ExpectedResult, actualResult)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1070,20 +1078,22 @@ func TestWriteFingerprintToChannel(t *testing.T) {
|
||||
},
|
||||
},
|
||||
} {
|
||||
channel := make(chan *device.FingerprintResponse, 1)
|
||||
testCase.Device.writeFingerprintToChannel(channel)
|
||||
actualResult := <-channel
|
||||
// writeFingerprintToChannel iterates over map keys
|
||||
// and inserts results into an array, so the order of elements in the output array
|
||||
// may be different
|
||||
// the actualResult and expectedResult arrays have to be sorted first
|
||||
sort.Slice(actualResult.Devices, func(i, j int) bool {
|
||||
return actualResult.Devices[i].Name < actualResult.Devices[j].Name
|
||||
t.Run(testCase.Name, func(t *testing.T) {
|
||||
channel := make(chan *device.FingerprintResponse, 1)
|
||||
testCase.Device.writeFingerprintToChannel(channel)
|
||||
actualResult := <-channel
|
||||
// writeFingerprintToChannel iterates over map keys
|
||||
// and inserts results into an array, so the order of elements in the output array
|
||||
// may be different
|
||||
// the actualResult and expectedResult arrays have to be sorted first
|
||||
sort.Slice(actualResult.Devices, func(i, j int) bool {
|
||||
return actualResult.Devices[i].Name < actualResult.Devices[j].Name
|
||||
})
|
||||
sort.Slice(testCase.ExpectedWriteToChannel.Devices, func(i, j int) bool {
|
||||
return testCase.ExpectedWriteToChannel.Devices[i].Name < testCase.ExpectedWriteToChannel.Devices[j].Name
|
||||
})
|
||||
require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult)
|
||||
})
|
||||
sort.Slice(testCase.ExpectedWriteToChannel.Devices, func(i, j int) bool {
|
||||
return testCase.ExpectedWriteToChannel.Devices[i].Name < testCase.ExpectedWriteToChannel.Devices[j].Name
|
||||
})
|
||||
require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1097,7 +1107,7 @@ func TestFingerprint(t *testing.T) {
|
||||
{
|
||||
Name: "Check that working driver returns valid fingeprint data",
|
||||
Device: &NvidiaDevice{
|
||||
nvmlClientInitializationError: nil,
|
||||
initErr: nil,
|
||||
nvmlClient: &MockNvmlClient{
|
||||
FingerprintResponseReturned: &nvml.FingerprintData{
|
||||
DriverVersion: "1",
|
||||
@@ -1197,9 +1207,9 @@ func TestFingerprint(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "Check that not working driver returns empty fingeprint data",
|
||||
Name: "Check that not working driver returns error fingeprint data",
|
||||
Device: &NvidiaDevice{
|
||||
nvmlClientInitializationError: errors.New(""),
|
||||
initErr: errors.New("foo"),
|
||||
nvmlClient: &MockNvmlClient{
|
||||
FingerprintResponseReturned: &nvml.FingerprintData{
|
||||
DriverVersion: "1",
|
||||
@@ -1230,14 +1240,18 @@ func TestFingerprint(t *testing.T) {
|
||||
},
|
||||
logger: hclog.NewNullLogger(),
|
||||
},
|
||||
ExpectedWriteToChannel: &device.FingerprintResponse{},
|
||||
ExpectedWriteToChannel: &device.FingerprintResponse{
|
||||
Error: errors.New("foo"),
|
||||
},
|
||||
},
|
||||
} {
|
||||
outCh := make(chan *device.FingerprintResponse)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
go testCase.Device.fingerprint(ctx, outCh)
|
||||
result := <-outCh
|
||||
cancel()
|
||||
require.New(t).Equal(result, testCase.ExpectedWriteToChannel)
|
||||
t.Run(testCase.Name, func(t *testing.T) {
|
||||
outCh := make(chan *device.FingerprintResponse)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
go testCase.Device.fingerprint(ctx, outCh)
|
||||
result := <-outCh
|
||||
cancel()
|
||||
require.New(t).Equal(result, testCase.ExpectedWriteToChannel)
|
||||
})
|
||||
}
|
||||
}
|
||||
33
devices/gpu/nvidia/nvml/driver_default.go
Normal file
33
devices/gpu/nvidia/nvml/driver_default.go
Normal file
@@ -0,0 +1,33 @@
|
||||
// +build !linux
|
||||
|
||||
package nvml
|
||||
|
||||
// Initialize is the non-linux stub; it does not load nvml and always returns UnavailableLib
|
||||
func (n *nvmlDriver) Initialize() error {
|
||||
return UnavailableLib
|
||||
}
|
||||
|
||||
// Shutdown stops any further interaction with nvml
|
||||
func (n *nvmlDriver) Shutdown() error {
|
||||
return UnavailableLib
|
||||
}
|
||||
|
||||
// SystemDriverVersion returns installed driver version
|
||||
func (n *nvmlDriver) SystemDriverVersion() (string, error) {
|
||||
return "", UnavailableLib
|
||||
}
|
||||
|
||||
// DeviceCount reports number of available GPU devices
|
||||
func (n *nvmlDriver) DeviceCount() (uint, error) {
|
||||
return 0, UnavailableLib
|
||||
}
|
||||
|
||||
// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list
|
||||
func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
|
||||
return nil, UnavailableLib
|
||||
}
|
||||
|
||||
// DeviceInfoAndStatusByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list
|
||||
func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
|
||||
return nil, nil, UnavailableLib
|
||||
}
|
||||
@@ -4,59 +4,6 @@ import (
|
||||
"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
|
||||
)
|
||||
|
||||
// DeviceInfo represents nvml device data
|
||||
// this struct is returned by NvmlDriver DeviceInfoByIndex and
|
||||
// DeviceInfoAndStatusByIndex methods
|
||||
type DeviceInfo struct {
|
||||
// The following fields are guaranteed to be retrieved from nvml
|
||||
UUID string
|
||||
PCIBusID string
|
||||
DisplayState string
|
||||
PersistenceMode string
|
||||
|
||||
// The following fields can be nil after call to nvml, because nvml was
|
||||
// not able to retrieve these fields for the specific nvidia card
|
||||
Name *string
|
||||
MemoryMiB *uint64
|
||||
PowerW *uint
|
||||
BAR1MiB *uint64
|
||||
PCIBandwidthMBPerS *uint
|
||||
CoresClockMHz *uint
|
||||
MemoryClockMHz *uint
|
||||
}
|
||||
|
||||
// DeviceStatus represents nvml device status
|
||||
// this struct is returned by NvmlDriver DeviceInfoAndStatusByIndex method
|
||||
type DeviceStatus struct {
|
||||
// The following fields can be nil after call to nvml, because nvml was
|
||||
// not able to retrieve these fields for the specific nvidia card
|
||||
PowerUsageW *uint
|
||||
TemperatureC *uint
|
||||
GPUUtilization *uint // %
|
||||
MemoryUtilization *uint // %
|
||||
EncoderUtilization *uint // %
|
||||
DecoderUtilization *uint // %
|
||||
BAR1UsedMiB *uint64
|
||||
UsedMemoryMiB *uint64
|
||||
ECCErrorsL1Cache *uint64
|
||||
ECCErrorsL2Cache *uint64
|
||||
ECCErrorsDevice *uint64
|
||||
}
|
||||
|
||||
// NvmlDriver represents set of methods to query nvml library
|
||||
type NvmlDriver interface {
|
||||
Initialize() error
|
||||
Shutdown() error
|
||||
SystemDriverVersion() (string, error)
|
||||
DeviceCount() (uint, error)
|
||||
DeviceInfoByIndex(uint) (*DeviceInfo, error)
|
||||
DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error)
|
||||
}
|
||||
|
||||
// nvmlDriver implements NvmlDriver
|
||||
// Users are required to call Initialize method before using any other methods
|
||||
type nvmlDriver struct{}
|
||||
|
||||
// Initialize nvml library by locating nvml shared object file and calling ldopen
|
||||
func (n *nvmlDriver) Initialize() error {
|
||||
return nvml.Init()
|
||||
61
devices/gpu/nvidia/nvml/shared.go
Normal file
61
devices/gpu/nvidia/nvml/shared.go
Normal file
@@ -0,0 +1,61 @@
|
||||
package nvml
|
||||
|
||||
import "errors"
|
||||
|
||||
var (
|
||||
// UnavailableLib is returned when the nvml library could not be loaded.
|
||||
UnavailableLib = errors.New("could not load NVML library")
|
||||
)
|
||||
|
||||
// nvmlDriver implements NvmlDriver
|
||||
// Users are required to call Initialize method before using any other methods
|
||||
type nvmlDriver struct{}
|
||||
|
||||
// NvmlDriver represents set of methods to query nvml library
|
||||
type NvmlDriver interface {
|
||||
Initialize() error
|
||||
Shutdown() error
|
||||
SystemDriverVersion() (string, error)
|
||||
DeviceCount() (uint, error)
|
||||
DeviceInfoByIndex(uint) (*DeviceInfo, error)
|
||||
DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error)
|
||||
}
|
||||
|
||||
// DeviceInfo represents nvml device data
|
||||
// this struct is returned by NvmlDriver DeviceInfoByIndex and
|
||||
// DeviceInfoAndStatusByIndex methods
|
||||
type DeviceInfo struct {
|
||||
// The following fields are guaranteed to be retrieved from nvml
|
||||
UUID string
|
||||
PCIBusID string
|
||||
DisplayState string
|
||||
PersistenceMode string
|
||||
|
||||
// The following fields can be nil after call to nvml, because nvml was
|
||||
// not able to retrieve these fields for the specific nvidia card
|
||||
Name *string
|
||||
MemoryMiB *uint64
|
||||
PowerW *uint
|
||||
BAR1MiB *uint64
|
||||
PCIBandwidthMBPerS *uint
|
||||
CoresClockMHz *uint
|
||||
MemoryClockMHz *uint
|
||||
}
|
||||
|
||||
// DeviceStatus represents nvml device status
|
||||
// this struct is returned by NvmlDriver DeviceInfoAndStatusByIndex method
|
||||
type DeviceStatus struct {
|
||||
// The following fields can be nil after call to nvml, because nvml was
|
||||
// not able to retrieve these fields for the specific nvidia card
|
||||
PowerUsageW *uint
|
||||
TemperatureC *uint
|
||||
GPUUtilization *uint // %
|
||||
MemoryUtilization *uint // %
|
||||
EncoderUtilization *uint // %
|
||||
DecoderUtilization *uint // %
|
||||
BAR1UsedMiB *uint64
|
||||
UsedMemoryMiB *uint64
|
||||
ECCErrorsL1Cache *uint64
|
||||
ECCErrorsL2Cache *uint64
|
||||
ECCErrorsDevice *uint64
|
||||
}
|
||||
@@ -4,8 +4,8 @@ import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -53,8 +53,12 @@ const (
|
||||
func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse) {
|
||||
defer close(stats)
|
||||
|
||||
if d.nvmlClientInitializationError != nil {
|
||||
d.logger.Error("exiting stats due to problems with NVML loading", "error", d.nvmlClientInitializationError)
|
||||
if d.initErr != nil {
|
||||
if d.initErr.Error() != nvml.UnavailableLib.Error() {
|
||||
d.logger.Error("exiting stats due to problems with NVML loading", "error", d.initErr)
|
||||
stats <- device.NewStatsError(d.initErr)
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
@@ -7,9 +7,9 @@ import (
|
||||
"time"
|
||||
|
||||
hclog "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
@@ -141,6 +141,13 @@ type StatsResponse struct {
|
||||
Error error
|
||||
}
|
||||
|
||||
// NewStatsError takes an error and returns a stats response
|
||||
func NewStatsError(err error) *StatsResponse {
|
||||
return &StatsResponse{
|
||||
Error: err,
|
||||
}
|
||||
}
|
||||
|
||||
// DeviceGroupStats contains statistics for each device of a particular
|
||||
// device group, identified by the vendor, type and name of the device.
|
||||
type DeviceGroupStats struct {
|
||||
|
||||
Reference in New Issue
Block a user