1 Star 0 Fork 0

小应子/k8s-device-plugin

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
nvidia.go 2.16 KB
一键复制 编辑 原始数据 按行查看 历史
Renaud Gaubert 提交于 2018-05-09 16:45 . Add attributes to devices
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
package main
import (
"fmt"
"log"
"strings"
"github.com/NVIDIA/nvidia-docker/src/nvidia"
"github.com/NVIDIA/nvidia-docker/src/nvml"
"golang.org/x/net/context"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
)
// check aborts via log.Panicln, prefixing the message with "Fatal:", when err
// is non-nil. A nil err is a no-op.
func check(err error) {
	if err == nil {
		return
	}
	log.Panicln("Fatal:", err)
}
// getDevices calls nvidia.LookupDevices and converts every returned device
// into a pluginapi.Device. Each entry uses the device UUID as its ID, is
// marked pluginapi.Healthy, and carries three attributes keyed by
// resourceName plus "-memory", "-ECC" and "-arch".
//
// A lookup error is handed to check, which panics on a non-nil error.
func getDevices() []*pluginapi.Device {
	found, err := nvidia.LookupDevices()
	check(err)

	var result []*pluginapi.Device
	for _, gpu := range found {
		// Attribute values are the string renderings of the dereferenced
		// device fields.
		attrs := map[string]string{
			resourceName + "-memory": fmt.Sprintf("%d", *gpu.Memory.Global),
			resourceName + "-ECC":    fmt.Sprintf("%t", *gpu.Memory.ECC),
			resourceName + "-arch":   fmt.Sprintf("%s", *gpu.Arch),
		}

		entry := &pluginapi.Device{
			ID:         gpu.UUID,
			Health:     pluginapi.Healthy,
			Attributes: attrs,
		}
		result = append(result, entry)
	}

	return result
}
// deviceExists reports whether any device in devs has an ID equal to id.
func deviceExists(devs []*pluginapi.Device, id string) bool {
	found := false
	for i := 0; i < len(devs) && !found; i++ {
		found = devs[i].ID == id
	}
	return found
}
// watchXIDs registers every device in devs for nvml.XidCriticalError events
// and then loops, waiting for events, until ctx is done. Each device judged
// unhealthy is sent on xids.
//
// Registration: a device whose registration error ends in "Not Supported" is
// logged and sent on xids right away; any other registration error panics.
//
// Event loop: events whose Edata is 31, 43 or 45 are ignored. An event with a
// nil or empty UUID causes every device in devs to be sent on xids; otherwise
// only devices whose ID equals the event UUID are sent.
//
// Sends on xids are plain channel sends that are not guarded by ctx, so the
// receiver is expected to keep draining the channel.
func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device) {
	eventSet := nvml.NewEventSet()
	defer nvml.DeleteEventSet(eventSet)

	for _, d := range devs {
		err := nvml.RegisterEventForDevice(eventSet, nvml.XidCriticalError, d.ID)
		if err != nil && strings.HasSuffix(err.Error(), "Not Supported") {
			// The format string has two verbs; both the UUID and the
			// registration error must be supplied.
			log.Printf("Warning: GPU with UUID %s is too old to support healtchecking with error: %s. Marking it unhealthy.", d.ID, err)
			xids <- d
			continue
		}
		if err != nil {
			log.Panicln("Fatal:", err)
		}
	}

	for {
		// Non-blocking cancellation check before each wait.
		select {
		case <-ctx.Done():
			return
		default:
		}

		// NOTE(review): 5000 is presumably a timeout in milliseconds; confirm
		// against the nvml binding.
		e, err := nvml.WaitForEvent(eventSet, 5000)
		// Only act on a successfully received XID critical error. A failed
		// wait or an event of any other type is skipped.
		if err != nil || e.Etype != nvml.XidCriticalError {
			continue
		}

		// FIXME: formalize the full list and document it.
		// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
		// Application errors: the GPU should still be healthy
		if e.Edata == 31 || e.Edata == 43 || e.Edata == 45 {
			continue
		}

		if e.UUID == nil || len(*e.UUID) == 0 {
			// All devices are unhealthy
			for _, d := range devs {
				xids <- d
			}
			continue
		}

		for _, d := range devs {
			if d.ID == *e.UUID {
				xids <- d
			}
		}
	}
}
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/wukaiying01/k8s-device-plugin.git
git@gitee.com:wukaiying01/k8s-device-plugin.git
wukaiying01
k8s-device-plugin
k8s-device-plugin
nvidiak8s/v1.9

搜索帮助

0d507c66 1850385 C8b1a773 1850385