1 Star 0 Fork 51

wangzihao/ascend-device-plugin

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
main.go 9.07 KB
一键复制 编辑 原始数据 按行查看 历史
/* Copyright(C) 2022-2023. Huawei Technologies Co.,Ltd. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package main implements initialization of the startup parameters of the device plugin.
package main
import (
"context"
"flag"
"fmt"
"os"
"huawei.com/npu-exporter/v6/common-utils/hwlog"
"huawei.com/npu-exporter/v6/devmanager"
"Ascend-device-plugin/pkg/common"
"Ascend-device-plugin/pkg/device/deviceswitch"
"Ascend-device-plugin/pkg/server"
)
// Defaults and valid ranges for the command-line flags declared in this file.
const (
// defaultLogPath is the default run-log file path (default value of the logFile flag)
defaultLogPath = "/var/log/mindx-dl/devicePlugin/devicePlugin.log"
// defaultListWatchPeriod is the default listening device state's period, unit second
defaultListWatchPeriod = 5
// maxListWatchPeriod is the max listening device state's period, unit second
maxListWatchPeriod = 60
// minListWatchPeriod is the min listening device state's period, unit second
minListWatchPeriod = 3
// maxLogLineLength is the value passed to hwlog as LogConfig.MaxLineLength
maxLogLineLength = 1024
// defaultLinkdownTimeout is the default linkdown timeout duration
defaultLinkdownTimeout = 30
// maxLinkdownTimeout is the max linkdown timeout duration
maxLinkdownTimeout = 30
// minLinkdownTimeout is the min linkdown timeout duration
minLinkdownTimeout = 1
)
// Command-line flags of the device plugin. They are validated by checkParam and copied into
// common.ParamOption by setParameters (the log-related flags are consumed by initLogModule).
var (
	// fdFlag also selects edgeLogFile instead of logFile as the log path in initLogModule.
	fdFlag = flag.Bool("fdFlag", false, "Whether to use fd system to manage device (default false)")
	// useAscendDocker is overwritten at startup by setUseAscendDocker, which derives the
	// effective value from the ASCEND_DOCKER_RUNTIME environment variable.
	useAscendDocker = flag.Bool("useAscendDocker", true, "Whether to use ascend docker. "+
		"This parameter will be deprecated in future versions")
	volcanoType = flag.Bool("volcanoType", false,
		"Specifies whether to use volcano for scheduling when the chip type is Ascend310 or Ascend910 (default false)")
	version = flag.Bool("version", false, "Output version information")
	edgeLogFile = flag.String("edgeLogFile", "/var/alog/AtlasEdge_log/devicePlugin.log",
		"Log file path in edge scene")
	listWatchPeriod = flag.Int("listWatchPeriod", defaultListWatchPeriod,
		"Listen and watch device state's period, unit second, range [3, 60]")
	autoStowing = flag.Bool("autoStowing", true, "Whether to automatically stow the fixed device")
	logLevel = flag.Int("logLevel", 0,
		"Log level, -1-debug, 0-info, 1-warning, 2-error, 3-critical(default 0)")
	logMaxAge = flag.Int("maxAge", common.MaxAge,
		"Maximum number of days for backup run log files, range [7, 700] days")
	logFile = flag.String("logFile", defaultLogPath,
		"The log file path, if the file size exceeds 20MB, will be rotate")
	logMaxBackups = flag.Int("maxBackups", common.MaxBackups,
		"Maximum number of backup log files, range is (0, 30]")
	presetVirtualDevice = flag.Bool("presetVirtualDevice", true, "Open the static of "+
		"computing power splitting function, only support Ascend910 and Ascend310P")
	use310PMixedInsert = flag.Bool("use310PMixedInsert", false, "Whether to use mixed insert "+
		"ascend310P-V, ascend310P-VPro, ascend310P-IPro card mode")
	hotReset = flag.Int("hotReset", -1, "set hot reset mode: -1-close, 0-infer, "+
		"1-train-online, 2-train-offline")
	shareDevCount = flag.Uint("shareDevCount", 1, "share device function, enable the func by setting "+
		"a value greater than 1, range is [1, 100], only support 310B")
	// The usage text previously concatenated "duration, " with ", range", printing a doubled comma.
	linkdownTimeout = flag.Int64("linkdownTimeout", defaultLinkdownTimeout,
		"linkdown timeout duration, range [1, 30]")
	dealWatchHandler = flag.Bool("dealWatchHandler", false,
		"update pod cache when receiving pod informer watch errors")
	checkCachedPods = flag.Bool("checkCachedPods", true, "check pods in cache periodically, default true")
)
// Build metadata. Nothing in this file assigns these variables; presumably they are injected
// at link time (e.g. via -ldflags "-X ...") — TODO confirm against the build script.
var (
// BuildName show app name, printed by the -version flag
BuildName string
// BuildVersion show app version, printed by the -version flag and logged at startup
BuildVersion string
// BuildScene show app starting scene; checkParam only accepts common.EdgeScene or common.CenterScene
BuildScene string
)
// initLogModule initializes the hwlog run logger from the log-related flags.
//
// The log path is *logFile, or *edgeLogFile when fdFlag is set. The path must pass
// common.CheckFileUserSameWithProcess before the logger is created. ctx is handed to
// hwlog.InitRunLogger unchanged.
//
// It returns a non-nil error when the file check fails or when hwlog cannot be initialized;
// in the latter case the error is also printed to stdout because no logger exists yet.
func initLogModule(ctx context.Context) error {
	path := *logFile
	if *fdFlag {
		path = *edgeLogFile
	}
	if ok := common.CheckFileUserSameWithProcess(path); !ok {
		return fmt.Errorf("check log file failed")
	}

	cfg := &hwlog.LogConfig{
		LogFileName:   path,
		LogLevel:      *logLevel,
		MaxBackups:    *logMaxBackups,
		MaxAge:        *logMaxAge,
		MaxLineLength: maxLogLineLength,
	}
	err := hwlog.InitRunLogger(cfg, ctx)
	if err == nil {
		return nil
	}
	fmt.Printf("hwlog init failed, error is %v\n", err)
	return err
}
// checkParam validates the parsed command-line flags and the build scene.
//
// It returns true only when every check passes. Each failed check is logged through
// hwlog.RunLog at error level and makes the function return false, which causes main to
// abort startup. The checks are, in order:
//   - listWatchPeriod lies within [minListWatchPeriod, maxListWatchPeriod];
//   - presetVirtualDevice=false requires volcanoType=true;
//   - use310PMixedInsert excludes volcanoType and shareDevCount > 1;
//   - shareDevCount > 1 additionally requires presetVirtualDevice=true and volcanoType=false;
//   - hotReset is one of the common.HotReset* modes;
//   - BuildScene is common.EdgeScene or common.CenterScene;
//   - linkdownTimeout lies within [minLinkdownTimeout, maxLinkdownTimeout];
//   - shareDevCount is within the range enforced by checkShareDevCount.
func checkParam() bool {
	if *listWatchPeriod < minListWatchPeriod || *listWatchPeriod > maxListWatchPeriod {
		hwlog.RunLog.Errorf("list and watch period %d out of range", *listWatchPeriod)
		return false
	}
	if !(*presetVirtualDevice) && !(*volcanoType) {
		hwlog.RunLog.Error("presetVirtualDevice is false, volcanoType should be true")
		return false
	}
	if *use310PMixedInsert && *volcanoType {
		hwlog.RunLog.Error("use310PMixedInsert is true, volcanoType should be false")
		return false
	}
	if *use310PMixedInsert && *shareDevCount > 1 {
		hwlog.RunLog.Error("use310PMixedInsert is true, shareDevCount should be 1")
		return false
	}
	if !(*presetVirtualDevice) && *shareDevCount > 1 {
		hwlog.RunLog.Error("presetVirtualDevice is false, shareDevCount should be 1")
		return false
	}
	if *volcanoType && *shareDevCount > 1 {
		hwlog.RunLog.Error("volcanoType is true, shareDevCount should be 1")
		return false
	}
	switch *hotReset {
	case common.HotResetClose, common.HotResetInfer, common.HotResetTrainOnLine, common.HotResetTrainOffLine:
	default:
		hwlog.RunLog.Error("hot reset mode param invalid")
		return false
	}
	if BuildScene != common.EdgeScene && BuildScene != common.CenterScene {
		hwlog.RunLog.Error("unSupport build scene, only support edge and center")
		return false
	}
	if *linkdownTimeout < minLinkdownTimeout || *linkdownTimeout > maxLinkdownTimeout {
		// This failure aborts startup like every other check, so it is reported at error
		// level (it used to be a warning, which a stricter logLevel would presumably hide)
		// and includes the rejected value.
		hwlog.RunLog.Errorf("linkdown timeout duration %d out of range", *linkdownTimeout)
		return false
	}
	return checkShareDevCount()
}
// checkShareDevCount reports whether the shareDevCount flag lies within
// [1, common.MaxShareDevCount]. An out-of-range value is logged as an error.
func checkShareDevCount() bool {
	count := *shareDevCount
	if count >= 1 && count <= common.MaxShareDevCount {
		return true
	}
	hwlog.RunLog.Error("share device function params invalid")
	return false
}
// main is the entry point of the device plugin.
//
// It parses the flags, handles -version, initializes logging, validates the parameters,
// publishes them through setParameters, builds the device manager with InitFunction,
// resolves the ascend-docker setting, then starts hdm.ListenDevice in a goroutine and hands
// the cancel function to hdm.SignCatch (presumably blocking until a termination signal
// arrives — confirm in the server package).
func main() {
	flag.Parse()
	if *version {
		fmt.Printf("%s version: %s\n", BuildName, BuildVersion)
		return
	}
	ctx, cancel := context.WithCancel(context.Background())
	// Release the context on every exit path, including the early returns below, which
	// previously left cancel uncalled. Calling cancel more than once is harmless.
	defer cancel()
	if err := initLogModule(ctx); err != nil {
		return
	}
	if !checkParam() {
		return
	}
	hwlog.RunLog.Infof("ascend device plugin starting and the version is %s", BuildVersion)
	hwlog.RunLog.Infof("ascend device plugin starting scene is %s", BuildScene)
	// setParameters must run before InitFunction, which reads common.ParamOption.
	setParameters()
	hdm, err := InitFunction()
	if err != nil {
		return
	}
	// Runs after InitFunction because it reads common.ParamOption.ProductTypes, which
	// setParameters does not set (presumably it is filled during device manager init).
	setUseAscendDocker()
	go hdm.ListenDevice(ctx)
	hdm.SignCatch(cancel)
}
// InitFunction creates the device manager used by the plugin.
//
// It initializes the low-level devmanager, wraps it in a server.HwDevManager and, when the
// real card type is common.Ascend910A3, tries to attach a switch device manager. A failure
// to initialize the switch device manager is not fatal: switch-fault handling is disabled
// and the device manager is still returned.
//
// It returns a non-nil error only when devmanager.AutoInit fails or NewHwDevManager returns nil.
func InitFunction() (*server.HwDevManager, error) {
	dmgr, initErr := devmanager.AutoInit("")
	if initErr != nil {
		hwlog.RunLog.Errorf("init devmanager failed, err: %v", initErr)
		return nil, initErr
	}

	manager := server.NewHwDevManager(dmgr)
	if manager == nil {
		hwlog.RunLog.Error("init device manager failed")
		return nil, fmt.Errorf("init device manager failed")
	}
	hwlog.RunLog.Info("init device manager success")

	// Switch-fault handling is enabled by default and only used for Ascend910A3 cards.
	common.ParamOption.EnableSwitchFault = true
	if common.ParamOption.RealCardType != common.Ascend910A3 || !common.ParamOption.EnableSwitchFault {
		return manager, nil
	}

	switchMgr := deviceswitch.NewSwitchDevManager()
	switchErr := switchMgr.InitSwitchDev()
	if switchErr == nil {
		manager.SwitchDevManager = switchMgr
		return manager, nil
	}

	hwlog.RunLog.Warnf("failed to init switch switch device manager, will not deal with switch fault, "+
		"err: %s", switchErr.Error())
	common.ParamOption.EnableSwitchFault = false
	// No error is returned, so the device plugin keeps running while the switch is not reachable.
	return manager, nil
}
// setParameters replaces common.ParamOption with a fresh common.Option built from the
// parsed flags and BuildScene. Fields that are not assigned here are reset to their zero values.
func setParameters() {
	var opt common.Option

	opt.GetFdFlag = *fdFlag
	opt.UseAscendDocker = *useAscendDocker
	opt.UseVolcanoType = *volcanoType
	opt.AutoStowingDevs = *autoStowing
	opt.ListAndWatchPeriod = *listWatchPeriod
	opt.PresetVDevice = *presetVirtualDevice
	opt.Use310PMixedInsert = *use310PMixedInsert
	opt.HotReset = *hotReset
	opt.BuildScene = BuildScene
	opt.ShareCount = *shareDevCount
	opt.LinkdownTimeout = *linkdownTimeout
	opt.DealWatchHandler = *dealWatchHandler
	opt.CheckCachedPods = *checkCachedPods

	common.ParamOption = opt
}
func setUseAscendDocker() {
*useAscendDocker = true
ascendDocker := os.Getenv("ASCEND_DOCKER_RUNTIME")
if ascendDocker != "True" {
*useAscendDocker = false
hwlog.RunLog.Debugf("get ASCEND_DOCKER_RUNTIME from env is: %#v", ascendDocker)
}
if common.ParamOption.Use310PMixedInsert {
*useAscendDocker = false
hwlog.RunLog.Debugf("310P mixed insert mode do not use ascend docker")
}
if len(common.ParamOption.ProductTypes) == 1 && common.ParamOption.ProductTypes[0] == common.Atlas200ISoc {
*useAscendDocker = false
hwlog.RunLog.Debugf("your device-type is: %v", common.Atlas200ISoc)
}
common.ParamOption.UseAscendDocker = *useAscendDocker
hwlog.RunLog.Infof("device-plugin set ascend docker as: %v", *useAscendDocker)
}
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Go
1
https://gitee.com/zhobb/ascend-device-plugin.git
git@gitee.com:zhobb/ascend-device-plugin.git
zhobb
ascend-device-plugin
ascend-device-plugin
master

搜索帮助