1 Star 0 Fork 10

cuiyili/eggo

forked from src-openEuler/eggo 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
0007-support-rollback-of-create-cluster.patch 17.48 KB
一键复制 编辑 原始数据 按行查看 历史
zhangxiaoyu 提交于 2021-08-03 15:28 . upgrade to v0.9.1-1
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569
From 9fd05bda9c5789bee45de142468fec0227929927 Mon Sep 17 00:00:00 2001
From: haozi007 <liuhao27@huawei.com>
Date: Thu, 29 Jul 2021 12:04:58 +0100
Subject: [PATCH 7/9] support rollback of create cluster
1. roll back if creating the cluster fails;
2. support partial success when creating the cluster;
Signed-off-by: haozi007 <liuhao27@huawei.com>
---
cmd/deploy.go | 66 +++++-
pkg/api/tools.go | 41 ++++
pkg/api/types.go | 7 +
pkg/clusterdeployment/binary/binary.go | 3 +-
.../binary/cleanupcluster/cleanupnode.go | 4 +-
pkg/clusterdeployment/clusterdeploy.go | 208 +++++++++++++-----
pkg/utils/nodemanager/node.go | 2 +-
pkg/utils/nodemanager/nodemanager.go | 27 ++-
pkg/utils/utils.go | 4 +
9 files changed, 292 insertions(+), 70 deletions(-)
diff --git a/cmd/deploy.go b/cmd/deploy.go
index 3aba440..12fdd8e 100644
--- a/cmd/deploy.go
+++ b/cmd/deploy.go
@@ -24,8 +24,64 @@ import (
"isula.org/eggo/pkg/clusterdeployment"
)
-func deploy(ccfg *api.ClusterConfig) error {
- return clusterdeployment.CreateCluster(ccfg)
+func deploy(conf *deployConfig) error {
+ if err := saveDeployConfig(conf, savedDeployConfigPath(conf.ClusterID)); err != nil {
+ return fmt.Errorf("save deploy config failed: %v", err)
+ }
+
+ ccfg := toClusterdeploymentConfig(conf)
+
+ cstatus, err := clusterdeployment.CreateCluster(ccfg)
+ if err != nil {
+ return err
+ }
+
+ if cstatus.FailureCnt > 0 {
+ // if partial success, just update config of cluster, remove failed nodes
+ var tmp []*HostConfig
+ for _, n := range conf.Masters {
+ if success, ok := cstatus.StatusOfNodes[n.Ip]; ok && !success {
+ continue
+ }
+ tmp = append(tmp, n)
+ }
+ conf.Masters = tmp
+
+ tmp = nil
+ for _, n := range conf.Workers {
+ if success, ok := cstatus.StatusOfNodes[n.Ip]; ok && !success {
+ continue
+ }
+ tmp = append(tmp, n)
+ }
+ conf.Workers = tmp
+
+ tmp = nil
+ for _, n := range conf.Etcds {
+ if success, ok := cstatus.StatusOfNodes[n.Ip]; ok && !success {
+ continue
+ }
+ tmp = append(tmp, n)
+ }
+ conf.Etcds = tmp
+
+ err = saveDeployConfig(conf, savedDeployConfigPath(conf.ClusterID))
+ if err != nil {
+ fmt.Printf("")
+ clusterdeployment.RemoveCluster(ccfg)
+ return fmt.Errorf("update config of cluster failed: %v", err)
+ }
+ fmt.Printf("update config of cluster: %s", conf.ClusterID)
+ }
+
+ fmt.Print(cstatus.Show())
+
+ if cstatus.Working {
+ fmt.Printf("To start using cluster: %s, you need following as a regular user:\n\n", ccfg.Name)
+ fmt.Printf("\texport KUBECONFIG=%s/admin.conf\n\n", api.GetClusterHomePath(ccfg.Name))
+ }
+
+ return err
}
func deployCluster(cmd *cobra.Command, args []string) error {
@@ -40,11 +96,7 @@ func deployCluster(cmd *cobra.Command, args []string) error {
// TODO: make sure config valid
- if err := saveDeployConfig(conf, savedDeployConfigPath(conf.ClusterID)); err != nil {
- return fmt.Errorf("save deploy config failed: %v", err)
- }
-
- if err := deploy(toClusterdeploymentConfig(conf)); err != nil {
+ if err := deploy(conf); err != nil {
return err
}
diff --git a/pkg/api/tools.go b/pkg/api/tools.go
index a73b9ea..2e4fb4c 100644
--- a/pkg/api/tools.go
+++ b/pkg/api/tools.go
@@ -7,6 +7,7 @@ import (
"github.com/sirupsen/logrus"
"isula.org/eggo/pkg/constants"
+ "k8s.io/apimachinery/pkg/util/json"
)
var (
@@ -80,3 +81,43 @@ func GetEtcdServers(ecc *EtcdClusterConfig) string {
func IsCleanupSchedule(schedule Schedule) bool {
return schedule == SchedulePreCleanup || schedule == SchedulePostCleanup
}
+
+func (hc HostConfig) DeepCopy() (*HostConfig, error) {
+ b, err := json.Marshal(hc)
+ if err != nil {
+ return nil, err
+ }
+ var result HostConfig
+ err = json.Unmarshal(b, &result)
+ return &result, err
+}
+
+func (cs *ClusterStatus) Show() string {
+ var sb strings.Builder
+ var fb strings.Builder
+
+ sb.WriteString("-------------------------------\n")
+ sb.WriteString("message: ")
+ sb.WriteString(cs.Message)
+ sb.WriteString("\nsummary: \n")
+ if cs.Working {
+ sb.WriteString(cs.ControlPlane)
+ sb.WriteString("\t\tsuccess")
+ sb.WriteString("\n")
+ }
+ for ip, ok := range cs.StatusOfNodes {
+ if ok {
+ sb.WriteString(ip)
+ sb.WriteString("\t\tsuccess")
+ sb.WriteString("\n")
+ } else {
+ fb.WriteString(ip)
+ fb.WriteString("\t\tfailure")
+ fb.WriteString("\n")
+ }
+ }
+ sb.WriteString(fb.String())
+ sb.WriteString("-------------------------------\n")
+
+ return sb.String()
+}
diff --git a/pkg/api/types.go b/pkg/api/types.go
index 71f4f14..86da853 100644
--- a/pkg/api/types.go
+++ b/pkg/api/types.go
@@ -215,7 +215,14 @@ type ClusterConfig struct {
}
type ClusterStatus struct {
+ Message string `json:"message"`
+ ControlPlane string `json:"controlplane"`
+ Working bool `json:"working"`
+ StatusOfNodes map[string]bool `json:"statusOfNodes"`
+ SuccessCnt uint32 `json:"successCnt"`
+ FailureCnt uint32 `json:"failureCnt"`
}
+
type InfrastructureAPI interface {
// TODO: should add other dependence cluster configurations
MachineInfraSetup(machine *HostConfig) error
diff --git a/pkg/clusterdeployment/binary/binary.go b/pkg/clusterdeployment/binary/binary.go
index cc6a6a0..dd260b0 100644
--- a/pkg/clusterdeployment/binary/binary.go
+++ b/pkg/clusterdeployment/binary/binary.go
@@ -533,7 +533,8 @@ func (bcp *BinaryClusterDeployment) PostNodeJoinHooks(node *api.HostConfig) erro
}
}
- if utils.IsType(roles, (api.Master & api.Worker)) {
+ // check whether the node is worker and master
+ if utils.IsType(roles, (api.Master | api.Worker)) {
if err := taintAndLabelNode(bcp.config.Name, node.Name); err != nil {
return err
}
diff --git a/pkg/clusterdeployment/binary/cleanupcluster/cleanupnode.go b/pkg/clusterdeployment/binary/cleanupcluster/cleanupnode.go
index 76cd7ce..da0488a 100644
--- a/pkg/clusterdeployment/binary/cleanupcluster/cleanupnode.go
+++ b/pkg/clusterdeployment/binary/cleanupcluster/cleanupnode.go
@@ -232,7 +232,6 @@ func execRemoveWorkerTask(conf *api.ClusterConfig, hostconfig *api.HostConfig) e
task.SetIgnoreErrorFlag(taskRemoveWorker)
if err := nodemanager.RunTaskOnNodes(taskRemoveWorker, []string{master}); err != nil {
- logrus.Errorf("run task for remove worker failed: %v", err)
return err
}
@@ -246,8 +245,7 @@ func CleanupNode(conf *api.ClusterConfig, hostconfig *api.HostConfig, delType ui
if utils.IsType(delType, api.Worker) {
if err := execRemoveWorkerTask(conf, hostconfig); err != nil {
- logrus.Errorf("remove workers failed: %v", err)
- return err
+ logrus.Warnf("ignore: remove workers failed: %v", err)
}
}
diff --git a/pkg/clusterdeployment/clusterdeploy.go b/pkg/clusterdeployment/clusterdeploy.go
index 016da2b..c4e0880 100644
--- a/pkg/clusterdeployment/clusterdeploy.go
+++ b/pkg/clusterdeployment/clusterdeploy.go
@@ -28,121 +28,215 @@ import (
"isula.org/eggo/pkg/utils/nodemanager"
)
-func doCreateCluster(handler api.ClusterDeploymentAPI, cc *api.ClusterConfig) error {
- var loadbalancer *api.HostConfig
- var controlPlane *api.HostConfig
- var joinNodes []*api.HostConfig
- var joinNodeIDs []string
+func splitNodes(nodes []*api.HostConfig) (*api.HostConfig, []*api.HostConfig, []*api.HostConfig, []string) {
+ var lb *api.HostConfig
+ var masters []*api.HostConfig
+ var workers []*api.HostConfig
var etcdNodes []string
- // Step1: setup infrastructure for all nodes in the cluster
- for _, n := range cc.Nodes {
- if err := handler.MachineInfraSetup(n); err != nil {
- return err
- }
+
+ for _, n := range nodes {
if utils.IsType(n.Type, api.LoadBalance) {
- loadbalancer = n
+ lb = n
}
if utils.IsType(n.Type, api.ETCD) {
etcdNodes = append(etcdNodes, n.Address)
}
+
+ if utils.IsType(n.Type, api.Master) {
+ masters = append(masters, n)
+ // node with master and worker, just put into masters
+ continue
+ }
+
if utils.IsType(n.Type, api.Worker) {
- joinNodes = append(joinNodes, n)
- joinNodeIDs = append(joinNodeIDs, n.Address)
+ workers = append(workers, n)
}
+ }
- if utils.IsType(n.Type, api.Master) {
- if controlPlane == nil {
- controlPlane = n
- } else {
- joinNodes = append(joinNodes, n)
- joinNodeIDs = append(joinNodeIDs, n.Address)
- }
+ return lb, masters, workers, etcdNodes
+}
+
+func doJoinNodeOfCluster(handler api.ClusterDeploymentAPI, cc *api.ClusterConfig, masters, workers []*api.HostConfig) ([]string, []*api.HostConfig, error) {
+ var joinedNodeIDs []string
+ var failedNodes []*api.HostConfig
+ for _, node := range workers {
+ if err := handler.ClusterNodeJoin(node); err != nil {
+ failedNodes = append(failedNodes, node)
+ continue
+ }
+ joinedNodeIDs = append(joinedNodeIDs, node.Address)
+ }
+ for _, node := range masters {
+ if err := handler.ClusterNodeJoin(node); err != nil {
+ failedNodes = append(failedNodes, node)
+ continue
+ }
+ joinedNodeIDs = append(joinedNodeIDs, node.Address)
+ }
+ // wait all nodes ready
+ if err := nodemanager.WaitNodesFinishWithProgress(joinedNodeIDs, time.Minute*5); err != nil {
+ tFailedNodes, successNodes := nodemanager.CheckNodesStatus(joinedNodeIDs)
+ // update joined and failed nodes
+ failedNodes = append(failedNodes, tFailedNodes...)
+ joinedNodeIDs = successNodes
+ if len(successNodes) == 0 {
+ return joinedNodeIDs, nil, err
+ }
+ logrus.Warnf("wait some node to complete join failed: %v", err)
+ }
+
+ return joinedNodeIDs, failedNodes, nil
+}
+
+func doCreateCluster(handler api.ClusterDeploymentAPI, cc *api.ClusterConfig, cstatus *api.ClusterStatus) ([]*api.HostConfig, error) {
+ loadbalancer, masters, workers, etcdNodes := splitNodes(cc.Nodes)
+
+ if len(masters) == 0 {
+ return nil, fmt.Errorf("no master found")
+ }
+ controlPlaneNode, err := masters[0].DeepCopy()
+ if err != nil {
+ return nil, err
+ }
+ cstatus.ControlPlane = controlPlaneNode.Address
+ masters = masters[1:]
+
+ // Step1: setup infrastructure for all nodes in the cluster
+ for _, n := range cc.Nodes {
+ if err = handler.MachineInfraSetup(n); err != nil {
+ return nil, err
}
}
// Step2: run precreate cluster hooks
- if err := handler.PreCreateClusterHooks(); err != nil {
- return err
+ if err = handler.PreCreateClusterHooks(); err != nil {
+ return nil, err
}
// Step3: setup etcd cluster
// wait infrastructure task success on nodes of etcd cluster
- if err := nodemanager.WaitNodesFinishWithProgress(etcdNodes, time.Minute*5); err != nil {
- return err
+ if err = nodemanager.WaitNodesFinishWithProgress(etcdNodes, time.Minute*5); err != nil {
+ return nil, err
}
- if err := handler.EtcdClusterSetup(); err != nil {
- return err
+ if err = handler.EtcdClusterSetup(); err != nil {
+ return nil, err
}
+
// Step4: setup loadbalance for cluster
- if err := handler.LoadBalancerSetup(loadbalancer); err != nil {
- return err
+ if err = handler.LoadBalancerSetup(loadbalancer); err != nil {
+ return nil, err
}
+
// Step5: setup control plane for cluster
- if err := handler.ClusterControlPlaneInit(controlPlane); err != nil {
- return err
+ if err = handler.ClusterControlPlaneInit(controlPlaneNode); err != nil {
+ return nil, err
}
// wait controlplane setup task success
- if err := nodemanager.WaitNodesFinish([]string{controlPlane.Address}, time.Minute*5); err != nil {
- return err
+ if err = nodemanager.WaitNodesFinish([]string{controlPlaneNode.Address}, time.Minute*5); err != nil {
+ return nil, err
}
-
- //Step6: setup left nodes for cluster
- for _, node := range joinNodes {
- if err := handler.ClusterNodeJoin(node); err != nil {
- return err
+ if utils.IsType(controlPlaneNode.Type, api.Worker) {
+ controlPlaneNode.Type = utils.ClearType(controlPlaneNode.Type, api.Master)
+ if err = handler.ClusterNodeJoin(controlPlaneNode); err != nil {
+ return nil, err
}
}
- // wait all nodes ready
- if err := nodemanager.WaitNodesFinishWithProgress(joinNodeIDs, time.Minute*5); err != nil {
- return err
+
+ //Step6: setup left nodes for cluster
+ joinedNodeIDs, failedNodes, err := doJoinNodeOfCluster(handler, cc, masters, workers)
+ if err != nil {
+ return nil, err
}
//Step7: setup addons for cluster
- if err := handler.AddonsSetup(); err != nil {
- return err
+ if err = handler.AddonsSetup(); err != nil {
+ return nil, err
}
// Step8: run postcreate cluster hooks
- if err := handler.PostCreateClusterHooks(); err != nil {
- return err
+ if err = handler.PostCreateClusterHooks(); err != nil {
+ return nil, err
}
- allNodes := utils.GetAllIPs(cc.Nodes)
- if err := nodemanager.WaitNodesFinishWithProgress(allNodes, time.Minute*5); err != nil {
- return err
+ if err = nodemanager.WaitNodesFinishWithProgress(append(joinedNodeIDs, controlPlaneNode.Address), time.Minute*5); err != nil {
+ return nil, err
}
- return nil
+ for _, sid := range joinedNodeIDs {
+ cstatus.StatusOfNodes[sid] = true
+ cstatus.SuccessCnt += 1
+ }
+ cstatus.Working = true
+
+ return failedNodes, nil
}
-func CreateCluster(cc *api.ClusterConfig) error {
+func rollbackFailedNoeds(handler api.ClusterDeploymentAPI, nodes []*api.HostConfig) {
+ if nodes == nil {
+ return
+ }
+ var rollIDs []string
+ for _, n := range nodes {
+ // do best to cleanup, if error, just ignore
+ handler.ClusterNodeCleanup(n, n.Type)
+ handler.MachineInfraDestroy(n)
+ rollIDs = append(rollIDs, n.Address)
+ }
+
+ if err := nodemanager.WaitNodesFinishWithProgress(rollIDs, time.Minute*5); err != nil {
+ logrus.Warnf("rollback failed: %v", err)
+ }
+}
+
+func CreateCluster(cc *api.ClusterConfig) (api.ClusterStatus, error) {
+ cstatus := api.ClusterStatus{
+ StatusOfNodes: make(map[string]bool),
+ }
if cc == nil {
- return fmt.Errorf("[cluster] cluster config is required")
+ return cstatus, fmt.Errorf("[cluster] cluster config is required")
}
creator, err := manager.GetClusterDeploymentDriver(cc.DeployDriver)
if err != nil {
logrus.Errorf("[cluster] get cluster deployment driver: %s failed: %v", cc.DeployDriver, err)
- return err
+ return cstatus, err
}
handler, err := creator(cc)
if err != nil {
logrus.Errorf("[cluster] create cluster deployment instance with driver: %s, failed: %v", cc.DeployDriver, err)
- return err
+ return cstatus, err
}
defer handler.Finish()
// prepare eggo config directory
if err := os.MkdirAll(api.GetClusterHomePath(cc.Name), 0750); err != nil {
- return err
+ return cstatus, err
}
- if err := doCreateCluster(handler, cc); err != nil {
- return err
+ failedNodes, err := doCreateCluster(handler, cc, &cstatus)
+ if err != nil {
+ logrus.Warnf("rollback cluster: %s", cc.Name)
+ doRemoveCluster(handler, cc)
+ cstatus.Message = err.Error()
+ return cstatus, err
+ }
+ // rollback failed nodes
+ rollbackFailedNoeds(handler, failedNodes)
+ // update status of cluster
+ if failedNodes != nil {
+ var failureIDs []string
+ for _, fid := range failedNodes {
+ failureIDs = append(failureIDs, fid.Address)
+ cstatus.StatusOfNodes[fid.Address] = false
+ cstatus.FailureCnt += 1
+ }
+ logrus.Warnf("[cluster] failed nodes: %v", failureIDs)
+ cstatus.Message = "partial success of create cluster"
+ return cstatus, nil
}
- logrus.Infof("[cluster] create cluster '%s' successed", cc.Name)
- return nil
+ cstatus.Message = "create cluster success"
+ return cstatus, nil
}
func doJoinNode(handler api.ClusterDeploymentAPI, cc *api.ClusterConfig, hostconfig *api.HostConfig) error {
diff --git a/pkg/utils/nodemanager/node.go b/pkg/utils/nodemanager/node.go
index d610e3b..a19d2e7 100644
--- a/pkg/utils/nodemanager/node.go
+++ b/pkg/utils/nodemanager/node.go
@@ -117,7 +117,7 @@ func (n *Node) updateNodeStatus(message string, status int) {
func (n *Node) PushTask(t task.Task) bool {
// only run ignore error tasks to cleanup node
- if n.status.HasError() && task.IsIgnoreError(t) {
+ if n.status.HasError() && !task.IsIgnoreError(t) {
logrus.Debugf("node finished with error: %v", n.status.Message)
return false
}
diff --git a/pkg/utils/nodemanager/nodemanager.go b/pkg/utils/nodemanager/nodemanager.go
index b135890..25f7d4e 100644
--- a/pkg/utils/nodemanager/nodemanager.go
+++ b/pkg/utils/nodemanager/nodemanager.go
@@ -37,6 +37,28 @@ var manager = &NodeManager{
nodes: make(map[string]*Node, 2),
}
+// return: host configs of nodes that failed, and IPs of nodes that finished successfully
+func CheckNodesStatus(checkNodes []string) ([]*api.HostConfig, []string) {
+ var failures []*api.HostConfig
+ var success []string
+ manager.lock.RLock()
+ defer manager.lock.RUnlock()
+ for _, cn := range checkNodes {
+ n, ok := manager.nodes[cn]
+ if !ok {
+ failures = append(failures, n.host)
+ continue
+ }
+ if n.GetStatus().HasError() {
+ failures = append(failures, n.host)
+ } else {
+ success = append(success, cn)
+ }
+ }
+
+ return failures, success
+}
+
func RegisterNode(hcf *api.HostConfig, r runner.Runner) error {
manager.lock.Lock()
defer manager.lock.Unlock()
@@ -227,7 +249,10 @@ outfor:
}
logrus.Infof("Tasks progress: %s", sb.String())
unfinishedNodes = nextUnfinished
- time.Sleep(time.Second)
+
+ // sleep time depends on the number of nodes still being waited on
+ st := len(unfinishedNodes) + 1
+ time.Sleep(time.Second * time.Duration(st))
}
}
diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go
index 72f8777..84ec0ea 100644
--- a/pkg/utils/utils.go
+++ b/pkg/utils/utils.go
@@ -54,6 +54,10 @@ func IsType(curType uint16, expectedType uint16) bool {
return (curType & expectedType) == expectedType
}
+func ClearType(curType uint16, clearType uint16) uint16 {
+ return (curType & ^clearType)
+}
+
func AddSudo(cmd string) string {
return "sudo -E /bin/sh -c \"" + cmd + "\""
}
--
2.25.1
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/cuiyili/eggo.git
git@gitee.com:cuiyili/eggo.git
cuiyili
eggo
eggo
master

搜索帮助