health_manager.go 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. // Copyright 2019 Yunion
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package host_health
  15. import (
  16. "context"
  17. "fmt"
  18. "io/ioutil"
  19. "path"
  20. "sync"
  21. "time"
  22. "yunion.io/x/log"
  23. "yunion.io/x/pkg/errors"
  24. api "yunion.io/x/onecloud/pkg/apis/compute"
  25. "yunion.io/x/onecloud/pkg/cloudcommon/etcd"
  26. common_options "yunion.io/x/onecloud/pkg/cloudcommon/options"
  27. "yunion.io/x/onecloud/pkg/cloudmon/misc"
  28. "yunion.io/x/onecloud/pkg/hostman/hostinfo/hostconsts"
  29. "yunion.io/x/onecloud/pkg/hostman/hostutils"
  30. "yunion.io/x/onecloud/pkg/hostman/options"
  31. modules "yunion.io/x/onecloud/pkg/mcclient/modules/compute"
  32. "yunion.io/x/onecloud/pkg/util/fileutils2"
  33. "yunion.io/x/onecloud/pkg/util/procutils"
  34. )
  35. type SHostHealthManager struct {
  36. cli *etcd.SEtcdClient
  37. timeout int
  38. requestExpend int
  39. hostId string
  40. status StatusManager
  41. masterNodesIps []string
  42. }
  43. type StatusManager struct {
  44. status string
  45. statusLock sync.Mutex
  46. }
  47. func (m *StatusManager) GetStatus() string {
  48. m.statusLock.Lock()
  49. defer m.statusLock.Unlock()
  50. return m.status
  51. }
  52. func (m *StatusManager) CheckAndSetStatus(status string) bool {
  53. m.statusLock.Lock()
  54. defer m.statusLock.Unlock()
  55. if status == m.status {
  56. return false
  57. }
  58. m.status = status
  59. return true
  60. }
  61. func (m *StatusManager) SetStatus(status string) {
  62. m.statusLock.Lock()
  63. defer m.statusLock.Unlock()
  64. m.status = status
  65. }
  66. var (
  67. manager *SHostHealthManager
  68. )
  69. func InitHostHealthManager(hostId string) (*SHostHealthManager, error) {
  70. if manager != nil {
  71. return manager, nil
  72. }
  73. var m = SHostHealthManager{}
  74. masterNodesIps, err := m.masterNodesInternalIps()
  75. if err != nil {
  76. return nil, err
  77. } else if len(masterNodesIps) == 0 {
  78. return nil, errors.Errorf("failed get k8s master nodes")
  79. }
  80. m.masterNodesIps = masterNodesIps
  81. var dialTimeout, requestTimeout = 3, 2
  82. cfg, err := NewEtcdOptions(
  83. &options.HostOptions.EtcdOptions,
  84. options.HostOptions.HostLeaseTimeout,
  85. dialTimeout, requestTimeout,
  86. )
  87. if err != nil {
  88. return nil, err
  89. }
  90. err = etcd.InitDefaultEtcdClient(cfg, m.OnKeepaliveFailure)
  91. if err != nil {
  92. return nil, errors.Wrap(err, "init default etcd client")
  93. }
  94. m.cli = etcd.Default()
  95. m.hostId = hostId
  96. m.requestExpend = requestTimeout
  97. m.timeout = options.HostOptions.HostHealthTimeout - options.HostOptions.HostLeaseTimeout
  98. if err := m.StartHealthCheck(); err != nil {
  99. return nil, err
  100. }
  101. log.Infof("put key %s success", m.GetKey())
  102. m.status.SetStatus(api.HOST_HEALTH_STATUS_RUNNING)
  103. manager = &m
  104. return manager, nil
  105. }
  106. func NewEtcdOptions(
  107. opt *common_options.EtcdOptions, leaseTimeout, dialTimeout, requestTimeout int,
  108. ) (*etcd.SEtcdOptions, error) {
  109. cfg, err := opt.GetEtcdTLSConfig()
  110. if err != nil {
  111. return nil, err
  112. }
  113. return &etcd.SEtcdOptions{
  114. EtcdEndpoint: opt.EtcdEndpoints,
  115. EtcdLeaseExpireSeconds: leaseTimeout,
  116. EtcdTimeoutSeconds: dialTimeout,
  117. EtcdRequestTimeoutSeconds: requestTimeout,
  118. EtcdEnabldSsl: opt.EtcdUseTLS,
  119. TLSConfig: cfg,
  120. }, nil
  121. }
  122. func (m *SHostHealthManager) StartHealthCheck() error {
  123. return m.cli.PutSession(context.Background(),
  124. m.GetKey(), api.HOST_HEALTH_STATUS_RUNNING,
  125. )
  126. }
  127. func (m *SHostHealthManager) GetKey() string {
  128. return fmt.Sprintf("%s/%s", api.HOST_HEALTH_PREFIX, m.hostId)
  129. }
  130. func (m *SHostHealthManager) OnKeepaliveFailure() {
  131. if !m.status.CheckAndSetStatus(api.HOST_HEALTH_STATUS_RECONNECTING) {
  132. log.Warningf("OnKeepaliveFailure status already %s", api.HOST_HEALTH_STATUS_RECONNECTING)
  133. return
  134. }
  135. m.status.SetStatus(api.HOST_HEALTH_STATUS_RECONNECTING)
  136. ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(m.timeout))
  137. defer cancel()
  138. err := m.cli.RestartSessionWithContext(ctx)
  139. if err == nil {
  140. if err := m.cli.PutSession(context.Background(),
  141. m.GetKey(), api.HOST_HEALTH_STATUS_RUNNING,
  142. ); err != nil {
  143. log.Errorf("put host key failed %s", err)
  144. } else {
  145. m.status.SetStatus(api.HOST_HEALTH_STATUS_RUNNING)
  146. log.Infof("etcd client restart session put %s success", m.GetKey())
  147. return
  148. }
  149. }
  150. log.Errorf("keep etcd lease failed: %s", err)
  151. if m.networkAvailable() {
  152. log.Infof("network is available, try reconnect")
  153. // may be etcd not work
  154. m.Reconnect()
  155. } else {
  156. log.Errorf("netwrok is unavailable, going to shutdown servers")
  157. m.status.SetStatus(api.HOST_HEALTH_STATUS_UNKNOWN)
  158. m.OnUnhealth()
  159. }
  160. }
  161. func (m *SHostHealthManager) networkAvailable() bool {
  162. res, err := misc.Ping(m.masterNodesIps, 3, 10, false)
  163. if err != nil {
  164. log.Errorf("failed ping master nodes %s", res)
  165. return true
  166. }
  167. for _, v := range res {
  168. if v.Loss() < 100 {
  169. return true
  170. }
  171. }
  172. return false
  173. }
  174. func (m *SHostHealthManager) masterNodesInternalIps() ([]string, error) {
  175. result, err := modules.Hosts.Get(hostutils.GetComputeSession(context.Background()), "k8s-master-node-ips", nil)
  176. if err != nil {
  177. return nil, err
  178. }
  179. ips := make([]string, 0)
  180. err = result.Unmarshal(&ips, "ips")
  181. if err != nil {
  182. return nil, errors.Wrap(err, "unmarshal master node ips")
  183. }
  184. return ips, nil
  185. }
  186. func (m *SHostHealthManager) OnUnhealth() {
  187. p := path.Join(options.HostOptions.ServersPath, hostconsts.HOST_HEALTH_FILENAME)
  188. if fileutils2.Exists(p) {
  189. if act, err := fileutils2.FileGetContents(p); err != nil {
  190. log.Errorf(" failed read file %s: %s", p, err)
  191. } else if act == hostconsts.SHUTDOWN_SERVERS {
  192. log.Errorf("Host unhealthy, going to shutdown servers")
  193. m.shutdownServers()
  194. }
  195. }
  196. // reconnect wait for network available
  197. m.Reconnect()
  198. }
  199. func (m *SHostHealthManager) Reconnect() {
  200. if m.cli.SessionLiving() {
  201. m.status.SetStatus(api.HOST_HEALTH_STATUS_RUNNING)
  202. return
  203. }
  204. idx := 0
  205. for {
  206. if err := m.doReconnect(); err != nil {
  207. log.Errorf("failed do_reconnect %s, reconnect after %d seconds", err, idx)
  208. time.Sleep(time.Duration(idx) * time.Second)
  209. if idx < 5 {
  210. idx += 1
  211. }
  212. continue
  213. }
  214. break
  215. }
  216. m.status.SetStatus(api.HOST_HEALTH_STATUS_RUNNING)
  217. }
  218. func (m *SHostHealthManager) doReconnect() error {
  219. ctx, cancel := context.WithTimeout(context.Background(), time.Second*10)
  220. defer cancel()
  221. if err := m.cli.RestartSessionWithContext(ctx); err != nil && !m.cli.SessionLiving() {
  222. return errors.Wrap(err, "RestartSessionWithContext")
  223. }
  224. log.Infof("restart ression success")
  225. // put session use client default timeout
  226. if err := m.cli.PutSession(context.Background(), m.GetKey(), api.HOST_HEALTH_STATUS_RUNNING); err != nil {
  227. return errors.Wrap(err, "PutSession")
  228. }
  229. log.Infof("put key %s success", m.GetKey())
  230. return nil
  231. }
  232. func (m *SHostHealthManager) shutdownServers() {
  233. files, err := ioutil.ReadDir(options.HostOptions.ServersPath)
  234. if err != nil {
  235. log.Errorf("failed walk dir %s: %s", options.HostOptions.ServersPath, err)
  236. return
  237. }
  238. for i := range files {
  239. if hostutils.IsGuestDir(files[i], options.HostOptions.ServersPath) {
  240. stopvm := path.Join(options.HostOptions.ServersPath, files[i].Name(), "stopvm")
  241. if fileutils2.Exists(stopvm) {
  242. log.Infof("start exec stopvm script for guest %s", files[i].Name())
  243. out, err := procutils.NewRemoteCommandAsFarAsPossible("bash", stopvm, "--force").Output()
  244. if err != nil {
  245. log.Errorf("failed exec stopvm script for guest %s: %s %s", files[i].Name(), out, err)
  246. }
  247. }
  248. }
  249. }
  250. }
  251. func GetHealthStatus() string {
  252. if manager == nil {
  253. return ""
  254. }
  255. return manager.status.GetStatus()
  256. }