engine.go 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. // Copyright 2019 Yunion
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package alerting
  15. import (
  16. "context"
  17. "runtime/debug"
  18. "time"
  19. "github.com/benbjohnson/clock"
  20. "golang.org/x/sync/errgroup"
  21. "golang.org/x/xerrors"
  22. "yunion.io/x/log"
  23. "yunion.io/x/onecloud/pkg/mcclient/auth"
  24. "yunion.io/x/onecloud/pkg/monitor/options"
  25. "yunion.io/x/onecloud/pkg/monitor/registry"
  26. )
// AlertEngine is the background process that
// schedules alert evaluations and makes sure notifications
// are sent.
type AlertEngine struct {
	execQueue     chan *Job     // buffered queue (cap 1000, see Init) the scheduler pushes due jobs onto each tick
	ticker        *Ticker       // tick source consumed by alertingTicker
	Scheduler     scheduler     // decides which rules are due on a given tick (Update/Tick)
	evalHandler   evalHandler   // evaluates a rule via Eval(evalContext)
	ruleReader    ruleReader    // fetch() supplies the current rule set for Scheduler.Update
	resultHandler resultHandler // handle(evalContext) dispatches the evaluation result/notifications
}
  38. func init() {
  39. registry.RegisterService(&AlertEngine{})
  40. }
  41. // IsDisabled returns true if the alerting service is disabled for this instance.
  42. func (e *AlertEngine) IsDisabled() bool {
  43. // TODO: read from config options
  44. return false
  45. }
  46. // Init initalizes the AlertingService.
  47. func (e *AlertEngine) Init() error {
  48. e.ticker = NewTicker(time.Now(), time.Second*0, clock.New())
  49. e.execQueue = make(chan *Job, 1000)
  50. e.Scheduler = newScheduler()
  51. e.evalHandler = NewEvalHandler()
  52. e.ruleReader = newRuleReader()
  53. e.resultHandler = newResultHandler()
  54. return nil
  55. }
  56. // Run starts the alerting service background process.
  57. func (e *AlertEngine) Run(ctx context.Context) error {
  58. alertGroup, ctx := errgroup.WithContext(ctx)
  59. alertGroup.Go(func() error { return e.alertingTicker(ctx) })
  60. alertGroup.Go(func() error { return e.runJobDispatcher(ctx) })
  61. err := alertGroup.Wait()
  62. return err
  63. }
  64. func (e *AlertEngine) alertingTicker(ctx context.Context) error {
  65. defer func() {
  66. if err := recover(); err != nil {
  67. log.Errorf("Scheduler panic: stopping alertingTicker, error: %v", err)
  68. debug.PrintStack()
  69. }
  70. }()
  71. tickIndex := 0
  72. for {
  73. select {
  74. case <-ctx.Done():
  75. return ctx.Err()
  76. case tick := <-e.ticker.C:
  77. // TEMP SOLUTION update rules ever tenth tick
  78. if tickIndex%10 == 0 {
  79. e.Scheduler.Update(e.ruleReader.fetch())
  80. }
  81. e.Scheduler.Tick(tick, e.execQueue)
  82. tickIndex++
  83. }
  84. }
  85. }
  86. func (e *AlertEngine) runJobDispatcher(ctx context.Context) error {
  87. dispatcherGroup, alertCtx := errgroup.WithContext(ctx)
  88. for {
  89. select {
  90. case <-ctx.Done():
  91. return dispatcherGroup.Wait()
  92. case job := <-e.execQueue:
  93. dispatcherGroup.Go(func() error { return e.processJobWithRetry(alertCtx, job) })
  94. }
  95. }
  96. }
var (
	// unfinishedWorkTimeout is how long processJobWithRetry waits for an
	// in-flight attempt to finish after its context is cancelled before
	// giving up on a graceful shutdown.
	unfinishedWorkTimeout = time.Second * 5
)
// processJobWithRetry drives the evaluation attempts for a single job.
// Attempt IDs flow over attemptChan: each received ID spawns one processJob
// goroutine, which either re-sends ID+1 (retry) or closes the channel (done).
// Every context.CancelFunc created along the way is collected on cancelChan
// so endJob can release them all.
func (e *AlertEngine) processJobWithRetry(ctx context.Context, job *Job) error {
	defer func() {
		if err := recover(); err != nil {
			log.Errorf("Alert panic, error: %v", err)
		}
	}()

	// Capacity: each attempt pushes up to 2 cancel funcs (eval + notify),
	// so AlertingMaxAttempts*2 guarantees sends never block.
	cancelChan := make(chan context.CancelFunc, options.Options.AlertingMaxAttempts*2)
	attemptChan := make(chan int, 1)

	// Initialize with first attemptID=1
	attemptChan <- 1
	job.SetRunning(true)

	for {
		select {
		case <-ctx.Done():
			// In case monitor server is cancel, let a chance to job processing
			// to finish gracefully - by waiting a timeout duration -
			unfinishedWorkTimer := time.NewTimer(unfinishedWorkTimeout)
			select {
			case <-unfinishedWorkTimer.C:
				// Grace period elapsed; end the job with the context error.
				return e.endJob(ctx.Err(), cancelChan, job)
			case <-attemptChan:
				// The in-flight attempt wrapped up (value or close) before
				// the timeout, so finish cleanly.
				return e.endJob(nil, cancelChan, job)
			}
		case attemptId, more := <-attemptChan:
			if !more {
				// attemptChan closed by processJob: evaluation finished,
				// either successfully or after exhausting retries.
				return e.endJob(nil, cancelChan, job)
			}
			go e.processJob(attemptId, attemptChan, cancelChan, job)
		}
	}
}
  131. func (e *AlertEngine) endJob(err error, cancelChan chan context.CancelFunc, job *Job) error {
  132. job.SetRunning(false)
  133. close(cancelChan)
  134. for cancelFn := range cancelChan {
  135. cancelFn()
  136. }
  137. return err
  138. }
// processJob performs a single evaluation attempt for job on a fresh
// goroutine. On evaluation error it re-sends attemptID+1 on attemptChan
// (until AlertingMaxAttempts); otherwise it runs the result handler and
// closes attemptChan to signal processJobWithRetry that the job is done.
// All CancelFuncs it creates are queued on cancelChan for endJob to release.
func (e *AlertEngine) processJob(attemptID int, attemptChan chan int, cancelChan chan context.CancelFunc, job *Job) {
	defer func() {
		if err := recover(); err != nil {
			log.Errorf("Alert Panic: error: %v", err)
		}
	}()

	// Per-attempt evaluation timeout; the cancel func is handed over via
	// cancelChan so endJob can clean it up.
	alertCtx, cancelFn := context.WithTimeout(context.Background(), time.Duration(options.Options.AlertingEvaluationTimeoutSeconds)*time.Second)
	cancelChan <- cancelFn
	// span := opentracing.StartSpan("alert execution")
	// alertCtx = opentracing.ContextWithSpan(alertCtx, span)

	evalContext := NewEvalContext(alertCtx, auth.AdminCredential(), job.Rule)
	evalContext.Ctx = alertCtx

	go func() {
		defer func() {
			if err := recover(); err != nil {
				log.Errorf("Alert panic, error: %v", err)
				debug.PrintStack()
				// ext.Error.Set(span, true)
				// span.LogFields(
				// tlog.Error(fmt.Errorf("%v", err)),
				// tlog.String("message", "failed to execute alert rule. panic was recovered."),
				//)
				//span.Finish()
				// Closing attemptChan tells processJobWithRetry the job is over.
				close(attemptChan)
			}
		}()

		e.evalHandler.Eval(evalContext)
		/*span.SetTag("alertId", evalContext.Rule.ID)
		span.SetTag("dashboardId", evalContext.Rule.DashboardID)
		span.SetTag("firing", evalContext.Firing)
		span.SetTag("nodatapoints", evalContext.NoDataFound)
		span.SetTag("attemptID", attemptID)*/

		if evalContext.Error != nil {
			/*ext.Error.Set(span, true)
			span.LogFields(
			tlog.Error(evalContext.Error),
			tlog.String("message", "alerting execution attempt failed"),
			)
			*/
			if attemptID < options.Options.AlertingMaxAttempts {
				// span.Finish(
				// NOTE(review): the message labels this value "alertId" but it
				// actually logs the attempt counter — confirm and fix the label.
				log.Warningf("Job Execution attempt triggered retry, timeMs: %v, alertId: %d",
					evalContext.GetDurationMs(), attemptID)
				// Schedule the next attempt.
				attemptChan <- (attemptID + 1)
				return
			}
			// Retry budget exhausted: give up and signal completion.
			log.Errorf("gt AlertingMaxAttempts, error: %v", evalContext.Error)
			close(attemptChan)
			return
		}

		// create new context with timeout for notifications
		resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), time.Duration(options.Options.AlertingNotificationTimeoutSeconds)*time.Second)
		cancelChan <- resultHandleCancelFn
		// override the context used for evaluation with a new context for notifications.
		// This makes it possible for notifiers to execute when datasources
		// don't respond within the timeout limit. We should rewrite this so notifications
		// don't reuse the evalContext and get its own context.
		evalContext.Ctx = resultHandleCtx
		evalContext.Rule.State = evalContext.GetNewState()
		if err := e.resultHandler.handle(evalContext); err != nil {
			if xerrors.Is(err, context.Canceled) {
				log.Warningf("Result handler returned context.Canceled")
			} else if xerrors.Is(err, context.DeadlineExceeded) {
				log.Warningf("Result handler returned context.DeadlineExceeded")
			} else {
				log.Errorf("Failed to handle result: %v", err)
			}
		}
		// span.Finish()
		log.Debugf("Job execution completed, timeMs: %v, alertId: %s, attemptId: %d", evalContext.GetDurationMs(), evalContext.Rule.Id, attemptID)
		close(attemptChan)
	}()
}