eval_context.go 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. // Copyright 2019 Yunion
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package alerting
  15. import (
  16. "context"
  17. "fmt"
  18. "sort"
  19. "strings"
  20. "time"
  21. "yunion.io/x/jsonutils"
  22. "yunion.io/x/log"
  23. "yunion.io/x/pkg/util/sets"
  24. "yunion.io/x/onecloud/pkg/apis/monitor"
  25. "yunion.io/x/onecloud/pkg/mcclient"
  26. "yunion.io/x/onecloud/pkg/mcclient/auth"
  27. modules "yunion.io/x/onecloud/pkg/mcclient/modules/identity"
  28. "yunion.io/x/onecloud/pkg/monitor/options"
  29. )
// EvalContext is the context object for an alert evaluation.
// It bundles the rule being evaluated, the outcome of the evaluation
// (matches, error, no-data flag) and the context/credential it runs with.
type EvalContext struct {
	// Firing selects the alerting state in getNewStateInternal when Error is nil,
	// and decides whether EvalMatches or AlertOkEvalMatches supplies the callback URL tag.
	Firing bool
	// IsTestRun is not read anywhere in this file.
	IsTestRun bool
	// IsDebug is not read anywhere in this file.
	IsDebug bool
	// EvalMatches holds the matches of this evaluation; GetEvalMatches filters
	// out the ones tagged with ALERT_RESOURCE_RECORD_SHIELD_KEY.
	EvalMatches []*monitor.EvalMatch
	// AlertOkEvalMatches holds the non-firing matches; GetRecoveredMatches
	// returns the subset whose IsRecovery flag is set.
	AlertOkEvalMatches []*monitor.EvalMatch
	// Logs is not read anywhere in this file.
	Logs []*monitor.ResultLogEntry
	// Error, when non-nil, makes the new state follow the rule's
	// ExecutionErrorState and is appended to the notification description.
	Error error
	// ConditionEvals is not read anywhere in this file.
	ConditionEvals string
	// StartTime is set to time.Now() by NewEvalContext.
	StartTime time.Time
	// EndTime is not set in this file; it is read by GetDurationMs and the
	// notification template.
	EndTime time.Time
	// Rule is the alert rule under evaluation.
	Rule *Rule
	// NoDataFound, when set (and not firing, no error), makes the new state
	// follow the rule's NoDataState.
	NoDataFound bool
	// PrevAlertState is initialised from Rule.State by NewEvalContext.
	PrevAlertState monitor.AlertStateType
	// Ctx is the context passed to NewEvalContext; it is used for the admin
	// session in GetCallbackURLPrefix.
	Ctx context.Context
	// UserCred is the credential passed to NewEvalContext.
	UserCred mcclient.TokenCredential
}
  48. // NewEvalContext is the EvalContext constructor.
  49. func NewEvalContext(alertCtx context.Context, userCred mcclient.TokenCredential, rule *Rule) *EvalContext {
  50. return &EvalContext{
  51. Ctx: alertCtx,
  52. UserCred: userCred,
  53. StartTime: time.Now(),
  54. Rule: rule,
  55. EvalMatches: make([]*monitor.EvalMatch, 0),
  56. AlertOkEvalMatches: make([]*monitor.EvalMatch, 0),
  57. PrevAlertState: rule.State,
  58. }
  59. }
// StateDescription contains visual information about the alert state.
type StateDescription struct {
	//Color string

	// Text is the human-readable label of the state, e.g. "OK" or "Alerting".
	Text string
	// Data is not populated by GetStateModel in this file.
	Data string
}
  66. // GetStateModel returns the `StateDescription` based on current state.
  67. func (c *EvalContext) GetStateModel() *StateDescription {
  68. switch c.Rule.State {
  69. case monitor.AlertStateOK:
  70. return &StateDescription{
  71. Text: "OK",
  72. }
  73. case monitor.AlertStateNoData:
  74. return &StateDescription{
  75. Text: "No Data",
  76. }
  77. case monitor.AlertStateAlerting:
  78. return &StateDescription{
  79. Text: "Alerting",
  80. }
  81. case monitor.AlertStateUnknown:
  82. return &StateDescription{
  83. Text: "Unknown",
  84. }
  85. default:
  86. panic(fmt.Sprintf("Unknown rule state %q for alert %s", c.Rule.State, c.Rule.Name))
  87. }
  88. }
  89. func (c *EvalContext) shouldUpdateAlertState() bool {
  90. return c.Rule.State != c.PrevAlertState || c.Rule.State == monitor.AlertStateAlerting
  91. }
  92. // GetDurationMs returns the duration of the alert evaluation.
  93. func (c *EvalContext) GetDurationMs() float64 {
  94. return float64(c.EndTime.Nanosecond()-c.StartTime.Nanosecond()) / float64(1000000)
  95. }
  96. func (c *EvalContext) GetRuleTitle() string {
  97. rule := c.Rule
  98. if rule.Title != "" {
  99. return rule.Title
  100. }
  101. return rule.Name
  102. }
  103. // GetNotificationTitle returns the title of the alert rule including alert state.
  104. func (c *EvalContext) GetNotificationTitle() string {
  105. return "[" + c.GetStateModel().Text + "] " + c.GetRuleTitle()
  106. }
  107. func (c *EvalContext) GetCallbackURLPrefix() string {
  108. config, err := modules.ServicesV3.GetSpecific(auth.GetAdminSession(c.Ctx, ""), "common", "config",
  109. jsonutils.NewDict())
  110. if err != nil {
  111. log.Errorf("GetCallbackURLPrefix err:%v", err)
  112. return ""
  113. }
  114. url, _ := config.GetString("config", "default", "api_server")
  115. defaultWebUri := "alertrecord"
  116. matchTag := map[string]string{}
  117. if c.Firing {
  118. matchTag = c.EvalMatches[0].Tags
  119. } else {
  120. matchTag = c.AlertOkEvalMatches[0].Tags
  121. }
  122. if uri, ok := matchTag["web_url"]; ok {
  123. defaultWebUri = uri
  124. }
  125. return fmt.Sprintf("%s/%s", url, defaultWebUri)
  126. }
  127. // GetNewState returns the new state from the alert rule evaluation.
  128. func (c *EvalContext) GetNewState() monitor.AlertStateType {
  129. ns := getNewStateInternal(c)
  130. if ns != monitor.AlertStateAlerting || c.Rule.For == 0 {
  131. return ns
  132. }
  133. since := time.Since(c.Rule.LastStateChange)
  134. if c.PrevAlertState == monitor.AlertStatePending && since/time.Second >= c.Rule.For {
  135. return monitor.AlertStateAlerting
  136. }
  137. if c.Rule.For != 0 {
  138. log.Errorf("ruleName:%s,since:%d,for:%d", c.Rule.Name, since/time.Second, c.Rule.For)
  139. }
  140. if ns == monitor.AlertStateAlerting && since/time.Second >= c.Rule.For {
  141. return monitor.AlertStateAlerting
  142. }
  143. return monitor.AlertStatePending
  144. }
  145. func getNewStateInternal(c *EvalContext) monitor.AlertStateType {
  146. if c.Error != nil {
  147. log.Errorf("Alert Rule Result Error, ruleId: %s, name: %s, error: %v, changing state to %v",
  148. c.Rule.Id,
  149. c.Rule.Name,
  150. c.Error,
  151. c.Rule.ExecutionErrorState.ToAlertState())
  152. if c.Rule.ExecutionErrorState == monitor.ExecutionErrorKeepState {
  153. return c.PrevAlertState
  154. }
  155. return c.Rule.ExecutionErrorState.ToAlertState()
  156. }
  157. if c.Firing {
  158. return monitor.AlertStateAlerting
  159. }
  160. if c.NoDataFound {
  161. log.Infof("Alert Rule returned no data, ruleId: %s, name: %s, changing state to %v",
  162. c.Rule.Id,
  163. c.Rule.Name,
  164. c.Rule.NoDataState.ToAlertState())
  165. if c.Rule.NoDataState == monitor.NoDataKeepState {
  166. return c.PrevAlertState
  167. }
  168. return c.Rule.NoDataState.ToAlertState()
  169. }
  170. return monitor.AlertStateOK
  171. }
  172. func (c *EvalContext) GetNotificationTemplateConfig(matches []*monitor.EvalMatch) monitor.NotificationTemplateConfig {
  173. desc := c.Rule.Message
  174. // 优先根据当前 matches 中的 Condition 生成触发条件描述,确保与本次告警/恢复的指标一致
  175. if len(matches) > 0 && matches[0] != nil && matches[0].Condition != "" {
  176. condSet := sets.NewString()
  177. conds := make([]string, 0, len(matches))
  178. for _, m := range matches {
  179. if m == nil || m.Condition == "" {
  180. continue
  181. }
  182. if condSet.Has(m.Condition) {
  183. continue
  184. }
  185. condSet.Insert(m.Condition)
  186. conds = append(conds, m.Condition)
  187. }
  188. if len(conds) > 0 {
  189. desc = strings.Join(conds, " ")
  190. log.Debugf("[GetNotificationTemplateConfig] rule=%s matches=%d desc from match conditions: %s", c.Rule.Name, len(matches), desc)
  191. }
  192. } else if len(c.Rule.TriggeredMessages) > 0 {
  193. // 兼容旧逻辑:如果没有按 match 填充 Condition,则退回到规则级 TriggeredMessages
  194. desc = strings.Join(c.Rule.TriggeredMessages, " ")
  195. log.Debugf("[GetNotificationTemplateConfig] rule=%s matches=%d desc from TriggeredMessages (fallback): %s", c.Rule.Name, len(matches), desc)
  196. }
  197. if c.Error != nil {
  198. if desc != "" {
  199. desc += "\n"
  200. }
  201. desc += "Error: " + c.Error.Error()
  202. }
  203. tz, _ := time.LoadLocation(options.Options.TimeZone)
  204. cfg := monitor.NotificationTemplateConfig{
  205. Title: c.GetNotificationTitle(),
  206. Name: c.Rule.Name,
  207. ResourceName: c.GetResourceNameOfMatches(matches),
  208. Matches: matches,
  209. MatchTags: make([]map[string]string, len(matches)),
  210. MatchTagsStr: make([]string, len(matches)),
  211. StartTime: c.StartTime.In(tz).Format("2006-01-02 15:04:05"),
  212. EndTime: c.EndTime.In(tz).Format("2006-01-02 15:04:05"),
  213. Description: desc,
  214. Reason: c.Rule.Reason,
  215. Level: c.Rule.Level,
  216. NoDataFound: c.NoDataFound,
  217. WebUrl: c.GetCallbackURLPrefix(),
  218. }
  219. // calculate match tags
  220. diffKeySets := make(map[string]sets.String)
  221. for i := range cfg.Matches {
  222. m := cfg.Matches[i]
  223. for mk, mv := range m.Tags {
  224. if _, ok := diffKeySets[mk]; !ok {
  225. diffKeySets[mk] = sets.NewString()
  226. }
  227. if sets.NewString("name", "host", "host_id", "ip", "host_id", "vm_id", "access_ip").Has(mk) {
  228. continue
  229. }
  230. diffKeySets[mk].Insert(mv)
  231. }
  232. }
  233. for i := range cfg.Matches {
  234. m := cfg.Matches[i]
  235. cfg.MatchTags[i] = make(map[string]string)
  236. for diffKey, s := range diffKeySets {
  237. if s.Len() > 1 {
  238. cfg.MatchTags[i][diffKey] = m.Tags[diffKey]
  239. }
  240. }
  241. cfg.MatchTagsStr[i] = jsonutils.Marshal(cfg.MatchTags[i]).String()
  242. }
  243. return cfg
  244. }
  245. func (c *EvalContext) GetEvalMatches() []*monitor.EvalMatch {
  246. ret := make([]*monitor.EvalMatch, 0)
  247. matches := c.EvalMatches
  248. for i, c := range matches {
  249. if _, ok := c.Tags[monitor.ALERT_RESOURCE_RECORD_SHIELD_KEY]; ok {
  250. continue
  251. }
  252. ret = append(ret, matches[i])
  253. }
  254. return ret
  255. }
  256. func (c *EvalContext) getTagsDesc(tags map[string]string) string {
  257. strs := make([]string, 0)
  258. for k, v := range tags {
  259. if v == "" {
  260. continue
  261. }
  262. strs = append(strs, k+"="+v)
  263. }
  264. sort.Strings(strs)
  265. ret := strings.Join(strs, ",")
  266. return "{" + ret + "}"
  267. }
  268. func (c *EvalContext) GetResourceNameOfMatches(matches []*monitor.EvalMatch) string {
  269. names := strings.Builder{}
  270. names.WriteString("\n")
  271. for i, match := range matches {
  272. if name, ok := match.Tags["name"]; ok && name != "" {
  273. names.WriteString(fmt.Sprintf("- %s.%s: %s", name, match.Metric, match.ValueStr))
  274. } else {
  275. names.WriteString(fmt.Sprintf("- %s%s: %s", match.Metric, c.getTagsDesc(match.Tags), match.ValueStr))
  276. }
  277. if i < len(matches)-1 {
  278. names.WriteString("\n")
  279. }
  280. }
  281. return names.String()
  282. }
  283. func (c *EvalContext) GetRecoveredMatches() []*monitor.EvalMatch {
  284. ret := make([]*monitor.EvalMatch, 0)
  285. for i := range c.AlertOkEvalMatches {
  286. m := c.AlertOkEvalMatches[i]
  287. if m.IsRecovery {
  288. ret = append(ret, m)
  289. }
  290. }
  291. return ret
  292. }
  293. func (c *EvalContext) HasRecoveredMatches() bool {
  294. return len(c.GetRecoveredMatches()) != 0
  295. }