复制 // 告警规则
type AlertRule struct {
ID int `json:"id" gorm:"primaryKey"`
Name string `json:"name" gorm:"uniqueIndex"`
Description string `json:"description"`
MetricName string `json:"metric_name" gorm:"index"`
Operator string `json:"operator"` // >, <, >=, <=, ==, !=
Threshold float64 `json:"threshold"`
Duration int `json:"duration"` // 持续时间(秒)
Severity string `json:"severity"` // critical, warning, info
Enabled bool `json:"enabled" gorm:"default:true"`
// 通知配置
NotifyChannels []string `json:"notify_channels" gorm:"type:json"`
// 时间配置
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// 告警事件
type AlertEvent struct {
ID int `json:"id" gorm:"primaryKey"`
RuleID int `json:"rule_id" gorm:"index"`
RuleName string `json:"rule_name"`
MetricName string `json:"metric_name"`
MetricValue float64 `json:"metric_value"`
Threshold float64 `json:"threshold"`
Severity string `json:"severity"`
Status string `json:"status"` // firing, resolved
Message string `json:"message"`
// 时间信息
StartTime time.Time `json:"start_time"`
EndTime *time.Time `json:"end_time,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
// 关联
Rule *AlertRule `json:"rule,omitempty" gorm:"foreignKey:RuleID"`
}
// 告警管理器
type AlertManager struct {
rules map[int]*AlertRule
events map[int]*AlertEvent
collector *MetricsCollector
notifier *AlertNotifier
logger Logger
// 状态跟踪
ruleStates map[int]*RuleState
// 控制
stopChan chan struct{}
running bool
mutex sync.RWMutex
// 配置
checkInterval time.Duration
}
// 规则状态
type RuleState struct {
LastCheck time.Time
LastValue float64
AlertStart *time.Time
EventID *int
}
// 创建告警管理器
func NewAlertManager(collector *MetricsCollector, notifier *AlertNotifier, logger Logger) *AlertManager {
return &AlertManager{
rules: make(map[int]*AlertRule),
events: make(map[int]*AlertEvent),
collector: collector,
notifier: notifier,
logger: logger,
ruleStates: make(map[int]*RuleState),
stopChan: make(chan struct{}),
checkInterval: 30 * time.Second,
}
}
// 加载告警规则
func (am *AlertManager) LoadRules() error {
var rules []AlertRule
if err := common.DB.Where("enabled = ?", true).Find(&rules).Error; err != nil {
return err
}
am.mutex.Lock()
defer am.mutex.Unlock()
for _, rule := range rules {
am.rules[rule.ID] = &rule
if _, exists := am.ruleStates[rule.ID]; !exists {
am.ruleStates[rule.ID] = &RuleState{
LastCheck: time.Now(),
}
}
}
am.logger.Info("告警规则加载完成", Int("count", len(rules)))
return nil
}
// 启动告警检查
func (am *AlertManager) Start() {
am.mutex.Lock()
defer am.mutex.Unlock()
if am.running {
return
}
am.running = true
go am.checkLoop()
am.logger.Info("告警管理器已启动")
}
// 停止告警检查
func (am *AlertManager) Stop() {
am.mutex.Lock()
defer am.mutex.Unlock()
if !am.running {
return
}
am.running = false
close(am.stopChan)
am.logger.Info("告警管理器已停止")
}
// 检查循环
func (am *AlertManager) checkLoop() {
ticker := time.NewTicker(am.checkInterval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
am.checkRules()
case <-am.stopChan:
return
}
}
}
// 检查所有规则
func (am *AlertManager) checkRules() {
am.mutex.RLock()
rules := make(map[int]*AlertRule)
for k, v := range am.rules {
rules[k] = v
}
am.mutex.RUnlock()
for ruleID, rule := range rules {
am.checkRule(ruleID, rule)
}
}
// 检查单个规则
func (am *AlertManager) checkRule(ruleID int, rule *AlertRule) {
// 获取指标值
metric := am.collector.GetMetric(rule.MetricName)
if metric == nil {
am.logger.Warn("指标不存在", String("metric", rule.MetricName))
return
}
// 获取规则状态
am.mutex.Lock()
state, exists := am.ruleStates[ruleID]
if !exists {
state = &RuleState{LastCheck: time.Now()}
am.ruleStates[ruleID] = state
}
am.mutex.Unlock()
// 更新状态
state.LastCheck = time.Now()
state.LastValue = metric.Value
// 检查阈值
triggered := am.evaluateCondition(metric.Value, rule.Operator, rule.Threshold)
if triggered {
// 触发告警
if state.AlertStart == nil {
// 开始新的告警
now := time.Now()
state.AlertStart = &now
} else {
// 检查持续时间
duration := time.Since(*state.AlertStart)
if duration >= time.Duration(rule.Duration)*time.Second {
// 满足持续时间条件,触发告警
am.fireAlert(ruleID, rule, metric.Value, state)
}
}
} else {
// 未触发告警
if state.AlertStart != nil {
// 解决告警
am.resolveAlert(ruleID, rule, state)
state.AlertStart = nil
}
}
}
// 评估条件
func (am *AlertManager) evaluateCondition(value float64, operator string, threshold float64) bool {
switch operator {
case ">":
return value > threshold
case "<":
return value < threshold
case ">=":
return value >= threshold
case "<=":
return value <= threshold
case "==":
return value == threshold
case "!=":
return value != threshold
default:
return false
}
}
// 触发告警
func (am *AlertManager) fireAlert(ruleID int, rule *AlertRule, value float64, state *RuleState) {
// 检查是否已经有活跃的告警事件
if state.EventID != nil {
return
}
// 创建告警事件
event := &AlertEvent{
RuleID: ruleID,
RuleName: rule.Name,
MetricName: rule.MetricName,
MetricValue: value,
Threshold: rule.Threshold,
Severity: rule.Severity,
Status: "firing",
Message: fmt.Sprintf("指标 %s 当前值 %.2f %s 阈值 %.2f", rule.MetricName, value, rule.Operator, rule.Threshold),
StartTime: time.Now(),
CreatedAt: time.Now(),
UpdatedAt: time.Now(),
}
// 保存到数据库
if err := common.DB.Create(event).Error; err != nil {
am.logger.Error("保存告警事件失败", Error(err))
return
}
// 更新状态
state.EventID = &event.ID
// 缓存事件
am.mutex.Lock()
am.events[event.ID] = event
am.mutex.Unlock()
// 发送通知
am.notifier.SendAlert(event, rule.NotifyChannels)
am.logger.Warn("告警触发",
String("rule", rule.Name),
String("metric", rule.MetricName),
Float64("value", value),
Float64("threshold", rule.Threshold))
}
// 解决告警
func (am *AlertManager) resolveAlert(ruleID int, rule *AlertRule, state *RuleState) {
if state.EventID == nil {
return
}
// 更新告警事件
now := time.Now()
err := common.DB.Model(&AlertEvent{}).
Where("id = ?", *state.EventID).
Updates(map[string]interface{}{
"status": "resolved",
"end_time": &now,
"updated_at": now,
}).Error
if err != nil {
am.logger.Error("更新告警事件失败", Error(err))
return
}
// 获取事件详情
am.mutex.RLock()
event, exists := am.events[*state.EventID]
am.mutex.RUnlock()
if exists {
event.Status = "resolved"
event.EndTime = &now
event.UpdatedAt = now
// 发送解决通知
am.notifier.SendResolved(event, rule.NotifyChannels)
}
// 清理状态
state.EventID = nil
am.logger.Info("告警解决",
String("rule", rule.Name),
String("metric", rule.MetricName))
}
// 告警通知器
type AlertNotifier struct {
channels map[string]NotifyChannel
logger Logger
}
// 通知渠道接口
type NotifyChannel interface {
SendAlert(event *AlertEvent) error
SendResolved(event *AlertEvent) error
}
// 创建告警通知器
func NewAlertNotifier(logger Logger) *AlertNotifier {
return &AlertNotifier{
channels: make(map[string]NotifyChannel),
logger: logger,
}
}
// 注册通知渠道
func (an *AlertNotifier) RegisterChannel(name string, channel NotifyChannel) {
an.channels[name] = channel
}
// 发送告警通知
func (an *AlertNotifier) SendAlert(event *AlertEvent, channelNames []string) {
for _, name := range channelNames {
if channel, exists := an.channels[name]; exists {
go func(ch NotifyChannel, evt *AlertEvent) {
if err := ch.SendAlert(evt); err != nil {
an.logger.Error("发送告警通知失败",
String("channel", name),
Error(err))
}
}(channel, event)
}
}
}
// 发送解决通知
func (an *AlertNotifier) SendResolved(event *AlertEvent, channelNames []string) {
for _, name := range channelNames {
if channel, exists := an.channels[name]; exists {
go func(ch NotifyChannel, evt *AlertEvent) {
if err := ch.SendResolved(evt); err != nil {
an.logger.Error("发送解决通知失败",
String("channel", name),
Error(err))
}
}(channel, event)
}
}
}
// 邮件通知渠道
type EmailNotifier struct {
smtpHost string
smtpPort int
username string
password string
fromAddress string
toAddresses []string
}
// 创建邮件通知器
func NewEmailNotifier(smtpHost string, smtpPort int, username, password, fromAddress string, toAddresses []string) *EmailNotifier {
return &EmailNotifier{
smtpHost: smtpHost,
smtpPort: smtpPort,
username: username,
password: password,
fromAddress: fromAddress,
toAddresses: toAddresses,
}
}
// 发送告警邮件
func (en *EmailNotifier) SendAlert(event *AlertEvent) error {
subject := fmt.Sprintf("[%s] 告警: %s", strings.ToUpper(event.Severity), event.RuleName)
body := fmt.Sprintf(`
告警详情:
规则名称: %s
指标名称: %s
当前值: %.2f
阈值: %.2f
严重程度: %s
开始时间: %s
消息: %s
`,
event.RuleName,
event.MetricName,
event.MetricValue,
event.Threshold,
event.Severity,
event.StartTime.Format("2006-01-02 15:04:05"),
event.Message)
return en.sendEmail(subject, body)
}
// 发送解决邮件
func (en *EmailNotifier) SendResolved(event *AlertEvent) error {
subject := fmt.Sprintf("[RESOLVED] 告警解决: %s", event.RuleName)
duration := ""
if event.EndTime != nil {
duration = event.EndTime.Sub(event.StartTime).String()
}
body := fmt.Sprintf(`
告警已解决:
规则名称: %s
指标名称: %s
严重程度: %s
开始时间: %s
结束时间: %s
持续时间: %s
`,
event.RuleName,
event.MetricName,
event.Severity,
event.StartTime.Format("2006-01-02 15:04:05"),
event.EndTime.Format("2006-01-02 15:04:05"),
duration)
return en.sendEmail(subject, body)
}
// 发送邮件
func (en *EmailNotifier) sendEmail(subject, body string) error {
// 这里实现SMTP邮件发送逻辑
// 为了简化,这里只是模拟
fmt.Printf("发送邮件: %s\n%s\n", subject, body)
return nil
}
// Webhook通知渠道
type WebhookNotifier struct {
url string
timeout time.Duration
client *http.Client
}
// 创建Webhook通知器
func NewWebhookNotifier(url string, timeout time.Duration) *WebhookNotifier {
return &WebhookNotifier{
url: url,
timeout: timeout,
client: &http.Client{
Timeout: timeout,
},
}
}
// 发送告警Webhook
func (wn *WebhookNotifier) SendAlert(event *AlertEvent) error {
payload := map[string]interface{}{
"type": "alert",
"rule_name": event.RuleName,
"metric_name": event.MetricName,
"metric_value": event.MetricValue,
"threshold": event.Threshold,
"severity": event.Severity,
"status": event.Status,
"message": event.Message,
"start_time": event.StartTime,
}
return wn.sendWebhook(payload)
}
// 发送解决Webhook
func (wn *WebhookNotifier) SendResolved(event *AlertEvent) error {
payload := map[string]interface{}{
"type": "resolved",
"rule_name": event.RuleName,
"metric_name": event.MetricName,
"severity": event.Severity,
"status": event.Status,
"start_time": event.StartTime,
"end_time": event.EndTime,
}
return wn.sendWebhook(payload)
}
// 发送Webhook请求
func (wn *WebhookNotifier) sendWebhook(payload interface{}) error {
data, err := json.Marshal(payload)
if err != nil {
return err
}
req, err := http.NewRequest("POST", wn.url, bytes.NewBuffer(data))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
resp, err := wn.client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
return fmt.Errorf("webhook请求失败: %d", resp.StatusCode)
}
return nil
}