上次部署完了 Prometheus,这次让它实现告警功能。
首先,配置 Alertmanager 的配置文件。
#当前目录
[root@master231 kube-prometheus-0.11.0]# pwd
/kubernetes/manifests/add-ons/kube-prometheus-0.11.0
# 配置 manifests/alertmanager-secret.yaml 文件(即 Alertmanager 的配置文件)
cat manifests/alertmanager-secret.yaml
# Secret that carries the Alertmanager configuration (alertmanager.yaml).
# It is referenced by the Alertmanager custom resource through
# `configSecret: alertmanager-main`.
apiVersion: v1
kind: Secret
metadata:
  labels:
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/instance: main
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 0.24.0
  name: alertmanager-main
  namespace: monitoring
stringData:
  alertmanager.yaml: |-
    "global":
      "resolve_timeout": "5m"
      "smtp_from": "xxx@qq.com"  # sender address
      "smtp_smarthost": "smtp.qq.com:465"
      "smtp_auth_username": "xxx@qq.com"  # SMTP login account
      # SMTP authorization code. Replace with your own value and never
      # commit or publish a real credential.
      "smtp_auth_password": "gyzgepedmtuthfba"
      "smtp_require_tls": false
      "smtp_hello": "qq.com"
    "route":
      "group_by":
      - "alertname"
      "group_wait": "5s"
      "group_interval": "5s"
      "repeat_interval": "5m"
      # Fallback receiver when no child route matches.
      "receiver": "sre_system"
      # Every child route sets `continue: true`, so an alert keeps being
      # evaluated against the following routes after a match.
      "routes":
      - "receiver": "zhiyang"
        "match_re":
          "job": "yinzhengjie_dba_exporter"
        "continue": true
      - "receiver": "wupeixin"
        "match_re":
          "job": "yinzhengjie_devops_exporter"
        "continue": true
      - "receiver": "wanglei"
        "match_re":
          "job": "oldboyedu-etcd-cluster"
        "continue": true
      - "receiver": "sre_system"
        "match_re":
          "job": ".*"
        "continue": true
    # Receiver names must be unique and must match the names used in the
    # routes above; otherwise Alertmanager refuses to load the config.
    "receivers":
    - "name": "zhiyang"
      "email_configs":
      - "to": "xxx@qq.com"  # destination address
        "send_resolved": true
        "headers":
          "Subject": "[WARN] LINUX报警邮件"
        # "Linux" is the name declared by `define` in linux.tmpl.
        "html": '{{ template "Linux" . }}'
    - "name": "wupeixin"
      "email_configs":
      - "to": "xxx@qq.com"  # destination address
        "send_resolved": true
        "headers":
          "Subject": "[WARN] LINUX报警邮件"
        "html": '{{ template "Linux" . }}'
    - "name": "wanglei"
      "email_configs":
      - "to": "xxx@qq.com"  # destination address
        "send_resolved": true
        "headers":
          "Subject": "[WARN] LINUX报警邮件"
        "html": '{{ template "Linux" . }}'
    - "name": "sre_system"
      "email_configs":
      - "to": "xxx@qq.com"  # destination address
        "send_resolved": true
        "headers":
          "Subject": "[WARN] LINUX报警邮件"
        "html": '{{ template "Linux" . }}'
    # Must point at the directory where the template ConfigMap is mounted
    # (mountPath of the Alertmanager resource).
    "templates":
    - "/linux/softwares/alertmanager/tmpl/*.tmpl"
type: Opaque

创建cm资源(模板) 模板文件
cat linux.tmpl
{{/*
  HTML e-mail body for Alertmanager notifications.
  Defines the named template "Linux"; it is rendered with the notification
  data as "." and switches its layout on .Status ("firing" vs. anything else).
*/}}
{{ define "Linux" }}
<!DOCTYPE html>
<html>
<head>
  <title>{{ if eq .Status "firing" }}🚨 告警触发{{ else }}✅ 告警恢复{{ end }}</title>
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  <style>
    @font-face {
      font-family: "EmojiFont";
      src: local("Apple Color Emoji"),
           local("Segoe UI Emoji"),
           local("Noto Color Emoji");
    }
    :root {
      --color-critical: #ff4444;
      --color-warning: #ffbb33;
      --color-resolved: #00c851;
      --color-info: #33b5e5;
    }
    body {
      font-family: 'Segoe UI', system-ui, sans-serif, "EmojiFont";
      line-height: 1.6;
      color: #333;
      max-width: 800px;
      margin: 20px auto;
      padding: 0 20px;
    }
    .header {
      text-align: center;
      padding: 30px;
      border-radius: 15px;
      margin-bottom: 30px;
      background: {{ if eq .Status "firing" }}#fff0f0{{ else }}#f0fff4{{ end }};
      border: 2px solid {{ if eq .Status "firing" }}var(--color-critical){{ else }}var(--color-resolved){{ end }};
    }
    .status-badge {
      padding: 8px 16px;
      border-radius: 20px;
      font-weight: bold;
      display: inline-block;
    }
    .alert-table {
      width: 100%;
      border-collapse: separate;
      border-spacing: 0;
      background: white;
      border-radius: 10px;
      overflow: hidden;
      box-shadow: 0 2px 6px rgba(0,0,0,0.1);
      margin: 20px 0;
    }
    .alert-table th {
      background: #f8f9fa;
      padding: 16px;
      text-align: left;
      width: 130px;
      border-right: 2px solid #e9ecef;
    }
    .alert-table td {
      padding: 16px;
      border-bottom: 1px solid #e9ecef;
    }
    .timeline {
      display: flex;
      justify-content: space-between;
      margin: 15px 0;
    }
    .timeline-item {
      flex: 1;
      text-align: center;
      padding: 10px;
      background: #f8f9fa;
      border-radius: 8px;
      margin: 0 5px;
    }
    .alert-image {
      text-align: center;
      margin: 30px 0;
    }
    .alert-image img {
      width: {{ if eq .Status "firing" }}140px{{ else }}100px{{ end }};
      opacity: 0.9;
      transition: all 0.3s ease;
    }
    .emoji {
      font-family: "EmojiFont", sans-serif;
      font-size: 1.3em;
    }
    .severity-critical { color: var(--color-critical); }
    .severity-warning { color: var(--color-warning); }
  </style>
</head>
<body>
  <div class="header">
    <h1>
      {{ if eq .Status "firing" }}
      <span class="emoji">🚨</span> 告警触发通知
      {{ else }}
      <span class="emoji">✅</span> 告警恢复通知
      {{ end }}
    </h1>
  </div>
  {{ if eq .Status "firing" }}
  <!-- Firing section -->
  <table class="alert-table">
    <tr>
      <th><span class="emoji">🚫</span> 告警名称</th>
      <td>{{ range .Alerts }}<span class="emoji">📣</span> {{ .Labels.alertname }}{{ end }}</td>
    </tr>
    <tr>
      <th><span class="emoji">⚠️</span> 严重等级</th>
      {{/* Use the label shared by the whole group: looping over .Alerts here
           would glue several values together (e.g. "warningwarning") and
           match none of the .severity-* CSS classes. */}}
      <td class="severity-{{ .CommonLabels.severity }}">
        {{ range .Alerts }}<span class="emoji">⏺</span> {{ .Labels.severity | toUpper }}{{ end }}
      </td>
    </tr>
    <tr>
      <th><span class="emoji">🕚</span> 触发时间</th>
      <td>{{ range .Alerts }}<span class="emoji">🕑</span> {{ .StartsAt.Format "2006-01-02 15:04:05" }}{{ end }}</td>
    </tr>
  </table>
  {{ else }}
  <!-- Resolved section -->
  <table class="alert-table">
    <tr>
      <th><span class="emoji">📣</span> 恢复告警</th>
      <td>{{ range .Alerts }}<span class="emoji">🔐</span> {{ .Labels.alertname }}{{ end }}</td>
    </tr>
    <tr>
      <th><span class="emoji">⏳</span> 持续时间</th>
      <td>
        {{ range .Alerts }}
        {{ .StartsAt.Format "15:04:05" }} - {{ .EndsAt.Format "15:04:05" }}
        {{/* Sub returns a time.Duration; convert it to float minutes before
             formatting with %.0f, otherwise printf emits a %!f(...) error. */}}
        ({{ (.EndsAt.Sub .StartsAt).Minutes | printf "%.0f" }} 分钟)
        {{ end }}
      </td>
    </tr>
    <tr>
      <th><span class="emoji">✅</span> 恢复时间</th>
      <td>{{ range .Alerts }}<span class="emoji">🕒</span> {{ .EndsAt.Format "2006-01-02 15:04:05" }}{{ end }}</td>
    </tr>
  </table>
  {{ end }}
  <!-- Information shown for both firing and resolved notifications -->
  <table class="alert-table">
    <tr>
      <th><span class="emoji">💻️</span> 实例信息</th>
      <td>{{ range .Alerts }}<span class="emoji">🏷</span> {{ .Labels.instance }}{{ end }}</td>
    </tr>
    <tr>
      <th><span class="emoji">📝</span> 告警详情</th>
      <td>{{ range .Alerts }}<span class="emoji">📌</span> {{ .Annotations.summary }}{{ end }}</td>
    </tr>
    <tr>
      <th><span class="emoji">📄</span> 详细描述</th>
      <td>{{ range .Alerts }}<span class="emoji">📑</span> {{ .Annotations.description }}{{ end }}</td>
    </tr>
  </table>
  <div class="alert-image">
    {{ if eq .Status "firing" }}
    <img src="https://img95.699pic.com/element/40114/9548.png_860.png" alt="告警图标">
    {{ else }}
    <img src="https://tse2-mm.cn.bing.net/th/id/OIP-C.n7AyZv_wWXqFCc1mtlGhFgHaHa?rs=1&pid=ImgDetMain" alt="恢复图标">
    {{ end }}
  </div>
  <div class="timeline">
    <div class="timeline-item">
      <div class="emoji">🚦 当前状态</div>
      {{/* Inside the range, .Status is the status of the individual alert. */}}
      {{ range .Alerts }}
      <strong>{{ if eq .Status "firing" }}<span class="emoji">🔥</span> FIRING{{ else }}<span class="emoji">✅</span> RESOLVED{{ end }}</strong>
      {{ end }}
    </div>
    <div class="timeline-item">
      <div class="emoji">📌 触发次数</div>
      <strong>{{ len .Alerts }} 次</strong>
    </div>
  </div>
</body>
</html>
{{ end }}

启用
# Create the ConfigMap that carries the e-mail template. The key must be
# linux.tmpl because the Alertmanager volume selects exactly that key.
kubectl create configmap cm-alertmanager -n monitoring --from-file=linux.tmpl=linux.tmpl

manifests/alertmanager-alertmanager.yaml 引用上面Secret资源(alertmanager配置文件)和cm资源(模板文件)
cat manifests/alertmanager-alertmanager.yaml
# Alertmanager custom resource handled by the Prometheus Operator.
# It wires in the configuration Secret and mounts the template ConfigMap.
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  labels:
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/instance: main
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 0.24.0
  name: main
  namespace: monitoring
spec:
  # Added line: name of the Secret that holds alertmanager.yaml.
  configSecret: alertmanager-main
  # Expose the linux.tmpl key of the cm-alertmanager ConfigMap as a file.
  volumes:
    - name: data
      configMap:
        name: cm-alertmanager
        items:
          - key: linux.tmpl
            path: linux.tmpl
  # The `templates` glob in alertmanager.yaml has to point at this directory.
  volumeMounts:
    - name: data
      mountPath: /linux/softwares/alertmanager/tmpl
  image: quay.io/prometheus/alertmanager:v0.24.0
  nodeSelector:
    kubernetes.io/os: linux
  podMetadata:
    labels:
      app.kubernetes.io/component: alert-router
      app.kubernetes.io/instance: main
      app.kubernetes.io/name: alertmanager
      app.kubernetes.io/part-of: kube-prometheus
      app.kubernetes.io/version: 0.24.0
  replicas: 3
  resources:
    limits:
      cpu: 100m
      memory: 100Mi
    requests:
      cpu: 4m
      memory: 100Mi
  securityContext:
    fsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
  serviceAccountName: alertmanager-main
  version: 0.24.0

配置告警规则:这里使用之前自定义的 goapi 指标进行告警
cat manifests/prometheus-prometheusRule.yaml
# PrometheusRule with a single custom alert on the
# yinzhengjie_application_login_api metric.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/instance: k8s
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 2.36.1
    prometheus: k8s
    role: alert-rules
  name: prometheus-k8s-prometheus-rules
  namespace: monitoring
spec:
  groups:
    - name: prometheus
      rules:
        # Add your own rules here.
        - alert: ApplicationLoginAPIHigh
          # Fires once the metric has stayed above 30 for 2 minutes.
          expr: yinzhengjie_application_login_api > 30
          for: 2m
          labels:
            severity: warning
            team: application
          annotations:
            summary: "应用登录API调用次数过高"
            # The threshold in the text is kept in sync with `expr` (30).
            description: "应用登录API调用次数当前值为 {{ $value }},超过阈值 30"

应用所有资源
# Apply every manifest in one go.
kubectl apply -f manifests/
# If that does not work, install the setup manifests first, wait until the
# CustomResourceDefinitions are Established, then apply the rest again.
kubectl apply --server-side -f manifests/setup
kubectl wait --for condition=Established --all CustomResourceDefinition --namespace=monitoring
kubectl apply -f manifests/

展示效果
