在日常巡检过程中,无需登录服务器逐个查看:本脚本通过调用 Kubernetes API 获取指定命名空间下所有 Pod 的状态、运行时长和重启信息,并将结果发送到 Discord。

使用时将目标命名空间作为第一个命令行参数传入,并通过定时任务(例如 crontab)在每天 9 点执行本脚本即可。

脚本内容如下:
from kubernetes import client, config
from kubernetes.client.rest import ApiException
from datetime import datetime, timezone
import requests
import json
import sys
import pytz
# Daily pod inspection script.
#
# Lists the pods of each namespace given on the command line through the
# Kubernetes API, groups them into "never restarted" and "restarted" pods, and
# posts the resulting report to a Discord webhook.
#
# Usage: python <script> <namespace> [<namespace> ...]

# Pods whose name contains any of these substrings are left out of the report.
filtered_keywords = ["mysql", "redis", "memcached", "postgres", "backend"]

# Discord webhook URL that receives the report; replace with your own.
discord_webhook_url = ""  # FOR TEST

# Discord rejects webhook messages whose content is longer than this.
DISCORD_CONTENT_LIMIT = 2000

# Seconds to wait for Discord so an unattended run cannot hang forever.
REQUEST_TIMEOUT_SECONDS = 10

CODE_FENCE = "```"
SEPARATOR_LINE = "------------------------------------------------------\n"


def format_duration(start_time, now):
    """Return the time elapsed between *start_time* and *now* as text.

    Args:
        start_time: Timezone-aware datetime, or None when it is not known.
        now: Timezone-aware datetime to measure against.

    Returns:
        The ``str(timedelta)`` form with fractional seconds removed, or
        ``"Unknown"`` when *start_time* is None.
    """
    if start_time is None:
        return "Unknown"
    return str(now - start_time).split(".")[0]


def summarize_restarts(container_statuses):
    """Return ``(reason, restart_count)`` describing a pod's restarts.

    The reason and the count are always taken from the same container; when
    several containers restarted, the last one in the list wins.

    Args:
        container_statuses: Iterable of container status objects exposing
            ``restart_count`` and ``last_state.terminated.reason``, or None.

    Returns:
        ``("", 0)`` when no container restarted. Otherwise the termination
        reason (``"Unknown"`` when none is recorded) and the restart count.
    """
    reason, count = "", 0
    for status in container_statuses or []:
        if status.restart_count > 0:
            # A restarted container does not always carry a terminated state,
            # so every step of the attribute chain is treated as optional.
            terminated = getattr(status.last_state, "terminated", None)
            reason = getattr(terminated, "reason", None) or "Unknown"
            count = status.restart_count
    return reason, count


def build_report_lines(namespace, fetched_at, without_restart, with_restart):
    """Return the report as a list of newline-terminated lines.

    Args:
        namespace: Namespace the report is about.
        fetched_at: Already formatted timestamp shown in the header.
        without_restart: ``(name, phase, duration)`` tuples.
        with_restart: ``(name, phase, duration, reason, restart_count)`` tuples.
    """
    lines = ["环境: {0} 获取时间:{1}\n".format(namespace, fetched_at), "\n"]
    for pod_name, pod_status, pod_duration_str in without_restart:
        lines.append(
            "容器名称: {0} 当前状态: {1} 运行时长: {2}\n".format(
                pod_name, pod_status, pod_duration_str
            )
        )
    lines.append(SEPARATOR_LINE)
    for pod_name, pod_status, pod_duration_str, reason, restart_count in with_restart:
        lines.append(
            "容器名称: {0} 当前状态: {1} 运行时长: {2} 重启原因: {3} 重启次数:{4}\n".format(
                pod_name, pod_status, pod_duration_str, reason, restart_count
            )
        )
    return lines


def chunk_report(lines, limit=DISCORD_CONTENT_LIMIT):
    """Pack *lines* into code-fenced messages of at most *limit* characters.

    Lines are kept whole whenever they fit; a single line longer than the
    available space is split.

    Args:
        lines: Report lines, each normally ending in a newline.
        limit: Maximum length of one message, fences included.

    Returns:
        List of messages, each wrapped in a Markdown code fence.

    Raises:
        ValueError: If *limit* leaves no room for any text between the fences.
    """
    budget = limit - 2 * len(CODE_FENCE)
    if budget <= 0:
        raise ValueError("limit is too small to hold a fenced message")

    chunks = []
    current = ""
    for line in lines:
        for start in range(0, len(line), budget):
            piece = line[start:start + budget]
            if current and len(current) + len(piece) > budget:
                chunks.append(current)
                current = ""
            current += piece
    if current:
        chunks.append(current)
    return [CODE_FENCE + chunk + CODE_FENCE for chunk in chunks]


def send_to_discord(content):
    """POST one message to the Discord webhook and report the outcome.

    Returns:
        True when Discord answered with status 204, False otherwise.
    """
    payload = {"content": content}
    headers = {"Content-Type": "application/json"}
    try:
        response = requests.post(
            discord_webhook_url,
            data=json.dumps(payload),
            headers=headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException as e:
        print(f"Failed to send message to Discord: {e}")
        return False

    print(response.content)
    if response.status_code == 204:
        print("Message sent to Discord successfully")
        return True
    print(f"Failed to send message to Discord. Status code: {response.status_code}")
    return False


def main():
    """Inspect every namespace given on the command line and report it."""
    target_namespaces = sys.argv[1:]
    if not target_namespaces:
        print("Usage: {0} <namespace> [<namespace> ...]".format(sys.argv[0]))
        sys.exit(1)
    if not discord_webhook_url:
        # Fail before querying the cluster; the report could not be delivered.
        print("discord_webhook_url is empty; set it before running this script")
        sys.exit(1)

    # By default this reads ~/.kube/config (e.g. on the master node).
    config.load_kube_config()
    api_instance = client.CoreV1Api()
    cst_timezone = pytz.timezone("Asia/Shanghai")

    for target_namespace in target_namespaces:
        try:
            pods = api_instance.list_namespaced_pod(namespace=target_namespace).items
        except ApiException as e:
            print(f"Exception when calling CoreV1Api: {e}\n")
            continue

        # One reference time per namespace keeps all durations comparable.
        current_time = datetime.now(timezone.utc)
        # Fresh lists per namespace so reports never mix namespaces.
        containers_without_restart = []
        containers_with_restart = []

        for pod in pods:
            pod_name = pod.metadata.name
            if any(keyword in pod_name for keyword in filtered_keywords):
                continue

            pod_status = pod.status.phase
            pod_duration_str = format_duration(
                pod.metadata.creation_timestamp, current_time
            )
            pod_restart_reason, restart_count = summarize_restarts(
                pod.status.container_statuses
            )

            if restart_count > 0:
                containers_with_restart.append(
                    (pod_name, pod_status, pod_duration_str, pod_restart_reason, restart_count)
                )
            else:
                containers_without_restart.append(
                    (pod_name, pod_status, pod_duration_str)
                )

        fetched_at = datetime.now(cst_timezone).strftime("%Y-%m-%d %H:%M:%S") + " CST"
        report_lines = build_report_lines(
            target_namespace,
            fetched_at,
            containers_without_restart,
            containers_with_restart,
        )
        for message in chunk_report(report_lines):
            send_to_discord(message)


if __name__ == "__main__":
    main()