mysql8.0
redis7.x
golang1.23.4
#建库
CREATE DATABASE bigagent;
USE bigagent;
#导入server仓库中的sql
source bigagent.sql
git clone https://gitee.com/yl166490/bigagentt.git
go mod tidy
go env -w GOOS=linux
go build -o "bigagent"
chmod 755 bigagent

配置文件修改
# Whether to enable guarder (guard) mode
guarder: false
# Core agent settings. The entries are nested under `system`; written flush
# with it they parse as top-level keys and `system` itself becomes null.
# NOTE(review): the nesting is inferred from the key names — confirm against
# the struct the loader decodes into.
system:
  # Socket (host:port) the HTTP service is exposed on
  addr: 0.0.0.0:8010
  # IP the gRPC service is exposed on
  grpc: 0.0.0.0
  # Port the gRPC service is exposed on
  grpc_port: 5678
  # gRPC socket of bigagent-server
  grpc_server:
  # grpc_server: 192.168.0.83:8765
  # Log file path
  logfile: log.txt
  # Bearer token used by the API and by push
  serct: "123456"
  # Whether to enable the API service: 0 = disabled, 1 = enabled
  api: 0
  # Path of the osqueryd socket file
  # empath: "/var/osquery/osquery.em"
  # NOTE(review): inside double quotes every `\\` is one backslash, so this
  # decodes to `\\.\\pipe\\osquery.em` — confirm the doubled separators are
  # intended.
  empath: "\\\\.\\\\pipe\\\\osquery.em"
# Global type settings
global:
  # Host type (virtual, physical, hwy, aly)
  machineType: "virtual"
  # Environment the host belongs to (offline, prod)
  env: "offline"
# Veops (维易) CMDB settings
veops:
  # address:
  address: "http://192.168.210.108:8000"
  # NOTE(review): key and secret look like live credentials — confirm they are
  # placeholders before publishing this sample.
  key: "1afef87ae36a4942a14b16a82033fbcb"
  secret: "mQ3l7GTIB0@Ua4Rpqzx?!HfiWcusorD#"
  ciType: "guochu_auto_machine_v2"
# kubeconfig paths for multiple k8s clusters:
# key is the cluster name, value is the file path
k8s_configs:
  cpu-compute-01: kubeconfig1.yaml
  gpu-compute-01:
# Keyed by the same cluster names as k8s_configs.
# NOTE(review): presumably each cluster's Prometheus address — confirm.
k8s_compute_prom_addr_map:
  cpu-compute-01: http://192.168.210.108:9090
  gpu-compute-01: http://192.168.210.108:9090
# MySQL connection settings
mysql_info:
  name: deploy  # Unrelated to the real database name; only an in-code label to tell multiple databases apart
  addr: "root:123456@tcp(127.0.0.1:3306)/bigagent?charset=utf8&parseTime=True"
  max: 128  # Maximum number of connections
  idel: 16  # Idle connections
  debug: false  # Whether to print SQL; important when working with concatenated SQL
# Top-level DingTalk notification settings; the children must be nested so
# they do not collide with the per-module im_ding_ding sections.
im_ding_ding:
  bot_api_addr: https://oapi.dingtalk.com/robot/send?access_token=
  title: "[bigagent守卫通知]"
  atMobiles:
    - 15810947075
# NTP module. Everything below is nested under plugin_ntp; the block scalar
# bodies must be indented deeper than their keys or they parse as empty.
plugin_ntp:
  enable: true
  check_interval_seconds: 60  # How often the check is triggered
  query_prom_time_out_seconds: 5  # Prometheus query timeout
  cordon_daily_limit: 1  # Safeguard: number of nodes cordoned per day
  check_ql: |-
    avg by (node)(avg_over_time(node_ntp_offset_seconds[24h])) > -1
  # Must be written for a single node
  recovery_ql: |-
    node_ntp_stratum{node="%s"}==3
  # Clusters on which this module is enabled
  # NOTE(review): bare `yes` is a boolean only under YAML 1.1 rules — confirm the loader.
  enabled_clusters:
    cpu-compute-01: yes
  im_ding_ding:
    bot_api_addr: https://oapi.dingtalk.com/robot/send?access_token=
    title: "[bigagent守卫通知| ntp模块]"
    atMobiles:
      - 15810947075
# Node recovery settings.
# NOTE(review): written here as its own top-level section — confirm it is not
# meant to sit under another module.
recovery_conf:
  enable: true
  check_interval_seconds: 60  # How often the check is triggered
  query_prom_time_out_seconds: 5  # Prometheus query timeout
  check_day_num: 5
  im_ding_ding:
    bot_api_addr: https://oapi.dingtalk.com/robot/send?access_token=
    title: "[bigagent守卫通知| node自愈模块]"
    atMobiles:
      - 15810947075
# Common (generic) check module. The three *_map sections share the same
# check names as keys.
common_module:
  enable: true
  check_interval_seconds: 60  # How often the check is triggered
  query_prom_time_out_seconds: 5  # Prometheus query timeout
  cordon_daily_limit_map:  # Safeguard: number of nodes cordoned per day
    file_system_read_only: 1
    arp_too_many: 1
  check_ql_map:
    file_system_read_only: node_filesystem_readonly{mountpoint="/"}==1
    arp_too_many: node_arp_entries{device="ens33"} >2
    # Query: node_arp_entries{device="ens33"} > 2
    #
    # Prometheus scans every time series matching the label device="ens33".
    #
    # For each series it checks whether the current value is greater than 2.
    #
    # If the condition holds, the series is returned with its original value.
    #
    # If it does not hold, the series is not returned at all (no 0 or false).
  recovery_ql_map:
    file_system_read_only: |-
      node_filesystem_readonly{mountpoint="/",node="%s"}>0
    arp_too_many: |-
      node_arp_entries{device="ens33",node="%s"} >2
  # Clusters on which this module is enabled
  # NOTE(review): bare `yes` is a boolean only under YAML 1.1 rules — confirm the loader.
  enabled_clusters:
    cpu-compute-01: yes
    # gpu-compute-01: yes
  im_ding_ding:
    bot_api_addr: https://oapi.dingtalk.com/robot/send?access_token=
    title: "[bigagent守卫通知| 通用模块]"
    atMobiles:
      - 15810947075
# Abnormal-pod cleanup module: pods are selected by the label and field
# selectors below.
abnormal_pod_clean:
  enable: true
  check_interval_seconds: 60
  double_check_sec_seconds: 5
  label_selector: "app.kubernetes.io/name=juicefs-mount"
  field_selector: "status.phase!=Running"
  # NOTE(review): bare `yes` is a boolean only under YAML 1.1 rules — confirm the loader.
  enabled_clusters:
    cpu-compute-01: yes
  im_ding_ding:
    bot_api_addr: https://oapi.dingtalk.com/robot/send?access_token=
    title: "[bigagent守卫通知| 异常pod清理模块]"
    atMobiles:
      - 15810947075
# Node-down module. check_qls is a list of queries; node_name_to_ip_ql takes
# the node name through the %s placeholder.
node_down:
  enable: true
  check_interval_seconds: 150
  query_prom_time_out_seconds: 20
  # NOTE(review): bare `yes` is a boolean only under YAML 1.1 rules — confirm the loader.
  enabled_clusters:
    cpu-compute-01: yes
  node_name_to_ip_ql: |-
    node_os_info{node="%s"}
  check_qls:
    - avg_over_time(up{job="kubernetes-nodes-cadvisor"}[1d])==0
    - avg_over_time(kube_node_status_condition{condition="Ready",status="unknown"}[1d])==1
    - avg_over_time(up{job="kubernetes-nodes-kubelet"}[1d])==0
  im_ding_ding:
    bot_api_addr: https://oapi.dingtalk.com/robot/send?access_token=
    title: "[bigagent守卫通知| 节点宕机模块]"
    atMobiles:
      - 15810947075
# The configuration below is generated automatically — do not modify it!!!
grpc_cmdb1_stand1:
grpc_cmdb1_stand1_token:
grpc_cmdb2_stand1:
grpc_cmdb2_stand1_token:
grpc_cmdb3_stand1:
grpc_cmdb3_stand1_token:
grpc_cmdb1_stand2:
grpc_cmdb1_stand2_token:
grpc_cmdb2_stand2:
grpc_cmdb2_stand2_token:
grpc_cmdb3_stand2:
grpc_cmdb3_stand2_token:
grpc_cmdb1_stand3:
grpc_cmdb1_stand3_token:
grpc_cmdb2_stand3:
grpc_cmdb2_stand3_token:
grpc_cmdb3_stand3:
grpc_cmdb3_stand3_token:
action_detail: '33'
collection_frequency: "10s"
推荐后台启动方式:
./bigagent -d -s start
./bigagent -s restart
./bigagent -s stop
前台运行方式:
./bigagent
./bigagent -s start -c /path/config.yml

如果你有好的意见或建议,欢迎给我们提 Issues 或 Pull Requests
