Skip to content

yl1664907302/bigagent

Repository files navigation

准备

依赖

MySQL 8.0

Redis 7.x

Go 1.23.4

建库与导库

# Create the database and switch to it
CREATE DATABASE bigagent;
USE bigagent;

# Import the SQL file from the server repository (mysql client command)
source bigagent.sql

功能总览

功能总览

Build

# Fetch the sources and enter the checkout: the go commands below must run
# inside the module directory created by the clone.
git clone https://gitee.com/yl166490/bigagentt.git
cd bigagentt
# Resolve and download the module dependencies
go mod tidy
# Target Linux. Note: `go env -w` persists this setting for later go
# invocations too, not just this build.
go env -w GOOS=linux
# Compile the agent binary
go build -o "bigagent"
# Make the binary executable
chmod 755 bigagent

配置文件修改

# Whether to enable guard mode
guarder: false
system:
  # Socket (ip:port) on which HTTP is exposed
  addr: 0.0.0.0:8010
  # IP on which gRPC is exposed
  grpc: 0.0.0.0
  # Port on which gRPC is exposed
  grpc_port: 5678
  # gRPC socket of bigagent-server; left without a value here (parses as null)
  grpc_server:
  # grpc_server: 192.168.0.83:8765
  # Log file path
  logfile: log.txt
  # Bearer token used for api and push
  serct: "123456"
  # Whether to enable the api service: 0 = disabled, 1 = enabled
  api: 0
  # Path of the osqueryd socket file
  # empath: "/var/osquery/osquery.em"
  # NOTE(review): the double-quoted escapes below decode to
  # \\.\\pipe\\osquery.em (every separator doubled) - confirm this is intended.
  empath: "\\\\.\\\\pipe\\\\osquery.em"

# Global type settings
global:
  # Host type (virtual, physical, hwy, aly)
  machineType: "virtual"
  # Environment the host belongs to (offline, prod)
  env: "offline"

# VeOps (维易) CMDB settings
# NOTE(review): key and secret are committed here in plain text; if they are
# real credentials, rotate them and publish placeholders instead.
veops:
  # address:
  address: "http://192.168.210.108:8000"
  key: "1afef87ae36a4942a14b16a82033fbcb"
  secret: "mQ3l7GTIB0@Ua4Rpqzx?!HfiWcusorD#"
  ciType: "guochu_auto_machine_v2"


# Map of kubeconfig paths for multiple k8s clusters: key is the cluster name,
# value is the file path.
k8s_configs:
  cpu-compute-01: kubeconfig1.yaml
  # No path is given for this cluster, so the value parses as null.
  gpu-compute-01:


# Per-cluster address map, keyed by the same cluster names used in k8s_configs
# (presumably the Prometheus endpoint queried for each cluster - confirm).
# Both clusters point at the same address here.
k8s_compute_prom_addr_map:
  cpu-compute-01: http://192.168.210.108:9090
  gpu-compute-01: http://192.168.210.108:9090

# MySQL connection settings
mysql_info:
  name: deploy # unrelated to the real database name; only an identifier used in our code to tell multiple databases apart
  addr: "root:123456@tcp(127.0.0.1:3306)/bigagent?charset=utf8&parseTime=True"
  max: 128 # maximum number of connections
  idel: 16 # idle connections
  debug: false # whether to print SQL; important when debugging concatenated SQL

# Top-level DingTalk notification settings.
im_ding_ding:
  # The access_token query parameter has no value in this sample.
  bot_api_addr: https://oapi.dingtalk.com/robot/send?access_token=
  title: "[bigagent守卫通知]"
  # Unquoted, so this entry parses as an integer rather than a string.
  atMobiles:
    - 15810947075

# NTP plugin settings.
plugin_ntp:
  enable: true
  check_interval_seconds: 60   # how often the check is triggered
  query_prom_time_out_seconds: 5 # Prometheus query timeout
  cordon_daily_limit: 1      # daily cordon safeguard: number of nodes
  # NOTE(review): the "> -1" threshold matches any node whose 24h average
  # offset is above -1 - confirm this is a demo value.
  check_ql: |-
    avg by (node)(avg_over_time(node_ntp_offset_seconds[24h])) > -1
  # Must be written so that it targets a single node
  # (the %s placeholder is presumably replaced with the node name - confirm).
  recovery_ql: |-
    node_ntp_stratum{node="%s"}==3
  enabled_clusters: # clusters on which this module is enabled
    # NOTE(review): `yes` is a boolean only under YAML 1.1 rules; the YAML 1.2
    # core schema reads it as the string "yes" - confirm against the loader.
    cpu-compute-01: yes
  im_ding_ding:
    bot_api_addr: https://oapi.dingtalk.com/robot/send?access_token=
    title: "[bigagent守卫通知| ntp模块]"
    atMobiles:
      - 15810947075

# Recovery settings (the notification title below names the node self-healing module).
recovery_conf:
  enable: true
  check_interval_seconds: 60   # how often the check is triggered
  query_prom_time_out_seconds: 5 # Prometheus query timeout
  # NOTE(review): meaning not shown here; presumably a look-back window in days - confirm.
  check_day_num: 5
  im_ding_ding:
    bot_api_addr: https://oapi.dingtalk.com/robot/send?access_token=
    title: "[bigagent守卫通知| node自愈模块]"
    atMobiles:
      - 15810947075


# Common module settings.
common_module:
  enable: true
  check_interval_seconds: 60   # how often the check is triggered
  query_prom_time_out_seconds: 5 # Prometheus query timeout


# Keyed by the same check names used in check_ql_map and recovery_ql_map.
cordon_daily_limit_map: # daily cordon safeguard: number of nodes
  file_system_read_only: 1
  arp_too_many: 1



# Check queries, one per check name.
check_ql_map:
  file_system_read_only: node_filesystem_readonly{mountpoint="/"}==1
  arp_too_many: node_arp_entries{device="ens33"} >2
#  Query: node_arp_entries{device="ens33"} > 2
#
#  Prometheus scans every time series that matches the label device="ens33".
#
#  For each time series it checks whether the current value is greater than 2.
#
#  If the condition holds, the series is returned and keeps its original value.
#
#  If it does not hold, the series is not returned (no 0 or false is shown).

# Recovery queries, one per check name. Each value is a literal block scalar
# with a %s placeholder in the node label (presumably filled with the node
# name - confirm).
recovery_ql_map:
  file_system_read_only: |-
    node_filesystem_readonly{mountpoint="/",node="%s"}>0
  arp_too_many: |-
    node_arp_entries{device="ens33",node="%s"} >2

  # NOTE(review): as indented, enabled_clusters and im_ding_ding are child keys
  # of recovery_ql_map, next to the two queries above. The notification title
  # names the common module, so they may have been meant for another parent -
  # confirm the intended nesting against the config loader.
  enabled_clusters: # clusters on which this module is enabled
    cpu-compute-01: yes
    #gpu-compute-01: yes
  im_ding_ding:
    bot_api_addr: https://oapi.dingtalk.com/robot/send?access_token=
    title: "[bigagent守卫通知| 通用模块]"
    atMobiles:
      - 15810947075


# Abnormal pod cleanup settings.
abnormal_pod_clean:
  enable: true
  check_interval_seconds: 60
  # NOTE(review): presumably the delay before a second confirmation check - confirm.
  double_check_sec_seconds: 5
  # Selectors: pods labelled as juicefs-mount whose phase is not Running.
  label_selector: "app.kubernetes.io/name=juicefs-mount"
  field_selector: "status.phase!=Running"
  enabled_clusters:
    cpu-compute-01: yes
  im_ding_ding:
    bot_api_addr: https://oapi.dingtalk.com/robot/send?access_token=
    title: "[bigagent守卫通知| 异常pod清理模块]"
    atMobiles:
      - 15810947075


# Node-down detection settings.
node_down:
  enable: true
  check_interval_seconds: 150
  query_prom_time_out_seconds: 20
  enabled_clusters:
    cpu-compute-01: yes
  # Query template with a %s placeholder in the node label; going by the key
  # name it is presumably used to resolve a node name to its IP - confirm.
  node_name_to_ip_ql: |-
    node_os_info{node="%s"}
  # List of check queries, each averaging over a 1d window.
  # NOTE(review): whether all or any of them must match is not shown here.
  check_qls:
    - avg_over_time(up{job="kubernetes-nodes-cadvisor"}[1d])==0
    - avg_over_time(kube_node_status_condition{condition="Ready",status="unknown"}[1d])==1
    - avg_over_time(up{job="kubernetes-nodes-kubelet"}[1d])==0
  im_ding_ding:
    bot_api_addr: https://oapi.dingtalk.com/robot/send?access_token=
    title: "[bigagent守卫通知| 节点宕机模块]"
    atMobiles:
      - 15810947075



# The settings below are auto-generated; do not modify them!

grpc_cmdb1_stand1:
grpc_cmdb1_stand1_token:
grpc_cmdb2_stand1:
grpc_cmdb2_stand1_token:
grpc_cmdb3_stand1:
grpc_cmdb3_stand1_token:
grpc_cmdb1_stand2:
grpc_cmdb1_stand2_token:
grpc_cmdb2_stand2:
grpc_cmdb2_stand2_token:
grpc_cmdb3_stand2:
grpc_cmdb3_stand2_token:
grpc_cmdb1_stand3:
grpc_cmdb1_stand3_token:
grpc_cmdb2_stand3:
grpc_cmdb2_stand3_token:
grpc_cmdb3_stand3:
grpc_cmdb3_stand3_token:
action_detail: '33'
collection_frequency: "10s"

Run

推荐后台启动方式:
# Start, restart and stop via -s; only the start form carries -d
# (presumably the daemon/background flag - confirm against the CLI help).
./bigagent  -d -s start
./bigagent   -s restart
./bigagent   -s stop

前台运行方式:
# Run with no flags, or start with an explicit file passed via -c
# (presumably the config file path - confirm against the CLI help).
./bigagent 
./bigagent -s start -c /path/config.yml

Contributing

如果你有好的意见或建议,欢迎给我们提 Issues 或 Pull Requests

Partners

@李泽建@叶凌

About

面向主机与Kubernetes的轻量级守护 Agent,以优雅姿态完成多端多协议推送、采集、检测、决策与自愈。

Resources

License

Stars

Watchers

Forks

Packages

 
 
 

Contributors