diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..f9dd468
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,74 @@
+#FROM telegraf:1.0-alpine
+FROM telegraf:1.0
+
+# Reset to root user to do some installs
+USER root
+
+# Install packages.
+# - ca-certificates is listed explicitly because --no-install-recommends
+#   would otherwise not pull it in for curl's HTTPS downloads below.
+# - net-tools (netstat) and procps (pkill, free, top) are called by
+#   bin/reload.sh and bin/sensor.sh, so do not rely on the base image for them.
+RUN apt-get update && apt-get -y install --no-install-recommends \
+    bash \
+    ca-certificates \
+    curl \
+    net-tools \
+    netcat-openbsd \
+    procps \
+    unzip \
+    && rm -rf /var/cache/apt/* /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# Add ContainerPilot and its configuration
+# Releases at https://github.com/joyent/containerpilot/releases
+ENV CONTAINERPILOT_VER=2.3.0 \
+    CONTAINERPILOT=file:///etc/containerpilot.json
+
+# Checksum lines use the canonical "HASH  FILE" layout (two spaces)
+RUN export CONTAINERPILOT_CHECKSUM=ec9dbedaca9f4a7a50762f50768cbc42879c7208 \
+    && curl --retry 7 --fail -Lso /tmp/containerpilot.tar.gz \
+         "https://github.com/joyent/containerpilot/releases/download/${CONTAINERPILOT_VER}/containerpilot-${CONTAINERPILOT_VER}.tar.gz" \
+    && echo "${CONTAINERPILOT_CHECKSUM}  /tmp/containerpilot.tar.gz" | sha1sum -c \
+    && tar zxf /tmp/containerpilot.tar.gz -C /usr/local/bin \
+    && rm /tmp/containerpilot.tar.gz
+
+# Install Consul
+# Releases at https://releases.hashicorp.com/consul
+RUN export CONSUL_VERSION=0.6.4 \
+    && export CONSUL_CHECKSUM=abdf0e1856292468e2c9971420d73b805e93888e006c76324ae39416edcf0627 \
+    && curl --retry 7 --fail -Lso /tmp/consul.zip "https://releases.hashicorp.com/consul/${CONSUL_VERSION}/consul_${CONSUL_VERSION}_linux_amd64.zip" \
+    && echo "${CONSUL_CHECKSUM}  /tmp/consul.zip" | sha256sum -c \
+    && unzip /tmp/consul.zip -d /usr/local/bin \
+    && rm /tmp/consul.zip \
+    && mkdir /config
+
+# Create empty directories for Consul config and data
+RUN mkdir -p /etc/consul \
+    && chown -R root /etc/consul \
+    && mkdir -p /var/lib/consul \
+    && chown -R root /var/lib/consul
+
+# Install Consul template
+# Releases at https://releases.hashicorp.com/consul-template/
+RUN export CONSUL_TEMPLATE_VERSION=0.14.0 \
+    && export CONSUL_TEMPLATE_CHECKSUM=7c70ea5f230a70c809333e75fdcff2f6f1e838f29cfb872e1420a63cdf7f3a78 \
+    && curl --retry 7 --fail -Lso /tmp/consul-template.zip "https://releases.hashicorp.com/consul-template/${CONSUL_TEMPLATE_VERSION}/consul-template_${CONSUL_TEMPLATE_VERSION}_linux_amd64.zip" \
+    && echo "${CONSUL_TEMPLATE_CHECKSUM}  /tmp/consul-template.zip" | sha256sum -c \
+    && unzip /tmp/consul-template.zip -d /usr/local/bin \
+    && rm /tmp/consul-template.zip
+
+# Add our helper/glue scripts and configuration for this specific app.
+# Copied after the downloads so that editing a script or config file does
+# not invalidate the cached Consul / consul-template layers.
+COPY bin /usr/local/bin
+COPY etc /etc
+
+# Reset entrypoint from base image
+ENTRYPOINT []
+
+# Run telegraf under ContainerPilot (as root, per the USER instruction above)
+CMD ["/usr/local/bin/containerpilot", \
+    "/entrypoint.sh", \
+    "telegraf", \
+    "-config", \
+    "/etc/telegraf.conf"]
diff --git a/README.md b/README.md
index fddc0ec..2cdad8b 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,94 @@
 # telegraf
 Work in progress, not stable, expect force pushes of this repo
+
+Containerized telegraf server, based on the official telegraf/1.0 Docker image, adding [ContainerPilot](https://www.joyent.com/containerpilot) to announce this container's telegraf service to a Service Discovery layer, such as Consul or etcd.
+
+### Usage
+Include this image in your Docker Compose project, query Consul for its IP address and use it in your configurations, easily done via [Consul-Template](https://github.com/hashicorp/consul-template). The default ContainerPilot configuration talks to Consul and assumes the IP address to access consul is passed to the container in an environment variable, $CONSUL (or via docker link consul)
+
+Configuration of telegraf is managed via ContainerPilot `preStart` or `onChange` handlers.
+
+Telegraf output is configured with InfluxDB output plugin.
By default telegraf is looking for InfluxDB container started in the same cluster, but it's possible to point Telegraf to a remote InfluxDB server by uncommenting and setting up the INFLUXDB_HOST variable in the _env.telegraf file + +Telegraf input sources are configured with the prometheus input plugin and represent a list of urls pointing to container-pilot telemetry endpoints (http://container-ip:9090/metrics). Input sources are reloaded automatically with the `onChange` event handler. + +### Configuration + +Please run setup.sh to generate the required _env file and configure the CONSUL env variable. + +You can also check the _env.telegraf file. By setting the INFLUXDB_HOST variable there you can point telegraf to an already running instance of InfluxDB (you have to remove the influxdb section from docker-compose.yml in this case). + +### Hello world example + +1. [Get a Joyent account](https://my.joyent.com/landing/signup/) and [add your SSH key](https://docs.joyent.com/public-cloud/getting-started). +1. Install the [Docker Toolbox](https://docs.docker.com/installation/mac/) (including `docker` and `docker-compose`) on your laptop or other environment, as well as the [Joyent Triton CLI](https://www.joyent.com/blog/introducing-the-triton-command-line-tool) (`triton` replaces our old `sdc-*` CLI tools). +1. [Configure Docker and Docker Compose for use with Joyent.](https://docs.joyent.com/public-cloud/api-access/docker) + +Check that everything is configured correctly by running `./setup.sh`. This will check that your environment is set up correctly and will create an `_env` file that includes injecting an environment variable for the Consul hostname into the Telegraf and Nginx containers so we can take advantage of [Triton Container Name Service (CNS)](https://www.joyent.com/blog/introducing-triton-container-name-service). 
+ +Start everything: + +```bash +docker-compose build +docker-compose up -d +``` +As a result we'll have 4 containers running: +- consul +- telegraf_nginx_1 - nginx web-server is used just for demo purposes to scale and provide telemetry +- influxdb - currently running locally, but it's possible to connect with existing influxdb server +- telegraf + +To verify telegraf container status you can check container log (there should be a list of records, which indicate attempts to join new input source): +```bash +docker logs telegraf 2>&1 | grep EventMemberJoin +``` +it should display a list of members (input sources) recently added. + +You can also check the list of input source urls for telemetry currently used by telegraf with the following command: +```bash +docker exec -i -t telegraf /bin/grep :9090 /etc/telegraf.conf +``` +the list of urls includes consul container (first one in output), telegraf container (localhost) and all other urls are nginx-container urls. +So you can check the number of urls in output, subtract 2 and it should give you a number of nginx containers + + +Let's scale up number of nginx containers to 3, wait for 15 seconds (give some time to telegraf to reconfigure itself) and check the number of input urls (or EventMemberJoin events in logs) +```bash +docker-compose scale nginx=3 +sleep 15 + +# check source urls +docker exec -i -t telegraf /bin/grep :9090 /etc/telegraf.conf + +# check logs +docker logs telegraf 2>&1 | grep EventMemberJoin +``` + +Let's scale down number of nginx containers to 1, wait for 15 seconds and check the number of input urls again: +```bash +docker-compose scale nginx=1 +sleep 15 + +# check source urls +docker exec -i -t telegraf /bin/grep :9090 /etc/telegraf.conf +``` + +Finally you can check actual result of telemetry aggregation (via telegraf) on InfluxDB server. 
+You have to open InfluxDB UI with the following command:
+```bash
+open "http://$(triton ip influxdb):8083/"
+```
+choose 'telegraf' database in dropdown located on the top-right corner, type and execute a query
+```
+SHOW MEASUREMENTS
+```
+there should be a record like 'nginx_connections_load' which represents data coming from nginx telemetry.
+And the following query should display a list of nginx specific telemetry records collected during last 5 minutes:
+```
+SELECT * FROM nginx_connections_load WHERE time > now() - 5m
+```
+
+At the end of test you can shut down containers with the following command:
+```
+docker-compose kill
+```
diff --git a/_env.telegraf b/_env.telegraf
new file mode 100644
index 0000000..f2ed4de
--- /dev/null
+++ b/_env.telegraf
@@ -0,0 +1,7 @@
+# InfluxDB settings
+# (uncomment to change default influxdb host/database to custom url)
+# INFLUXDB_HOST is a docker alias or real hostname. Keep comments on their own
+# line: not every Docker/Compose version strips inline comments from values.
+#INFLUXDB_HOST=influxdb
+#INFLUXDB_DATABASE=telegraf
+#INFLUXDB_DATA_ENGINE=tsm1
diff --git a/bin/reload.sh b/bin/reload.sh
new file mode 100755
index 0000000..2d5c77c
--- /dev/null
+++ b/bin/reload.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+SERVICE_NAME=${SERVICE_NAME:-telegraf}
+CONSUL=${CONSUL:-consul}
+
+# Render Telegraf configuration template using values from Consul,
+# but do not reload because Telegraf hasn't started yet
+preStart() {
+    # sleep 5 # give some time for other containerpilots to start before rendering config
+    consul-template \
+        -once \
+        -dedup \
+        -consul "${CONSUL}:8500" \
+        -template "/etc/telegraf.ctmpl:/etc/telegraf.conf"
+}
+
+# Render Telegraf configuration template using values from Consul,
+# then gracefully reload Telegraf
+onChange() {
+    consul-template \
+        -once \
+        -dedup \
+        -consul "${CONSUL}:8500" \
+        -template "/etc/telegraf.ctmpl:/etc/telegraf.conf:/usr/local/bin/reload.sh reloadConfig"
+}
+
+# Reload Telegraf by sending it SIGHUP.
+# Note: if we fire SIGHUP at Telegraf before it has had a chance to register
+# its signal handler, it will immediately exit. So wait until something is
+# listening on TCP port 8094, which should only be the case after the
+# signal handler is loaded.
+# Polls once per second and gives up after RELOAD_MAX_ATTEMPTS tries
+# (default 60) instead of spinning forever at full CPU.
+reloadConfig() {
+    local attempts=${RELOAD_MAX_ATTEMPTS:-60}
+    while [ "$attempts" -gt 0 ]
+    do
+        # -t limits the listing to TCP sockets and ':8094 ' anchors on the
+        # port column, so unrelated numbers containing 8094 cannot match
+        if netstat -lnt | grep -q ':8094 '; then
+            pkill -SIGHUP telegraf && return 0
+        fi
+        attempts=$((attempts - 1))
+        sleep 1
+    done
+    echo "reloadConfig: telegraf is not listening on port 8094, giving up" >&2
+    return 1
+}
+
+help() {
+    echo "Usage: ./reload.sh preStart  => first-run configuration for Telegraf"
+    echo "       ./reload.sh onChange  => [default] update Telegraf config on upstream changes"
+    echo "       ./reload.sh reloadConfig => reload Telegraf config on upstream changes"
+}
+
+# Dispatch: run the function named by the first argument. With no
+# argument, or when the command cannot be found (exit status 127),
+# fall back to onChange.
+# NOTE: $cmd is intentionally unquoted; an empty $cmd must expand to
+# nothing (status 0) rather than to an empty command name (status 127).
+until
+    cmd=$1
+    if [ -z "$cmd" ]; then
+        onChange
+    fi
+    shift 1
+    $cmd "$@"
+    [ "$?" -ne 127 ]
+do
+    onChange
+    exit
+done
diff --git a/bin/sensor.sh b/bin/sensor.sh
new file mode 100755
index 0000000..6c1d595
--- /dev/null
+++ b/bin/sensor.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+set -eo pipefail
+
+help() {
+    echo 'Uses cli tools free and top to determine current CPU and memory usage'
+    echo 'for the telemetry service.'
+}
+
+# memory usage in percent: used / total, taken from row 2 of `free -m`
+sys_memory() {
+    # awk oneliner to get memory usage
+    # free -m | awk 'NR==2{printf "Memory Usage: %s/%sMB (%.2f%%)\n", $3,$2,$3*100/$2 }'
+    # output:
+    # Memory Usage: 15804/15959MB (99.03%)
+    (>&2 echo "sys memory check fired")
+    # declare first, assign second: `local x=$(...)` would hide a failed
+    # pipeline from `set -e`
+    local memory
+    memory=$(free -m | awk 'NR==2{printf "%.2f", $3*100/$2 }')
+    echo "${memory}"
+}
+
+# cpu load: third-from-last field of the `top` line that contains "load"
+sys_cpu() {
+    # oneliner to display cpu load
+    # top -bn1 | grep load | awk '{printf "CPU Load: %.2f\n", $(NF-2)}'
+    (>&2 echo "sys cpu check fired")
+    local cpuload
+    cpuload=$(top -bn1 | grep load | awk '{printf "%.2f", $(NF-2)}')
+    echo "${cpuload}"
+}
+
+# Run the function named by the first argument; print help when none is given
+cmd=$1
+if [ -n "$cmd" ]; then
+    shift 1
+    "$cmd" "$@"
+    exit
+fi
+
+help
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..022d65a
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,78 @@
+# Telegraf autopilotpattern demo
+
+# Consul - start with a single host which will bootstrap the cluster.
+# In production we'll want to use an HA cluster.
+consul:
+    container_name: consul
+    image: progrium/consul:latest
+    restart: always
+    mem_limit: 128m
+    expose:
+      - 53
+      - 8300
+      - 8301
+      - 8302
+      - 8400
+      - 8500
+    ports:
+      - 8500 # expose only Consul's UI on the public IP
+    dns:
+      - 127.0.0.1
+    labels:
+      - triton.cns.services=consul
+    command: -server -bootstrap -ui-dir /ui
+
+# Telegraf should autodiscovery all autopilot containers,
+# collect telemetry from there, and send output to influxdb.
+telegraf:
+    container_name: telegraf
+    #image: autopilotpattern/telegraf
+    build: . # for now use local image build
+    mem_limit: 128m
+    expose:
+      - 8094 # necessary for healthcheck
+      - 9090 # so we can see telemetry
+    restart: always
+    links:
+      - consul:consul
+      - influxdb:influxdb
+    environment:
+      - CONSUL_AGENT=1
+      - CONSUL=consul
+      - INFLUXDB=influxdb
+    env_file:
+      - _env
+      - _env.telegraf
+    labels:
+      - triton.cns.services=telegraf
+
+# InfluxDB
+influxdb:
+    container_name: influxdb
+    image: influxdb:1.0 # pinned to match telegraf:1.0; the README relies on the :8083 admin UI
+    mem_limit: 512m
+    ports:
+      - "8083:8083"
+      - "8086:8086"
+      - "8090:8090"
+    env_file:
+      - '_env'
+
+# Ngix included for demonstration of input sources for telegraf
+# we can scale it and demonstrate auto capturing of nginx containers telemetry by telegraf
+nginx:
+    image: autopilotpattern/nginx
+    restart: always
+    mem_limit: 128m
+    ports:
+      - 80 # http port
+      - 9090 # so we can see telemetry
+    links:
+      - consul:consul
+    environment:
+      - CONSUL_AGENT=1
+      - CONSUL=consul
+      - BACKEND=consul # backend is required, so lets point to consul just for demo purposes
+    env_file: _env
+    labels:
+      - triton.cns.services=nginx
diff --git a/etc/containerpilot.json b/etc/containerpilot.json
new file mode 100644
index 0000000..154cbde
--- /dev/null
+++ b/etc/containerpilot.json
@@ -0,0 +1,50 @@
+{
+  "consul": "{{ if .CONSUL_AGENT }}localhost{{ else }}{{ .CONSUL }}{{ end }}:8500",
+  "preStart": "/usr/local/bin/reload.sh preStart",
+  "services": [
+    {
+      "name": "telegraf",
+      "port": 8094,
+      "health": "nc -vz localhost 8094",
"poll": 10, + "ttl": 25 + } + ], + "backends": [ + { + "name": "nginx", + "poll": 7, + "onChange": "/usr/local/bin/reload.sh onChange" + } + ], + "coprocesses": [{{ if .CONSUL_AGENT }} + { + "command": ["/usr/local/bin/consul", "agent", + "-data-dir=/var/lib/consul", + "-config-dir=/etc/consul", + "-rejoin", + "-retry-join", "{{ .CONSUL }}", + "-retry-max", "10", + "-retry-interval", "10s"], + "restarts": "unlimited" + }{{ end }}], + "telemetry": { + "port": 9090, + "sensors": [ + { + "name": "telegraf_sys_memory_percent", + "help": "percentage of memory used", + "type": "gauge", + "poll": 5, + "check": ["/usr/local/bin/sensor.sh", "sys_memory"] + }, + { + "name": "telegraf_sys_cpu_load", + "help": "cpu load", + "type": "gauge", + "poll": 5, + "check": ["/usr/local/bin/sensor.sh", "sys_cpu"] + } + ] + } +} diff --git a/etc/telegraf.conf b/etc/telegraf.conf new file mode 100644 index 0000000..52a5815 --- /dev/null +++ b/etc/telegraf.conf @@ -0,0 +1,215 @@ +# Telegraf Configuration + +# Global tags can be specified here in key="value" format. +[global_tags] + # dc = "us-east-1" # will tag all metrics with dc=us-east-1 + # rack = "1a" + ## Environment variables can be used as tags, and throughout the config file + # user = "$USER" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "10s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at + ## most metric_batch_size metrics. + metric_batch_size = 1000 + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. 
+ ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "10s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default, precision will be set to the same timestamp order as the + ## collection interval, with the maximum being 1s. + ## Precision will NOT be used for service inputs, such as logparser and statsd. + ## Valid values are "ns", "us" (or "µs"), "ms", "s". + precision = "" + ## Run telegraf in debug mode + debug = false + ## Run telegraf in quiet mode + quiet = false + ## Override default hostname, if empty use os.Hostname() + hostname = "" + ## If set to true, do no set the "host" tag in the telegraf agent. + omit_hostname = false + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Configuration for influxdb server to send metrics to +[[outputs.influxdb]] + ## The full HTTP or UDP endpoint URL for your InfluxDB instance. + ## Multiple urls can be specified as part of the same cluster, + ## this means that only ONE of the urls will be written to each interval. + # urls = ["udp://localhost:8089"] # UDP endpoint example + urls = ["http://influxdb:8086"] # required (default) + + ## The target database for metrics (telegraf will create it if not exists). + database = "telegraf" # required + + ## Retention policy to write to. 
Empty string writes to the default rp. + retention_policy = "" + ## Write consistency (clusters only), can be: "any", "one", "quorum", "all" + write_consistency = "any" + + ## Write timeout (for the InfluxDB client), formatted as a string. + ## If not provided, will default to 5s. 0s means no timeout (not recommended). + timeout = "5s" + # username = "telegraf" + # password = "metricsmetricsmetricsmetrics" + ## Set the user agent for HTTP POSTs (can be useful for log differentiation) + # user_agent = "telegraf" + ## Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes) + # udp_payload = 512 + + ## Optional SSL Config + # ssl_ca = "/etc/telegraf/ca.pem" + # ssl_cert = "/etc/telegraf/cert.pem" + # ssl_key = "/etc/telegraf/key.pem" + ## Use SSL but skip chain & host verification + # insecure_skip_verify = false + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Read metrics about cpu usage +[[inputs.cpu]] + ## Whether to report per-cpu stats or not + percpu = true + ## Whether to report total system cpu stats or not + totalcpu = true + ## Comment this line if you want the raw CPU time metrics + fielddrop = ["time_*"] + + +# Read metrics about disk usage by mount point +[[inputs.disk]] + ## By default, telegraf gather stats for all mountpoints. + ## Setting mountpoints will restrict the stats to the specified mountpoints. + # mount_points = ["/"] + + ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually + ## present on /run, /var/run, /dev/shm or /dev). + ignore_fs = ["tmpfs", "devtmpfs"] + + +# Read metrics about disk IO by device +[[inputs.diskio]] + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. 
+ # devices = ["sda", "sdb"] + ## Uncomment the following line if you need disk serial numbers. + # skip_serial_number = false + + +# Get kernel statistics from /proc/stat +[[inputs.kernel]] + # no configuration + + +# Read metrics about memory usage +[[inputs.mem]] + # no configuration + + +# Get the number of processes and group them by status +[[inputs.processes]] + # no configuration + + +# Read metrics about swap memory usage +[[inputs.swap]] + # no configuration + + +# Read metrics about system load & uptime +[[inputs.system]] + # no configuration + +# # Generic TCP listener +[[inputs.tcp_listener]] + ## Address and port to host TCP listener on + service_address = ":8094" +# +# ## Number of TCP messages allowed to queue up. Once filled, the +# ## TCP listener will start dropping packets. + allowed_pending_messages = 10000 +# +# ## Maximum number of concurrent TCP connections to allow + max_tcp_connections = 250 +# +# ## Data format to consume. +# ## Each data format has it's own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" + + +# # Generic UDP listener +[[inputs.udp_listener]] + ## Address and port to host UDP listener on + service_address = ":8092" +# +# ## Number of UDP messages allowed to queue up. Once filled, the +# ## UDP listener will start dropping packets. + allowed_pending_messages = 10000 +# +# ## Data format to consume. +# ## Each data format has it's own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" + + +# # Gather health check statuses from services registered in Consul +# [[inputs.consul]] +# ## Most of these values defaults to the one configured on a Consul's agent level. 
+# ## Optional Consul server address (default: "localhost") +# # address = "localhost" +# ## Optional URI scheme for the Consul server (default: "http") +# # scheme = "http" +# ## Optional ACL token used in every request (default: "") +# # token = "" +# ## Optional username used for request HTTP Basic Authentication (default: "") +# # username = "" +# ## Optional password used for HTTP Basic Authentication (default: "") +# # password = "" +# ## Optional data centre to query the health checks from (default: "") +# # datacentre = "" + +# # Read metrics from one or many prometheus clients +[[inputs.prometheus]] + ## An array of urls to scrape metrics from. + urls = ["http://localhost:9090/metrics"] # default + +# ## Use bearer token for authorization +# # bearer_token = /path/to/bearer/token +# +# ## Optional SSL Config +# # ssl_ca = /path/to/cafile +# # ssl_cert = /path/to/certfile +# # ssl_key = /path/to/keyfile +# ## Use SSL but skip chain & host verification +# # insecure_skip_verify = false diff --git a/etc/telegraf.ctmpl b/etc/telegraf.ctmpl new file mode 100644 index 0000000..c4d0fc3 --- /dev/null +++ b/etc/telegraf.ctmpl @@ -0,0 +1,229 @@ +# Telegraf Configuration + +# Global tags can be specified here in key="value" format. +[global_tags] + # dc = "us-east-1" # will tag all metrics with dc=us-east-1 + # rack = "1a" + ## Environment variables can be used as tags, and throughout the config file + # user = "$USER" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "10s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at + ## most metric_batch_size metrics. + metric_batch_size = 1000 + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. 
Oldest metrics + ## are dropped first when this buffer fills. + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "10s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default, precision will be set to the same timestamp order as the + ## collection interval, with the maximum being 1s. + ## Precision will NOT be used for service inputs, such as logparser and statsd. + ## Valid values are "ns", "us" (or "µs"), "ms", "s". + precision = "" + ## Run telegraf in debug mode + debug = false + ## Run telegraf in quiet mode + quiet = false + ## Override default hostname, if empty use os.Hostname() + hostname = "" + ## If set to true, do no set the "host" tag in the telegraf agent. + omit_hostname = false + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Configuration for influxdb server to send metrics to +[[outputs.influxdb]] + ## The full HTTP or UDP endpoint URL for your InfluxDB instance. + ## Multiple urls can be specified as part of the same cluster, + ## this means that only ONE of the urls will be written to each interval. 
+ # urls = ["udp://localhost:8089"] # UDP endpoint example + {{ $influxdbhost := env "INFLUXDB_HOST" }} + {{ if $influxdbhost }} + urls = ["http://{{ $influxdbhost }}:8086"] # required + {{ else }} + urls = ["http://influxdb:8086"] # required + {{ end }} + + ## The target database for metrics (telegraf will create it if not exists). + {{ $influxdbdatabase := env "INFLUXDB_DATABASE" }} + {{ if $influxdbdatabase }} + database = "{{ $influxdbdatabase }}" # required + {{else}} + database = "telegraf" # required (default) + {{ end }} + + ## Retention policy to write to. Empty string writes to the default rp. + retention_policy = "" + ## Write consistency (clusters only), can be: "any", "one", "quorum", "all" + write_consistency = "any" + + ## Write timeout (for the InfluxDB client), formatted as a string. + ## If not provided, will default to 5s. 0s means no timeout (not recommended). + timeout = "5s" + # username = "telegraf" + # password = "metricsmetricsmetricsmetrics" + ## Set the user agent for HTTP POSTs (can be useful for log differentiation) + # user_agent = "telegraf" + ## Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes) + # udp_payload = 512 + + ## Optional SSL Config + # ssl_ca = "/etc/telegraf/ca.pem" + # ssl_cert = "/etc/telegraf/cert.pem" + # ssl_key = "/etc/telegraf/key.pem" + ## Use SSL but skip chain & host verification + # insecure_skip_verify = false + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Read metrics about cpu usage +[[inputs.cpu]] + ## Whether to report per-cpu stats or not + percpu = true + ## Whether to report total system cpu stats or not + totalcpu = true + ## Comment this line if you want the raw CPU time metrics + fielddrop = ["time_*"] + + +# Read metrics about disk usage by mount point +[[inputs.disk]] + ## By default, telegraf gather stats for all mountpoints. 
+ ## Setting mountpoints will restrict the stats to the specified mountpoints. + # mount_points = ["/"] + + ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually + ## present on /run, /var/run, /dev/shm or /dev). + ignore_fs = ["tmpfs", "devtmpfs"] + + +# Read metrics about disk IO by device +[[inputs.diskio]] + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. + # devices = ["sda", "sdb"] + ## Uncomment the following line if you need disk serial numbers. + # skip_serial_number = false + + +# Get kernel statistics from /proc/stat +[[inputs.kernel]] + # no configuration + + +# Read metrics about memory usage +[[inputs.mem]] + # no configuration + + +# Get the number of processes and group them by status +[[inputs.processes]] + # no configuration + + +# Read metrics about swap memory usage +[[inputs.swap]] + # no configuration + + +# Read metrics about system load & uptime +[[inputs.system]] + # no configuration + +# # Generic TCP listener +[[inputs.tcp_listener]] + ## Address and port to host TCP listener on + service_address = ":8094" +# +# ## Number of TCP messages allowed to queue up. Once filled, the +# ## TCP listener will start dropping packets. + allowed_pending_messages = 10000 +# +# ## Maximum number of concurrent TCP connections to allow + max_tcp_connections = 250 +# +# ## Data format to consume. +# ## Each data format has it's own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" + + +# # Generic UDP listener +[[inputs.udp_listener]] + ## Address and port to host UDP listener on + service_address = ":8092" +# +# ## Number of UDP messages allowed to queue up. Once filled, the +# ## UDP listener will start dropping packets. + allowed_pending_messages = 10000 +# +# ## Data format to consume. 
+# ## Each data format has it's own unique set of configuration options, read
+# ## more about them here:
+# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
+  data_format = "influx"
+
+
+# # Gather health check statuses from services registered in Consul
+# [[inputs.consul]]
+#   ## Most of these values defaults to the one configured on a Consul's agent level.
+#   ## Optional Consul server address (default: "localhost")
+#   # address = "localhost"
+#   ## Optional URI scheme for the Consul server (default: "http")
+#   # scheme = "http"
+#   ## Optional ACL token used in every request (default: "")
+#   # token = ""
+#   ## Optional username used for request HTTP Basic Authentication (default: "")
+#   # username = ""
+#   ## Optional password used for HTTP Basic Authentication (default: "")
+#   # password = ""
+#   ## Optional data centre to query the health checks from (default: "")
+#   # datacentre = ""
+
+# # Read metrics from one or many prometheus clients
+[[inputs.prometheus]]
+  ## An array of urls to scrape metrics from.
+  {{ if service "containerpilot" }}
+  urls = [{{range $i, $e := service "containerpilot"}}{{if ne $i 0}}, {{end}}"http://{{$e.Address}}:{{$e.Port}}/metrics"{{end}}, "http://localhost:9090/metrics"]
+  {{ else }}
+  urls = ["http://localhost:9090/metrics"]
+  {{ end }}
+
+# ## Use bearer token for authorization
+# # bearer_token = /path/to/bearer/token
+#
+# ## Optional SSL Config
+# # ssl_ca = /path/to/cafile
+# # ssl_cert = /path/to/certfile
+# # ssl_key = /path/to/keyfile
+# ## Use SSL but skip chain & host verification
+# # insecure_skip_verify = false
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 0000000..a27107f
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+set -e -o pipefail
+
+help() {
+    echo 'Usage ./setup.sh [-f docker-compose.yml] [-p project] [-h]'
+    echo
+    echo 'Checks that your Triton and Docker environment is sane and configures'
+    echo 'an environment file to use.'
+    echo
+    echo 'Optional flags:'
+    echo '  -f   use this file as the docker-compose config file'
+    echo '  -p   use this name as the project prefix for docker-compose'
+    echo '  -h   show this help and exit'
+}
+
+
+# default values which can be overridden by -f or -p flags
+export COMPOSE_PROJECT_NAME=telegraf
+export COMPOSE_FILE=
+
+# give the docker remote api more time before timeout
+export COMPOSE_HTTP_TIMEOUT=300
+
+# populated by `check` function whenever we're using Triton
+TRITON_USER=
+TRITON_DC=
+TRITON_ACCOUNT=
+
+# ---------------------------------------------------
+# Helpers
+
+# Print $1 as a highlighted (reverse + bold) banner. tput failures are
+# ignored so that, under `set -e`, a missing or dumb terminal cannot abort
+# the script before the message itself is printed.
+_banner() {
+    echo
+    tput rev 2>/dev/null || true  # reverse
+    tput bold 2>/dev/null || true # bold
+    echo "$1"
+    tput sgr0 2>/dev/null || true # clear
+}
+
+# ---------------------------------------------------
+# Top-level commands
+
+
+# Check for correct configuration
+check() {
+
+    command -v docker >/dev/null 2>&1 || {
+        _banner 'Docker is required, but does not appear to be installed.'
+        echo 'See https://docs.joyent.com/public-cloud/api-access/docker'
+        exit 1
+    }
+    command -v json >/dev/null 2>&1 || {
+        _banner 'Error! JSON CLI tool is required, but does not appear to be installed.'
+        echo 'See https://apidocs.joyent.com/cloudapi/#getting-started'
+        exit 1
+    }
+
+    # if we're not testing on Triton, don't bother checking Triton config
+    if [ -n "${COMPOSE_FILE}" ]; then
+        exit 0
+    fi
+
+    command -v triton >/dev/null 2>&1 || {
+        _banner 'Error! Joyent Triton CLI is required, but does not appear to be installed.'
+        echo 'See https://www.joyent.com/blog/introducing-the-triton-command-line-tool'
+        exit 1
+    }
+
+    # make sure Docker client is pointed to the same place as the Triton client
+    local docker_user=$(docker info 2>&1 | awk -F": " '/SDCAccount:/{print $2}')
+    local docker_dc=$(echo "$DOCKER_HOST" | awk -F"/" '{print $3}' | awk -F'.' '{print $1}')
+    TRITON_USER=$(triton profile get | awk -F": " '/account:/{print $2}')
+    TRITON_DC=$(triton profile get | awk -F"/" '/url:/{print $3}' | awk -F'.' '{print $1}')
+    TRITON_ACCOUNT=$(triton account get | awk -F": " '/id:/{print $2}')
+    if [ ! "$docker_user" = "$TRITON_USER" ] || [ ! "$docker_dc" = "$TRITON_DC" ]; then
+        _banner 'Error! The Triton CLI configuration does not match the Docker CLI configuration.'
+        echo
+        echo "Docker user: ${docker_user}"
+        echo "Triton user: ${TRITON_USER}"
+        echo "Docker data center: ${docker_dc}"
+        echo "Triton data center: ${TRITON_DC}"
+        exit 1
+    fi
+
+    local triton_cns_enabled=$(triton account get | awk -F": " '/cns/{print $2}')
+    if [ ! "true" == "$triton_cns_enabled" ]; then
+        _banner 'Error! Triton CNS is required and not enabled.'
+        echo
+        exit 1
+    fi
+
+    # Replace any existing CONSUL= entry instead of blindly appending, so
+    # re-running setup does not pile up duplicate (possibly stale) lines,
+    # while any other settings already present in _env are kept.
+    touch _env
+    grep -v '^CONSUL=' _env > _env.tmp || true
+    echo "CONSUL=consul.svc.${TRITON_ACCOUNT}.${TRITON_DC}.cns.joyent.com" >> _env.tmp
+    mv _env.tmp _env
+}
+
+# ---------------------------------------------------
+# parse arguments
+
+while getopts "f:p:h" optchar; do
+    case "${optchar}" in
+        f) export COMPOSE_FILE=${OPTARG} ;;
+        p) export COMPOSE_PROJECT_NAME=${OPTARG} ;;
+        h) help; exit 0 ;;
+        *) help; exit 1 ;;
+    esac
+done
+shift $((OPTIND - 1))
+
+# Run the command named by the first remaining argument (a function
+# defined above); with no argument, run the default `check`.
+cmd=${1:-check}
+if [ $# -gt 0 ]; then
+    shift
+fi
+if declare -F "$cmd" >/dev/null; then
+    "$cmd" "$@"
+else
+    echo "Unknown command: ${cmd}" >&2
+    help
+    exit 1
+fi