diff --git a/README.md b/README.md index cd50b79e..7d678e4a 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,9 @@ global: max_idle_connections: 3 # Maximum amount of time a connection may be reused to any one target. Infinite by default. max_connection_lifetime: 10m + # Expose per-query `query_duration_seconds` and `query_rows_returned` gauges, labelled with the + # `query` name (and `target` in multi-target mode). Off by default to keep the metric surface stable. + enable_query_metrics: false # The target to monitor and the list of collectors to execute on it. target: @@ -217,6 +220,23 @@ Kubernetes-native ServiceMonitor automatically configures Prometheus for HTTPS s ## Miscellaneous +
+Per-query observability metrics + +When `global.enable_query_metrics` is set to `true`, every scrape emits two additional gauges per query +in the configuration: + +- `query_duration_seconds{query="<query_name>"}` — wall-clock time the query took during the most + recent scrape, including row scanning. Emitted even when the query errors, so spikes preceding a + failure remain visible. +- `query_rows_returned{query="<query_name>"}` — number of rows the database returned during the most + recent scrape. Errored or skipped rows are not counted. + +Both metrics inherit the same constant labels as `up` / `scrape_duration_seconds` (notably `target` in +multi-target / jobs mode), so they can be aggregated by target the same way. The feature is off by +default to keep the existing metric surface unchanged. +
+
Handling NULL values diff --git a/collector.go b/collector.go index 4668c85f..0ab7bbac 100644 --- a/collector.go +++ b/collector.go @@ -31,7 +31,7 @@ type collector struct { // NewCollector returns a new Collector with the given configuration and database. The metrics it creates will all have // the provided const labels applied. -func NewCollector(logContext string, cc *config.CollectorConfig, constLabels []*dto.LabelPair) (Collector, errors.WithContext) { +func NewCollector(logContext string, cc *config.CollectorConfig, constLabels []*dto.LabelPair, enableQueryMetrics bool) (Collector, errors.WithContext) { logContext = TrimMissingCtx(fmt.Sprintf(`%s,collector=%s`, logContext, cc.Name)) // Maps each query to the list of metric families it populates. @@ -53,7 +53,7 @@ func NewCollector(logContext string, cc *config.CollectorConfig, constLabels []* // Instantiate queries. queries := make([]*Query, 0, len(cc.Metrics)) for qc, mfs := range queryMFs { - q, err := NewQuery(logContext, qc, mfs...) + q, err := NewQuery(logContext, qc, constLabels, enableQueryMetrics, mfs...) if err != nil { return nil, err } diff --git a/config/global_config.go b/config/global_config.go index 9a12c074..2345d574 100644 --- a/config/global_config.go +++ b/config/global_config.go @@ -21,6 +21,8 @@ type GlobalConfig struct { MaxConns int `yaml:"max_connections" env:"MAX_CONNECTIONS"` // maximum number of open connections to any one target MaxIdleConns int `yaml:"max_idle_connections" env:"MAX_IDLE_CONNECTIONS"` // maximum number of idle connections to any one target + EnableQueryMetrics bool `yaml:"enable_query_metrics,omitempty" env:"ENABLE_QUERY_METRICS"` // expose per-query duration and row count metrics + // Catches all undefined fields and must be empty after parsing. 
XXX map[string]any `yaml:",inline" json:"-"` } @@ -41,6 +43,7 @@ func (g *GlobalConfig) UnmarshalYAML(unmarshal func(any) error) error { g.MaxIdleConns = 3 g.MaxConnLifetime = time.Duration(0) g.WarmupDelay = model.Duration(0) + g.EnableQueryMetrics = false type plain GlobalConfig if err := unmarshal((*plain)(g)); err != nil { diff --git a/query.go b/query.go index 205312ad..a360caf4 100644 --- a/query.go +++ b/query.go @@ -5,10 +5,13 @@ import ( "database/sql" "fmt" "log/slog" + "sort" "time" "github.com/burningalchemist/sql_exporter/config" "github.com/burningalchemist/sql_exporter/errors" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" ) // Query wraps a sql.Stmt and all the metrics populated from it. It helps extract keys and values from result rows. @@ -19,6 +22,9 @@ type Query struct { columnTypes columnTypeMap logContext string + durationDesc MetricDesc + rowsDesc MetricDesc + conn *sql.DB stmt *sql.Stmt } @@ -35,7 +41,7 @@ const ( ) // NewQuery returns a new Query that will populate the given metric families. -func NewQuery(logContext string, qc *config.QueryConfig, metricFamilies ...*MetricFamily) (*Query, errors.WithContext) { +func NewQuery(logContext string, qc *config.QueryConfig, constLabels []*dto.LabelPair, enableQueryMetrics bool, metricFamilies ...*MetricFamily) (*Query, errors.WithContext) { logContext = TrimMissingCtx(fmt.Sprintf(`%s,query=%s`, logContext, qc.Name)) columnTypes := make(columnTypeMap) @@ -58,11 +64,30 @@ func NewQuery(logContext string, qc *config.QueryConfig, metricFamilies ...*Metr } } + var durationDesc, rowsDesc MetricDesc + if enableQueryMetrics { + autoLabels := make([]*dto.LabelPair, 0, len(constLabels)+1) + autoLabels = append(autoLabels, constLabels...) 
+ queryName := qc.Name + queryLabel := queryLabelName + autoLabels = append(autoLabels, &dto.LabelPair{ + Name: &queryLabel, + Value: &queryName, + }) + sort.Sort(labelPairSorter(autoLabels)) + durationDesc = NewAutomaticMetricDesc(logContext, queryDurationName, queryDurationHelp, + prometheus.GaugeValue, autoLabels) + rowsDesc = NewAutomaticMetricDesc(logContext, queryRowsName, queryRowsHelp, + prometheus.GaugeValue, autoLabels) + } + q := Query{ config: qc, metricFamilies: metricFamilies, columnTypes: columnTypes, logContext: logContext, + durationDesc: durationDesc, + rowsDesc: rowsDesc, } return &q, nil } @@ -82,6 +107,15 @@ func setColumnType(logContext, columnName string, ctype columnType, columnTypes // Collect is the equivalent of prometheus.Collector.Collect() but takes a context to run in and a database to run on. func (q *Query) Collect(ctx context.Context, conn *sql.DB, ch chan<- Metric) { + start := time.Now() + var rowCount uint64 + defer func() { + if q.durationDesc != nil { + ch <- NewMetric(q.durationDesc, time.Since(start).Seconds()) + ch <- NewMetric(q.rowsDesc, float64(rowCount)) + } + }() + if ctx.Err() != nil { ch <- NewInvalidMetric(errors.Wrap(q.logContext, ctx.Err())) @@ -114,6 +148,7 @@ func (q *Query) Collect(ctx context.Context, conn *sql.DB, ch chan<- Metric) { ch <- NewInvalidMetric(err) continue } + rowCount++ for _, mf := range q.metricFamilies { mf.Collect(row, ch) } diff --git a/query_test.go b/query_test.go new file mode 100644 index 00000000..0d8e0657 --- /dev/null +++ b/query_test.go @@ -0,0 +1,66 @@ +package sql_exporter + +import ( + "testing" + + "github.com/burningalchemist/sql_exporter/config" + dto "github.com/prometheus/client_model/go" +) + +func TestNewQueryAutoMetricsDisabled(t *testing.T) { + q, err := NewQuery("", &config.QueryConfig{Name: "q1", Query: "SELECT 1"}, nil, false) + if err != nil { + t.Fatalf("NewQuery: %v", err) + } + if q.durationDesc != nil || q.rowsDesc != nil { + t.Fatalf("expected no auto-metric 
descs when disabled, got duration=%v rows=%v", q.durationDesc, q.rowsDesc) + } +} + +func TestNewQueryAutoMetricsEnabled(t *testing.T) { + targetName, targetVal := "target", "db1" + constLabels := []*dto.LabelPair{{Name: &targetName, Value: &targetVal}} + + q, err := NewQuery("", &config.QueryConfig{Name: "q1", Query: "SELECT 1"}, constLabels, true) + if err != nil { + t.Fatalf("NewQuery: %v", err) + } + if q.durationDesc == nil || q.rowsDesc == nil { + t.Fatalf("expected auto-metric descs to be set when enabled") + } + if got := q.durationDesc.Name(); got != queryDurationName { + t.Errorf("duration metric name = %q, want %q", got, queryDurationName) + } + if got := q.rowsDesc.Name(); got != queryRowsName { + t.Errorf("rows metric name = %q, want %q", got, queryRowsName) + } + + gotLabels := q.durationDesc.ConstLabels() + if len(gotLabels) != 2 { + t.Fatalf("expected 2 const labels (target, query), got %d", len(gotLabels)) + } + labels := make(map[string]string, len(gotLabels)) + for _, lp := range gotLabels { + labels[lp.GetName()] = lp.GetValue() + } + if labels[queryLabelName] != "q1" { + t.Errorf("query label = %q, want q1", labels[queryLabelName]) + } + if labels["target"] != "db1" { + t.Errorf("target label = %q, want db1", labels["target"]) + } +} + +func TestNewQueryAutoMetricsEnabledNoConstLabels(t *testing.T) { + q, err := NewQuery("", &config.QueryConfig{Name: "singleton", Query: "SELECT 1"}, nil, true) + if err != nil { + t.Fatalf("NewQuery: %v", err) + } + gotLabels := q.durationDesc.ConstLabels() + if len(gotLabels) != 1 { + t.Fatalf("expected just the query label, got %d labels", len(gotLabels)) + } + if gotLabels[0].GetName() != queryLabelName || gotLabels[0].GetValue() != "singleton" { + t.Errorf("expected query=singleton, got %s=%s", gotLabels[0].GetName(), gotLabels[0].GetValue()) + } +} diff --git a/target.go b/target.go index 75fb0e31..11608e81 100644 --- a/target.go +++ b/target.go @@ -24,6 +24,11 @@ const ( upMetricHelp = "1 if the target is 
reachable, or 0 if the scrape failed" scrapeDurationName = "scrape_duration_seconds" scrapeDurationHelp = "How long it took to scrape the target in seconds" + queryDurationName = "query_duration_seconds" + queryDurationHelp = "How long the named query took to execute in seconds (last scrape)" + queryRowsName = "query_rows_returned" + queryRowsHelp = "Number of rows returned by the named query (last scrape)" + queryLabelName = "query" ) // Target collects SQL metrics from a single sql.DB instance. It aggregates one or more Collectors and it looks much @@ -86,7 +91,7 @@ func NewTarget( collectors := make([]Collector, 0, len(ccs)) for _, cc := range ccs { - c, err := NewCollector(logContext, cc, constLabelPairs) + c, err := NewCollector(logContext, cc, constLabelPairs, gc.EnableQueryMetrics) if err != nil { return nil, err }