diff --git a/src/content/docs/opentelemetry/integrations/kafka/kubernetes-self-managed.mdx b/src/content/docs/opentelemetry/integrations/kafka/kubernetes-self-managed.mdx new file mode 100644 index 00000000000..3d789307781 --- /dev/null +++ b/src/content/docs/opentelemetry/integrations/kafka/kubernetes-self-managed.mdx @@ -0,0 +1,4716 @@ +--- +title: Monitor self-managed Kafka on Kubernetes with OpenTelemetry +tags: + - Integrations + - OpenTelemetry + - Kafka + - Kubernetes + - Self-managed +metaDescription: "Deploy OpenTelemetry Collector on Kubernetes to monitor self-managed Kafka clusters using the OpenTelemetry Java agent or Prometheus JMX Exporter." +freshnessValidatedDate: never +--- + +Monitor your self-managed Apache Kafka cluster running on Kubernetes by deploying the OpenTelemetry Collector to gather and forward metrics to New Relic. + +## Architecture [#architecture-overview] + +New Relic supports two approaches for monitoring self-managed Kafka on Kubernetes: using the OpenTelemetry Java agent or the Prometheus JMX Exporter. The following diagrams illustrate the data flow for each approach. + +Kubernetes self-managed Kafka monitoring architecture + +## Installation steps [#installation-steps] + + + + Via OTel Java agent + Via Prometheus JMX Exporter + + + + +Follow these steps to set up comprehensive Kafka monitoring. You'll install the OpenTelemetry Java agent on your brokers and deploy a collector to gather and send metrics and logs to New Relic. + + + + + +### Before you begin [#prerequisites] + +Ensure you have: + +* A [New Relic account](https://newrelic.com/signup) with a +* A Kubernetes cluster with `kubectl` access +* A Kafka cluster deployed as a StatefulSet +* Ability to modify and redeploy the Kafka StatefulSet + + + + + +### Deploy OpenTelemetry Collector [#deploy-opentelemetry-collector] + +Deploy the OpenTelemetry Collector in your cluster. This step also creates the `kafka-jmx-config` ConfigMap that defines which JMX metrics the Java agent collects from each broker pod. The Collector must be running before you restart the Kafka brokers in the next step. + + + + Helm install (recommended) + Manifest install + + + + + +1. Create New Relic credentials secret + + + + ```bash + kubectl create secret generic newrelic-otlp-secret \ + --namespace newrelic \ + --from-literal=NEW_RELIC_LICENSE_KEY='your-license-key-here' \ + --from-literal=NEW_RELIC_OTLP_ENDPOINT='https://otlp.nr-data.net:4317' + ``` + + + + ```bash + kubectl create secret generic newrelic-otlp-secret \ + --namespace newrelic \ + --from-literal=NEW_RELIC_LICENSE_KEY='your-license-key-here' \ + --from-literal=NEW_RELIC_OTLP_ENDPOINT='https://otlp.eu01.nr-data.net:4317' + ``` + + + + + For other endpoint configurations, see [Configure your OTLP endpoint](/docs/opentelemetry/best-practices/opentelemetry-otlp/#configure-endpoint-port-protocol). + + +2. Create values.yaml with collector configuration + +Both NRDOT and OpenTelemetry Collectors use identical configuration. Choose your preferred collector image: + + + + **NRDOT** is New Relic's supported distribution of the OpenTelemetry Collector, providing full New Relic support. For more information, see the [NRDOT Collector GitHub repository](https://github.com/newrelic/nrdot-collector-releases/tree/main/distributions/nrdot-collector). 
+ + Create `values.yaml` with the following content: + +```yaml +mode: deployment +replicaCount: 1 + +image: + repository: newrelic/nrdot-collector + tag: "latest" + pullPolicy: Always + +serviceAccount: + create: true + name: otel-collector + +podSecurityContext: + runAsNonRoot: true + runAsUser: 10001 + +securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + +resources: + requests: + memory: 512Mi + cpu: 250m + limits: + memory: 1Gi + cpu: 500m + +extraEnvsFrom: + - secretRef: + name: newrelic-otlp-secret + +# Disable unused default ports +ports: + jaeger-compact: + enabled: false + jaeger-thrift: + enabled: false + jaeger-grpc: + enabled: false + zipkin: + enabled: false + +config: + receivers: + # Disable default receivers not needed in NRDOT + jaeger: null + zipkin: null + + # OTLP receiver: receives Kafka JMX metrics from broker pods (via Java agent) and app telemetry + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + + # Kafka metrics receiver for consumer lag, topic, and partition metrics + kafkametrics: + brokers: + # Format: ..svc.cluster.local: + - "kafka.kafka.svc.cluster.local:9092" + collection_interval: 30s + protocol_version: 2.0.0 + scrapers: + - brokers + - topics + - consumers + topic_match: "^[^_].*$" + metrics: + kafka.topic.min_insync_replicas: + enabled: true + kafka.topic.replication_factor: + enabled: true + kafka.partition.replicas: + enabled: false + kafka.partition.oldest_offset: + enabled: false + kafka.partition.current_offset: + enabled: false + + exporters: + otlp/newrelic: + endpoint: ${NEW_RELIC_OTLP_ENDPOINT} + tls: + insecure: false + sending_queue: + num_consumers: 12 + queue_size: 5000 + retry_on_failure: + enabled: true + compression: gzip + timeout: 30s + headers: + api-key: ${NEW_RELIC_LICENSE_KEY} + + processors: + batch/aggregation: + send_batch_size: 1024 + timeout: 30s + + resource: + attributes: + - action: insert + key: kafka.cluster.name + value: my-kafka-cluster + + transform/remove_broker_id: + metric_statements: + - context: resource + statements: + - delete_key(attributes, "broker.id") + + transform/remove_extra_attributes: + metric_statements: + - context: resource + statements: + - delete_matching_keys(attributes, "^process\\..*") + - delete_matching_keys(attributes, "^telemetry\\..*") + - delete_key(attributes, "host.arch") + - delete_key(attributes, "os.description") + - delete_matching_keys(attributes, "^cloud\\..*") + - delete_key(attributes, "service.instance.id") where IsMatch(attributes["service.name"], "^unknown_service:") + - delete_key(attributes, "service.name") where IsMatch(attributes["service.name"], "^unknown_service:") + + transform/des_units: + metric_statements: + - context: metric + statements: + - set(description, "") where description != "" + - set(unit, "") where unit != "" + + filter/internal_topics: + metrics: + datapoint: + - 'attributes["topic"] != nil and IsMatch(attributes["topic"], "^__.*")' + + filter/include_cluster_metrics: + metrics: + include: + match_type: regexp + metric_names: + - "kafka\\.partition\\.offline" + - "kafka\\.(leader|unclean)\\.election\\.rate" + - "kafka\\.partition\\.non_preferred_leader" + - "kafka\\.broker\\.fenced\\.count" + - "kafka\\.cluster\\.partition\\.count" + - "kafka\\.cluster\\.topic\\.count" + + filter/exclude_cluster_metrics: + metrics: + exclude: + match_type: regexp + metric_names: + - "kafka\\.partition\\.offline" + - "kafka\\.(leader|unclean)\\.election\\.rate" + - 
"kafka\\.partition\\.non_preferred_leader" + - "kafka\\.broker\\.fenced\\.count" + - "kafka\\.cluster\\.partition\\.count" + - "kafka\\.cluster\\.topic\\.count" + + cumulativetodelta: + + metricstransform/kafka_topic_sum_aggregation: + transforms: + - include: kafka.partition.replicas_in_sync + action: insert + new_name: kafka.partition.replicas_in_sync.total + operations: + - action: aggregate_labels + label_set: [topic] + aggregation_type: sum + - include: kafka.partition.replicas + action: insert + new_name: kafka.partition.replicas.total + operations: + - action: aggregate_labels + label_set: [topic] + aggregation_type: sum + + filter/remove_partition_level_replicas: + metrics: + exclude: + match_type: strict + metric_names: + - kafka.partition.replicas_in_sync + + groupbyattrs/cluster: + keys: [kafka.cluster.name] + + metricstransform/cluster_max: + transforms: + - include: "kafka\\.partition\\.offline|kafka\\.leader\\.election\\.rate|kafka\\.unclean\\.election\\.rate|kafka\\.partition\\.non_preferred_leader|kafka\\.broker\\.fenced\\.count|kafka\\.cluster\\.partition\\.count|kafka\\.cluster\\.topic\\.count" + match_type: regexp + action: update + operations: + - action: aggregate_labels + aggregation_type: max + label_set: [] + + service: + pipelines: + # Null out the Helm chart's default pipelines — they reference the jaeger/zipkin + # receivers we disabled above, which causes a startup error if left enabled. + traces: null + logs: null + metrics: null + + # Broker metrics pipeline (excludes cluster-level metrics) + metrics/broker: + receivers: [otlp, kafkametrics] + processors: + - resource + - filter/exclude_cluster_metrics + - filter/internal_topics + - transform/remove_extra_attributes + - transform/des_units + - cumulativetodelta + - metricstransform/kafka_topic_sum_aggregation + - filter/remove_partition_level_replicas + - batch/aggregation + exporters: [otlp/newrelic] + + # Cluster metrics pipeline (only cluster-level metrics, no broker.id) + metrics/cluster: + receivers: [otlp] + processors: + - resource + - filter/include_cluster_metrics + - transform/remove_broker_id + - transform/remove_extra_attributes + - transform/des_units + - cumulativetodelta + - groupbyattrs/cluster + - metricstransform/cluster_max + - batch/aggregation + exporters: [otlp/newrelic] + + # APM traces pipeline (producer + consumer spans via OTel Java agent) + traces/apps: + receivers: [otlp] + processors: [resource, batch/aggregation] + exporters: [otlp/newrelic] + + # APM logs pipeline (producer + consumer logs via OTel Java agent) + logs/apps: + receivers: [otlp] + processors: [resource, batch/aggregation] + exporters: [otlp/newrelic] + +extraObjects: + - apiVersion: v1 + kind: ConfigMap + metadata: + name: kafka-jmx-config + namespace: kafka + data: + kafka-jmx-config.yaml: | + --- + rules: + # Per-topic custom metrics + - bean: kafka.server:type=BrokerTopicMetrics,name=MessagesInPerSec,topic=* + metricAttribute: + topic: param(topic) + mapping: + Count: + metric: kafka.prod.msg.count + type: counter + desc: The number of messages per topic + unit: "{message}" + + - bean: kafka.server:type=BrokerTopicMetrics,name=BytesInPerSec,topic=* + metricAttribute: + topic: param(topic) + direction: const(in) + mapping: + Count: + metric: kafka.topic.io + type: counter + desc: The bytes received or sent per topic + unit: By + + - bean: kafka.server:type=BrokerTopicMetrics,name=BytesOutPerSec,topic=* + metricAttribute: + topic: param(topic) + direction: const(out) + mapping: + Count: + metric: kafka.topic.io + 
type: counter + desc: The bytes received or sent per topic + unit: By + + # Cluster-level metrics + - bean: kafka.controller:type=KafkaController,name=GlobalTopicCount + mapping: + Value: + metric: kafka.cluster.topic.count + type: gauge + desc: The total number of global topics in the cluster + unit: "{topic}" + + - bean: kafka.controller:type=KafkaController,name=GlobalPartitionCount + mapping: + Value: + metric: kafka.cluster.partition.count + type: gauge + desc: The total number of global partitions in the cluster + unit: "{partition}" + + - bean: kafka.controller:type=KafkaController,name=FencedBrokerCount + mapping: + Value: + metric: kafka.broker.fenced.count + type: gauge + desc: The number of fenced brokers in the cluster + unit: "{broker}" + + - bean: kafka.controller:type=KafkaController,name=PreferredReplicaImbalanceCount + mapping: + Value: + metric: kafka.partition.non_preferred_leader + type: gauge + desc: The count of topic partitions for which the leader is not the preferred leader + unit: "{partition}" + + # Broker-level metrics + - bean: kafka.server:type=ReplicaManager,name=UnderMinIsrPartitionCount + mapping: + Value: + metric: kafka.partition.under_min_isr + type: gauge + desc: The number of partitions where the number of in-sync replicas is less than the minimum + unit: "{partition}" + + - bean: java.lang:type=Runtime + mapping: + Uptime: + metric: kafka.broker.uptime + type: gauge + desc: Broker uptime in milliseconds + unit: ms + + - bean: kafka.server:type=ReplicaManager,name=LeaderCount + mapping: + Value: + metric: kafka.broker.leader.count + type: gauge + desc: Number of partitions for which this broker is the leader + unit: "{partition}" + + # JVM metrics + - bean: java.lang:type=GarbageCollector,name=* + mapping: + CollectionCount: + metric: jvm.gc.collections.count + type: counter + unit: "{collection}" + desc: total number of collections that have occurred + metricAttribute: + name: param(name) + + - bean: java.lang:type=Memory + unit: By + prefix: jvm.memory. + dropNegativeValues: true + mapping: + HeapMemoryUsage.max: + metric: heap.max + desc: current heap usage + type: gauge + HeapMemoryUsage.used: + metric: heap.used + desc: current heap usage + type: gauge + + - bean: java.lang:type=Threading + mapping: + ThreadCount: + metric: jvm.thread.count + type: gauge + unit: "{thread}" + desc: Total thread count + + - bean: java.lang:type=OperatingSystem + prefix: jvm. 
+ dropNegativeValues: true + mapping: + SystemCpuLoad: + metric: system.cpu.utilization + type: gauge + unit: '1' + desc: Recent CPU utilization for whole system (0.0 to 1.0) + + - bean: kafka.server:type=BrokerTopicMetrics,name=MessagesInPerSec + mapping: + Count: + metric: kafka.message.count + type: counter + desc: The number of messages received by the broker + unit: "{message}" + + - bean: kafka.server:type=BrokerTopicMetrics,name=TotalFetchRequestsPerSec + metricAttribute: + type: const(fetch) + mapping: + Count: + metric: &metric kafka.request.count + type: &type counter + desc: &desc The number of requests received by the broker + unit: &unit "{request}" + + - bean: kafka.server:type=BrokerTopicMetrics,name=TotalProduceRequestsPerSec + metricAttribute: + type: const(produce) + mapping: + Count: + metric: *metric + type: *type + desc: *desc + unit: *unit + + - bean: kafka.server:type=BrokerTopicMetrics,name=FailedFetchRequestsPerSec + metricAttribute: + type: const(fetch) + mapping: + Count: + metric: &metric kafka.request.failed + type: &type counter + desc: &desc The number of requests to the broker resulting in a failure + unit: &unit "{request}" + + - bean: kafka.server:type=BrokerTopicMetrics,name=FailedProduceRequestsPerSec + metricAttribute: + type: const(produce) + mapping: + Count: + metric: *metric + type: *type + desc: *desc + unit: *unit + + - beans: + - kafka.network:type=RequestMetrics,name=TotalTimeMs,request=Produce + - kafka.network:type=RequestMetrics,name=TotalTimeMs,request=FetchConsumer + - kafka.network:type=RequestMetrics,name=TotalTimeMs,request=FetchFollower + metricAttribute: + type: param(request) + unit: ms + mapping: + 99thPercentile: + metric: kafka.request.time.99p + type: gauge + desc: The 99th percentile time the broker has taken to service requests + + - bean: kafka.network:type=RequestChannel,name=RequestQueueSize + mapping: + Value: + metric: kafka.request.queue + type: gauge + desc: Size of the request queue + unit: "{request}" + + - bean: kafka.server:type=BrokerTopicMetrics,name=BytesInPerSec + metricAttribute: + direction: const(in) + mapping: + Count: + metric: &metric kafka.network.io + type: &type counter + desc: &desc The bytes received or sent by the broker + unit: &unit By + + - bean: kafka.server:type=BrokerTopicMetrics,name=BytesOutPerSec + metricAttribute: + direction: const(out) + mapping: + Count: + metric: *metric + type: *type + desc: *desc + unit: *unit + + - beans: + - kafka.server:type=DelayedOperationPurgatory,name=PurgatorySize,delayedOperation=Produce + - kafka.server:type=DelayedOperationPurgatory,name=PurgatorySize,delayedOperation=Fetch + metricAttribute: + type: param(delayedOperation) + mapping: + Value: + metric: kafka.purgatory.size + type: gauge + desc: The number of requests waiting in purgatory + unit: "{request}" + + - bean: kafka.server:type=ReplicaManager,name=PartitionCount + mapping: + Value: + metric: kafka.partition.count + type: gauge + desc: The number of partitions on the broker + unit: "{partition}" + + - bean: kafka.controller:type=KafkaController,name=OfflinePartitionsCount + mapping: + Value: + metric: kafka.partition.offline + type: gauge + desc: The number of partitions offline + unit: "{partition}" + + - bean: kafka.server:type=ReplicaManager,name=UnderReplicatedPartitions + mapping: + Value: + metric: kafka.partition.under_replicated + type: gauge + desc: The number of under replicated partitions + unit: "{partition}" + + - bean: kafka.server:type=ReplicaManager,name=IsrShrinksPerSec + 
metricAttribute: + operation: const(shrink) + mapping: + Count: + metric: kafka.isr.operation.count + type: counter + desc: The number of in-sync replica shrink and expand operations + unit: "{operation}" + + - bean: kafka.server:type=ReplicaManager,name=IsrExpandsPerSec + metricAttribute: + operation: const(expand) + mapping: + Count: + metric: kafka.isr.operation.count + type: counter + desc: The number of in-sync replica shrink and expand operations + unit: "{operation}" + + - bean: kafka.server:type=ReplicaFetcherManager,name=MaxLag,clientId=Replica + mapping: + Value: + metric: kafka.max.lag + type: gauge + desc: The max lag in messages between follower and leader replicas + unit: "{message}" + + - bean: kafka.controller:type=KafkaController,name=ActiveControllerCount + mapping: + Value: + metric: kafka.controller.active.count + type: gauge + desc: Number of active controllers in the cluster + unit: "{controller}" + + - bean: kafka.controller:type=ControllerStats,name=LeaderElectionRateAndTimeMs + mapping: + Count: + metric: kafka.leader.election.rate + type: counter + desc: The leader election count + unit: "{election}" + + - bean: kafka.controller:type=ControllerStats,name=UncleanLeaderElectionsPerSec + mapping: + Count: + metric: kafka.unclean.election.rate + type: counter + desc: Unclean leader election count + unit: "{election}" + + # ── Additional metrics — remove this section to reduce data ingest ─────────── + + - beans: + - kafka.network:type=RequestMetrics,name=TotalTimeMs,request=Produce + - kafka.network:type=RequestMetrics,name=TotalTimeMs,request=FetchConsumer + - kafka.network:type=RequestMetrics,name=TotalTimeMs,request=FetchFollower + metricAttribute: + type: param(request) + unit: ms + mapping: + Count: + metric: kafka.request.time.total + type: counter + desc: The total time the broker has taken to service requests + 50thPercentile: + metric: kafka.request.time.50p + type: gauge + desc: The 50th percentile time the broker has taken to service requests + Mean: + metric: kafka.request.time.avg + type: gauge + desc: The average time the broker has taken to service requests + + - bean: kafka.log:type=LogFlushStats,name=LogFlushRateAndTimeMs + unit: ms + type: gauge + prefix: kafka.logs.flush. + mapping: + Count: + metric: count + unit: '{flush}' + type: counter + desc: Log flush count + 50thPercentile: + metric: time.50p + desc: Log flush time - 50th percentile + 99thPercentile: + metric: time.99p + desc: Log flush time - 99th percentile + + - bean: java.lang:type=GarbageCollector,name=* + mapping: + CollectionTime: + metric: jvm.gc.collections.elapsed + type: counter + unit: ms + desc: the approximate accumulated collection elapsed time in milliseconds + metricAttribute: + name: param(name) + + - bean: java.lang:type=ClassLoading + mapping: + LoadedClassCount: + metric: jvm.class.count + type: gauge + unit: "{class}" + desc: Currently loaded class count + + - bean: java.lang:type=Memory + unit: By + prefix: jvm.memory. + dropNegativeValues: true + mapping: + HeapMemoryUsage.committed: + metric: heap.committed + desc: Committed heap memory + type: gauge + + - bean: java.lang:type=OperatingSystem + prefix: jvm. 
+ dropNegativeValues: true + mapping: + SystemLoadAverage: + metric: system.cpu.load_1m + type: gauge + unit: "{run_queue_item}" + desc: System load average (1 minute) + AvailableProcessors: + metric: cpu.count + type: gauge + unit: "{cpu}" + desc: Number of processors available + ProcessCpuLoad: + metric: cpu.recent_utilization + type: gauge + unit: '1' + desc: Recent CPU utilization for JVM process (0.0 to 1.0) + OpenFileDescriptorCount: + metric: file_descriptor.count + type: gauge + unit: "{file_descriptor}" + desc: Number of open file descriptors + + - bean: java.lang:type=MemoryPool,name=* + type: gauge + unit: By + metricAttribute: + name: param(name) + mapping: + Usage.used: + metric: jvm.memory.pool.used + desc: Memory pool usage by generation + Usage.max: + metric: jvm.memory.pool.max + desc: Maximum memory pool size + CollectionUsage.used: + metric: jvm.memory.pool.used_after_last_gc + desc: Memory used after last GC +``` + +#### Configuration parameters + +The following table describes the key configuration parameters for Kafka monitoring with NRDOT Collector: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Parameter | Description | Default Value |
|---|---|---|
| `NEW_RELIC_OTLP_ENDPOINT` | New Relic OTLP endpoint for your region. For more information, refer to the [New Relic OTLP endpoints documentation](/docs/opentelemetry/best-practices/opentelemetry-otlp/#configure-endpoint-port-protocol). | None (Required) |
| `NEW_RELIC_LICENSE_KEY` | Your New Relic license key for authentication | None (Required) |
| `kafkametrics.brokers` | Kafka bootstrap service DNS. Format: `<service>.<namespace>.svc.cluster.local:<port>` | `kafka.kafka.svc.cluster.local:9092` |
| `resource.attributes.kafka.cluster.name` | Name to identify your Kafka cluster in New Relic | `my-kafka-cluster` |
| `extraObjects.namespace` | Kubernetes namespace where your Kafka cluster is deployed | `kafka` |
| `kafkametrics.collection_interval` | Interval for collecting Kafka metrics | `30s` |
| `kafkametrics.protocol_version` | Kafka protocol version for compatibility | `2.0.0` |
| `replicaCount` | Number of collector replicas to deploy | `1` |
| `resources.requests.memory` | Minimum memory allocation for the collector | `512Mi` |
| `resources.requests.cpu` | Minimum CPU allocation for the collector | `250m` |
| `resources.limits.memory` | Maximum memory limit for the collector | `1Gi` |
| `resources.limits.cpu` | Maximum CPU limit for the collector | `500m` |
| `image.repository` | Container image repository for the collector | `newrelic/nrdot-collector` |
| `image.tag` | Container image tag to use | `latest` |
| `batch/aggregation.send_batch_size` | Number of metrics to batch before sending | `1024` |
| `batch/aggregation.timeout` | Maximum time to wait before sending a batch | `30s` |
| `otlp/newrelic.timeout` | Timeout for sending data to New Relic | `30s` |
| `sending_queue.num_consumers` | Number of consumers processing the sending queue | `12` |
| `sending_queue.queue_size` | Size of the sending queue buffer | `5000` |
+ + + Use the community **OpenTelemetry Collector** for maximum flexibility and vendor-neutral deployment. + + Create `values.yaml` with the same content as the NRDOT option above, but change the image: + +```yaml +image: + repository: otel/opentelemetry-collector-contrib + tag: "latest" + pullPolicy: Always +``` + + All other configuration (receivers, processors, pipelines, and `extraObjects`) is identical. + + **Customize for your cluster**: Use the same configuration parameters as the NRDOT option above, including resource limits. + + +
+ +For advanced configuration options, see: +- [OTLP receiver documentation](https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver/otlpreceiver) +- [Kafka metrics receiver documentation](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/kafkametricsreceiver) + +3. Install OpenTelemetry Collector with Helm + +```bash +helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts +helm upgrade kafka-monitoring open-telemetry/opentelemetry-collector \ + --install \ + --namespace newrelic \ + --create-namespace \ + -f values.yaml +``` + +4. Verify the deployment + +```bash +# Check pod status +kubectl get pods -n newrelic -l app.kubernetes.io/name=opentelemetry-collector + +# View logs to verify metrics are being received from broker pods +kubectl logs -n newrelic -l app.kubernetes.io/name=opentelemetry-collector --tail=50 +``` + +
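Optionally, confirm the collector's OTLP service resolves from your Kafka namespace before attaching the broker Java agent to it. This is a sketch, not part of the required setup: the service name below assumes the `kafka-monitoring` Helm release used above, so substitute whatever `kubectl get svc -n newrelic` reports:

```bash
# List the services created by the Helm release
kubectl get svc -n newrelic

# Resolve the collector service DNS from the Kafka namespace via a one-off pod
kubectl run dns-test --namespace kafka --rm -it --restart=Never \
  --image=busybox:latest -- \
  nslookup kafka-monitoring-opentelemetry-collector.newrelic.svc.cluster.local
```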
+ + + +**Create New Relic credentials secret** + + + + ```bash + kubectl create secret generic newrelic-otlp-secret \ + --namespace newrelic \ + --from-literal=NEW_RELIC_LICENSE_KEY='your-license-key-here' \ + --from-literal=NEW_RELIC_OTLP_ENDPOINT='https://otlp.nr-data.net:4317' + ``` + + + + ```bash + kubectl create secret generic newrelic-otlp-secret \ + --namespace newrelic \ + --from-literal=NEW_RELIC_LICENSE_KEY='your-license-key-here' \ + --from-literal=NEW_RELIC_OTLP_ENDPOINT='https://otlp.eu01.nr-data.net:4317' + ``` + + + + + For other endpoint configurations, see [Configure your OTLP endpoint](/docs/opentelemetry/best-practices/opentelemetry-otlp/#configure-endpoint-port-protocol). + + +**Create manifest files** + +Both NRDOT and OpenTelemetry Collectors use identical configuration. Only the container image differs. Both also require the `kafka-jmx-config` ConfigMap applied to your Kafka namespace. + +**Create `kafka-jmx-config.yaml`** - JMX metrics configuration for the Java agent (apply to your Kafka namespace): + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: kafka-jmx-config + namespace: kafka +data: + kafka-jmx-config.yaml: | + --- + rules: + # Per-topic custom metrics + - bean: kafka.server:type=BrokerTopicMetrics,name=MessagesInPerSec,topic=* + metricAttribute: + topic: param(topic) + mapping: + Count: + metric: kafka.prod.msg.count + type: counter + desc: The number of messages per topic + unit: "{message}" + + - bean: kafka.server:type=BrokerTopicMetrics,name=BytesInPerSec,topic=* + metricAttribute: + topic: param(topic) + direction: const(in) + mapping: + Count: + metric: kafka.topic.io + type: counter + desc: The bytes received or sent per topic + unit: By + + - bean: kafka.server:type=BrokerTopicMetrics,name=BytesOutPerSec,topic=* + metricAttribute: + topic: param(topic) + direction: const(out) + mapping: + Count: + metric: kafka.topic.io + type: counter + desc: The bytes received or sent per topic + unit: By + + # Cluster-level metrics + - bean: kafka.controller:type=KafkaController,name=GlobalTopicCount + mapping: + Value: + metric: kafka.cluster.topic.count + type: gauge + desc: The total number of global topics in the cluster + unit: "{topic}" + + - bean: kafka.controller:type=KafkaController,name=GlobalPartitionCount + mapping: + Value: + metric: kafka.cluster.partition.count + type: gauge + desc: The total number of global partitions in the cluster + unit: "{partition}" + + - bean: kafka.controller:type=KafkaController,name=FencedBrokerCount + mapping: + Value: + metric: kafka.broker.fenced.count + type: gauge + desc: The number of fenced brokers in the cluster + unit: "{broker}" + + - bean: kafka.controller:type=KafkaController,name=PreferredReplicaImbalanceCount + mapping: + Value: + metric: kafka.partition.non_preferred_leader + type: gauge + desc: The count of topic partitions for which the leader is not the preferred leader + unit: "{partition}" + + # Broker-level metrics + - bean: kafka.server:type=ReplicaManager,name=UnderMinIsrPartitionCount + mapping: + Value: + metric: kafka.partition.under_min_isr + type: gauge + desc: The number of partitions where the number of in-sync replicas is less than the minimum + unit: "{partition}" + + - bean: java.lang:type=Runtime + mapping: + Uptime: + metric: kafka.broker.uptime + type: gauge + desc: Broker uptime in milliseconds + unit: ms + + - bean: kafka.server:type=ReplicaManager,name=LeaderCount + mapping: + Value: + metric: kafka.broker.leader.count + type: gauge + desc: Number of partitions for 
which this broker is the leader + unit: "{partition}" + + # JVM metrics + - bean: java.lang:type=GarbageCollector,name=* + mapping: + CollectionCount: + metric: jvm.gc.collections.count + type: counter + unit: "{collection}" + desc: total number of collections that have occurred + metricAttribute: + name: param(name) + + - bean: java.lang:type=Memory + unit: By + prefix: jvm.memory. + dropNegativeValues: true + mapping: + HeapMemoryUsage.max: + metric: heap.max + desc: current heap usage + type: gauge + HeapMemoryUsage.used: + metric: heap.used + desc: current heap usage + type: gauge + + - bean: java.lang:type=Threading + mapping: + ThreadCount: + metric: jvm.thread.count + type: gauge + unit: "{thread}" + desc: Total thread count + + - bean: java.lang:type=OperatingSystem + prefix: jvm. + dropNegativeValues: true + mapping: + SystemCpuLoad: + metric: system.cpu.utilization + type: gauge + unit: '1' + desc: Recent CPU utilization for whole system (0.0 to 1.0) + + - bean: kafka.server:type=BrokerTopicMetrics,name=MessagesInPerSec + mapping: + Count: + metric: kafka.message.count + type: counter + desc: The number of messages received by the broker + unit: "{message}" + + - bean: kafka.server:type=BrokerTopicMetrics,name=TotalFetchRequestsPerSec + metricAttribute: + type: const(fetch) + mapping: + Count: + metric: &metric kafka.request.count + type: &type counter + desc: &desc The number of requests received by the broker + unit: &unit "{request}" + + - bean: kafka.server:type=BrokerTopicMetrics,name=TotalProduceRequestsPerSec + metricAttribute: + type: const(produce) + mapping: + Count: + metric: *metric + type: *type + desc: *desc + unit: *unit + + - bean: kafka.server:type=BrokerTopicMetrics,name=FailedFetchRequestsPerSec + metricAttribute: + type: const(fetch) + mapping: + Count: + metric: &metric kafka.request.failed + type: &type counter + desc: &desc The number of requests to the broker resulting in a failure + unit: &unit "{request}" + + - bean: kafka.server:type=BrokerTopicMetrics,name=FailedProduceRequestsPerSec + metricAttribute: + type: const(produce) + mapping: + Count: + metric: *metric + type: *type + desc: *desc + unit: *unit + + - beans: + - kafka.network:type=RequestMetrics,name=TotalTimeMs,request=Produce + - kafka.network:type=RequestMetrics,name=TotalTimeMs,request=FetchConsumer + - kafka.network:type=RequestMetrics,name=TotalTimeMs,request=FetchFollower + metricAttribute: + type: param(request) + unit: ms + mapping: + 99thPercentile: + metric: kafka.request.time.99p + type: gauge + desc: The 99th percentile time the broker has taken to service requests + + - bean: kafka.network:type=RequestChannel,name=RequestQueueSize + mapping: + Value: + metric: kafka.request.queue + type: gauge + desc: Size of the request queue + unit: "{request}" + + - bean: kafka.server:type=BrokerTopicMetrics,name=BytesInPerSec + metricAttribute: + direction: const(in) + mapping: + Count: + metric: &metric kafka.network.io + type: &type counter + desc: &desc The bytes received or sent by the broker + unit: &unit By + + - bean: kafka.server:type=BrokerTopicMetrics,name=BytesOutPerSec + metricAttribute: + direction: const(out) + mapping: + Count: + metric: *metric + type: *type + desc: *desc + unit: *unit + + - beans: + - kafka.server:type=DelayedOperationPurgatory,name=PurgatorySize,delayedOperation=Produce + - kafka.server:type=DelayedOperationPurgatory,name=PurgatorySize,delayedOperation=Fetch + metricAttribute: + type: param(delayedOperation) + mapping: + Value: + metric: kafka.purgatory.size 
+ type: gauge + desc: The number of requests waiting in purgatory + unit: "{request}" + + - bean: kafka.server:type=ReplicaManager,name=PartitionCount + mapping: + Value: + metric: kafka.partition.count + type: gauge + desc: The number of partitions on the broker + unit: "{partition}" + + - bean: kafka.controller:type=KafkaController,name=OfflinePartitionsCount + mapping: + Value: + metric: kafka.partition.offline + type: gauge + desc: The number of partitions offline + unit: "{partition}" + + - bean: kafka.server:type=ReplicaManager,name=UnderReplicatedPartitions + mapping: + Value: + metric: kafka.partition.under_replicated + type: gauge + desc: The number of under replicated partitions + unit: "{partition}" + + - bean: kafka.server:type=ReplicaManager,name=IsrShrinksPerSec + metricAttribute: + operation: const(shrink) + mapping: + Count: + metric: kafka.isr.operation.count + type: counter + desc: The number of in-sync replica shrink and expand operations + unit: "{operation}" + + - bean: kafka.server:type=ReplicaManager,name=IsrExpandsPerSec + metricAttribute: + operation: const(expand) + mapping: + Count: + metric: kafka.isr.operation.count + type: counter + desc: The number of in-sync replica shrink and expand operations + unit: "{operation}" + + - bean: kafka.server:type=ReplicaFetcherManager,name=MaxLag,clientId=Replica + mapping: + Value: + metric: kafka.max.lag + type: gauge + desc: The max lag in messages between follower and leader replicas + unit: "{message}" + + - bean: kafka.controller:type=KafkaController,name=ActiveControllerCount + mapping: + Value: + metric: kafka.controller.active.count + type: gauge + desc: Number of active controllers in the cluster + unit: "{controller}" + + - bean: kafka.controller:type=ControllerStats,name=LeaderElectionRateAndTimeMs + mapping: + Count: + metric: kafka.leader.election.rate + type: counter + desc: The leader election count + unit: "{election}" + + - bean: kafka.controller:type=ControllerStats,name=UncleanLeaderElectionsPerSec + mapping: + Count: + metric: kafka.unclean.election.rate + type: counter + desc: Unclean leader election count + unit: "{election}" + + # ── Additional metrics — remove this section to reduce data ingest ─────────── + + - beans: + - kafka.network:type=RequestMetrics,name=TotalTimeMs,request=Produce + - kafka.network:type=RequestMetrics,name=TotalTimeMs,request=FetchConsumer + - kafka.network:type=RequestMetrics,name=TotalTimeMs,request=FetchFollower + metricAttribute: + type: param(request) + unit: ms + mapping: + Count: + metric: kafka.request.time.total + type: counter + desc: The total time the broker has taken to service requests + 50thPercentile: + metric: kafka.request.time.50p + type: gauge + desc: The 50th percentile time the broker has taken to service requests + Mean: + metric: kafka.request.time.avg + type: gauge + desc: The average time the broker has taken to service requests + + - bean: kafka.log:type=LogFlushStats,name=LogFlushRateAndTimeMs + unit: ms + type: gauge + prefix: kafka.logs.flush. 
+ mapping: + Count: + metric: count + unit: '{flush}' + type: counter + desc: Log flush count + 50thPercentile: + metric: time.50p + desc: Log flush time - 50th percentile + 99thPercentile: + metric: time.99p + desc: Log flush time - 99th percentile + + - bean: java.lang:type=GarbageCollector,name=* + mapping: + CollectionTime: + metric: jvm.gc.collections.elapsed + type: counter + unit: ms + desc: the approximate accumulated collection elapsed time in milliseconds + metricAttribute: + name: param(name) + + - bean: java.lang:type=ClassLoading + mapping: + LoadedClassCount: + metric: jvm.class.count + type: gauge + unit: "{class}" + desc: Currently loaded class count + + - bean: java.lang:type=Memory + unit: By + prefix: jvm.memory. + dropNegativeValues: true + mapping: + HeapMemoryUsage.committed: + metric: heap.committed + desc: Committed heap memory + type: gauge + + - bean: java.lang:type=OperatingSystem + prefix: jvm. + dropNegativeValues: true + mapping: + SystemLoadAverage: + metric: system.cpu.load_1m + type: gauge + unit: "{run_queue_item}" + desc: System load average (1 minute) + AvailableProcessors: + metric: cpu.count + type: gauge + unit: "{cpu}" + desc: Number of processors available + ProcessCpuLoad: + metric: cpu.recent_utilization + type: gauge + unit: '1' + desc: Recent CPU utilization for JVM process (0.0 to 1.0) + OpenFileDescriptorCount: + metric: file_descriptor.count + type: gauge + unit: "{file_descriptor}" + desc: Number of open file descriptors + + - bean: java.lang:type=MemoryPool,name=* + type: gauge + unit: By + metricAttribute: + name: param(name) + mapping: + Usage.used: + metric: jvm.memory.pool.used + desc: Memory pool usage by generation + Usage.max: + metric: jvm.memory.pool.max + desc: Maximum memory pool size + CollectionUsage.used: + metric: jvm.memory.pool.used_after_last_gc + desc: Memory used after last GC +``` + + + + **NRDOT** is New Relic's supported distribution of the OpenTelemetry Collector, providing full New Relic support. For more information, see the [NRDOT Collector GitHub repository](https://github.com/newrelic/nrdot-collector-releases/tree/main/distributions/nrdot-collector). + + **1. 
Create `collector-configmap.yaml`** - OpenTelemetry Collector configuration: + +```yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: otel-collector-config + namespace: newrelic + labels: + app: otel-collector +data: + otel-collector-config.yaml: | + receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + + kafkametrics: + brokers: + - "kafka.kafka.svc.cluster.local:9092" + collection_interval: 30s + protocol_version: 2.0.0 + scrapers: + - brokers + - topics + - consumers + topic_match: "^[^_].*$" + metrics: + kafka.topic.min_insync_replicas: + enabled: true + kafka.topic.replication_factor: + enabled: true + kafka.partition.replicas: + enabled: false + kafka.partition.oldest_offset: + enabled: false + kafka.partition.current_offset: + enabled: false + + exporters: + otlp/newrelic: + endpoint: ${NEW_RELIC_OTLP_ENDPOINT} + tls: + insecure: false + sending_queue: + num_consumers: 12 + queue_size: 5000 + retry_on_failure: + enabled: true + compression: gzip + timeout: 30s + headers: + api-key: ${NEW_RELIC_LICENSE_KEY} + + processors: + batch/aggregation: + send_batch_size: 1024 + timeout: 30s + resource: + attributes: + - action: insert + key: kafka.cluster.name + value: my-kafka-cluster + transform/remove_broker_id: + metric_statements: + - context: resource + statements: + - delete_key(attributes, "broker.id") + transform/remove_extra_attributes: + metric_statements: + - context: resource + statements: + - delete_matching_keys(attributes, "^process\\..*") + - delete_matching_keys(attributes, "^telemetry\\..*") + - delete_key(attributes, "host.arch") + - delete_key(attributes, "os.description") + - delete_matching_keys(attributes, "^cloud\\..*") + - delete_key(attributes, "service.instance.id") where IsMatch(attributes["service.name"], "^unknown_service:") + - delete_key(attributes, "service.name") where IsMatch(attributes["service.name"], "^unknown_service:") + transform/des_units: + metric_statements: + - context: metric + statements: + - set(description, "") where description != "" + - set(unit, "") where unit != "" + filter/internal_topics: + metrics: + datapoint: + - 'attributes["topic"] != nil and IsMatch(attributes["topic"], "^__.*")' + filter/include_cluster_metrics: + metrics: + include: + match_type: regexp + metric_names: + - "kafka\\.partition\\.offline" + - "kafka\\.(leader|unclean)\\.election\\.rate" + - "kafka\\.partition\\.non_preferred_leader" + - "kafka\\.broker\\.fenced\\.count" + - "kafka\\.cluster\\.partition\\.count" + - "kafka\\.cluster\\.topic\\.count" + filter/exclude_cluster_metrics: + metrics: + exclude: + match_type: regexp + metric_names: + - "kafka\\.partition\\.offline" + - "kafka\\.(leader|unclean)\\.election\\.rate" + - "kafka\\.partition\\.non_preferred_leader" + - "kafka\\.broker\\.fenced\\.count" + - "kafka\\.cluster\\.partition\\.count" + - "kafka\\.cluster\\.topic\\.count" + cumulativetodelta: + metricstransform/kafka_topic_sum_aggregation: + transforms: + - include: kafka.partition.replicas_in_sync + action: insert + new_name: kafka.partition.replicas_in_sync.total + operations: + - action: aggregate_labels + label_set: [topic] + aggregation_type: sum + - include: kafka.partition.replicas + action: insert + new_name: kafka.partition.replicas.total + operations: + - action: aggregate_labels + label_set: [topic] + aggregation_type: sum + filter/remove_partition_level_replicas: + metrics: + exclude: + match_type: strict + metric_names: + - kafka.partition.replicas_in_sync + groupbyattrs/cluster: + keys: [kafka.cluster.name] + 
metricstransform/cluster_max: + transforms: + - include: "kafka\\.partition\\.offline|kafka\\.leader\\.election\\.rate|kafka\\.unclean\\.election\\.rate|kafka\\.partition\\.non_preferred_leader|kafka\\.broker\\.fenced\\.count|kafka\\.cluster\\.partition\\.count|kafka\\.cluster\\.topic\\.count" + match_type: regexp + action: update + operations: + - action: aggregate_labels + aggregation_type: max + label_set: [] + + service: + pipelines: + metrics/broker: + receivers: [otlp, kafkametrics] + processors: + - resource + - filter/exclude_cluster_metrics + - filter/internal_topics + - transform/remove_extra_attributes + - transform/des_units + - cumulativetodelta + - metricstransform/kafka_topic_sum_aggregation + - filter/remove_partition_level_replicas + - batch/aggregation + exporters: [otlp/newrelic] + metrics/cluster: + receivers: [otlp] + processors: + - resource + - filter/include_cluster_metrics + - transform/remove_broker_id + - transform/remove_extra_attributes + - transform/des_units + - cumulativetodelta + - groupbyattrs/cluster + - metricstransform/cluster_max + - batch/aggregation + exporters: [otlp/newrelic] + traces/apps: + receivers: [otlp] + processors: [resource, batch/aggregation] + exporters: [otlp/newrelic] + logs/apps: + receivers: [otlp] + processors: [resource, batch/aggregation] + exporters: [otlp/newrelic] +``` + + **2. Create `collector-deployment.yaml`** - Deployment with ServiceAccount and Service: + +```yaml +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: newrelic + labels: + app: otel-collector +--- +apiVersion: v1 +kind: Service +metadata: + name: otel-collector + namespace: newrelic + labels: + app: otel-collector +spec: + selector: + app: otel-collector + ports: + - name: otlp-grpc + port: 4317 + targetPort: 4317 + protocol: TCP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otel-collector + namespace: newrelic + labels: + app: otel-collector +spec: + replicas: 1 + selector: + matchLabels: + app: otel-collector + template: + metadata: + labels: + app: otel-collector + spec: + serviceAccountName: otel-collector + containers: + - name: otel-collector + image: newrelic/nrdot-collector:latest + command: + - "/nrdot-collector" + - "--config=/conf/otel-collector-config.yaml" + env: + - name: NEW_RELIC_LICENSE_KEY + valueFrom: + secretKeyRef: + name: newrelic-otlp-secret + key: NEW_RELIC_LICENSE_KEY + - name: NEW_RELIC_OTLP_ENDPOINT + valueFrom: + secretKeyRef: + name: newrelic-otlp-secret + key: NEW_RELIC_OTLP_ENDPOINT + - name: GOGC + value: "80" + ports: + - name: otlp-grpc + containerPort: 4317 + protocol: TCP + resources: + limits: + cpu: "1000m" + memory: "1Gi" + requests: + cpu: "200m" + memory: "512Mi" + volumeMounts: + - name: config + mountPath: /conf + volumes: + - name: config + configMap: + name: otel-collector-config + items: + - key: otel-collector-config.yaml + path: otel-collector-config.yaml +``` + + + + + + Use the community **OpenTelemetry Collector** for vendor-neutral deployment. + + **1. Create `collector-configmap.yaml`** - Same as NRDOT option above (configuration is identical) + + **2. 
Create `collector-deployment.yaml`** - Only the container image and command differ: + +```yaml +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: newrelic + labels: + app: otel-collector +--- +apiVersion: v1 +kind: Service +metadata: + name: otel-collector + namespace: newrelic + labels: + app: otel-collector +spec: + selector: + app: otel-collector + ports: + - name: otlp-grpc + port: 4317 + targetPort: 4317 + protocol: TCP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otel-collector + namespace: newrelic + labels: + app: otel-collector +spec: + replicas: 1 + selector: + matchLabels: + app: otel-collector + template: + metadata: + labels: + app: otel-collector + spec: + serviceAccountName: otel-collector + containers: + - name: otel-collector + image: otel/opentelemetry-collector-contrib:latest + command: + - "/otelcol-contrib" + - "--config=/conf/otel-collector-config.yaml" + env: + - name: NEW_RELIC_LICENSE_KEY + valueFrom: + secretKeyRef: + name: newrelic-otlp-secret + key: NEW_RELIC_LICENSE_KEY + - name: NEW_RELIC_OTLP_ENDPOINT + valueFrom: + secretKeyRef: + name: newrelic-otlp-secret + key: NEW_RELIC_OTLP_ENDPOINT + - name: GOGC + value: "80" + ports: + - name: otlp-grpc + containerPort: 4317 + protocol: TCP + resources: + limits: + cpu: "1000m" + memory: "1Gi" + requests: + cpu: "200m" + memory: "512Mi" + volumeMounts: + - name: config + mountPath: /conf + volumes: + - name: config + configMap: + name: otel-collector-config + items: + - key: otel-collector-config.yaml + path: otel-collector-config.yaml +``` + + + + + +**Deploy the manifests** + +```bash +# Create namespace if it doesn't exist +kubectl create namespace newrelic --dry-run=client -o yaml | kubectl apply -f - + +# Apply JMX ConfigMap to the Kafka namespace +kubectl apply -f kafka-jmx-config.yaml + +# Apply collector ConfigMap +kubectl apply -f collector-configmap.yaml + +# Apply Deployment and Service +kubectl apply -f collector-deployment.yaml +``` + +**Verify the deployment:** + +```bash +# Check pod status +kubectl get pods -n newrelic -l app=otel-collector + +# View logs to verify metrics are being received from broker pods +kubectl logs -n newrelic -l app=otel-collector --tail=50 +``` + + +
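Optionally, confirm the `otel-collector` service resolves from the Kafka namespace; the broker Java agent exports metrics to this address in the next step. A quick sketch using a one-off busybox pod:

```bash
kubectl run dns-test --namespace kafka --rm -it --restart=Never \
  --image=busybox:latest -- \
  nslookup otel-collector.newrelic.svc.cluster.local
```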
+
+ +
+ + + +### Configure Kafka StatefulSet for the Java agent [#configure-statefulset] + +Now that the collector is running, patch your Kafka StatefulSet to add an init container that downloads the OpenTelemetry Java agent JAR, then attach it to the Kafka broker JVM via `KAFKA_OPTS`. + +Add the following sections to your existing Kafka StatefulSet manifest: + +```yaml +spec: + template: + spec: + # 1. Init container: downloads OTel Java agent JAR before Kafka starts + initContainers: + - name: download-otel-agent + image: busybox:latest + command: + - sh + - -c + - | + wget -O /otel-agent/opentelemetry-javaagent.jar \ + https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar + volumeMounts: + - name: otel-agent + mountPath: /otel-agent + + containers: + - name: kafka + # 2. Attach OTel Java agent to the Kafka broker JVM + env: + - name: KAFKA_OPTS + value: >- + -javaagent:/otel-agent/opentelemetry-javaagent.jar + -Dotel.jmx.enabled=true + -Dotel.jmx.config=/jmx-config/kafka-jmx-config.yaml + -Dotel.resource.attributes=kafka.cluster.name=my-kafka-cluster + -Dotel.exporter.otlp.endpoint=http://otel-collector.newrelic.svc.cluster.local:4317 + -Dotel.exporter.otlp.protocol=grpc + -Dotel.metrics.exporter=otlp + -Dotel.instrumentation.runtime-telemetry.enabled=false + -Dotel.metric.export.interval=30000 + volumeMounts: + - name: otel-agent + mountPath: /otel-agent + - name: jmx-config + mountPath: /jmx-config + + # 3. Volumes: emptyDir for JAR, ConfigMap for JMX rules + volumes: + - name: otel-agent + emptyDir: {} + - name: jmx-config + configMap: + name: kafka-jmx-config # Deployed with the collector in the previous step +``` + + + The `kafka-jmx-config` ConfigMap was deployed with the collector in the previous step. The `otel.exporter.otlp.endpoint` value `http://otel-collector.newrelic.svc.cluster.local:4317` assumes the collector is deployed in the `newrelic` namespace with service name `otel-collector`. Update it to match your actual collector service DNS if different. + + +#### Configuration parameters + +The following table describes the key Java agent configuration parameters for Kafka StatefulSet: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Parameter | Description | Default Value |
|---|---|---|
| `-javaagent` | Attaches the OpenTelemetry Java agent to the Kafka broker JVM | None (Required) |
| `-Dotel.jmx.enabled` | Enables JMX metrics collection from the Kafka broker | `false` |
| `-Dotel.jmx.config` | Points to your custom JMX metrics configuration file (mounted from ConfigMap) | None (Required) |
| `-Dotel.resource.attributes` | Adds `kafka.cluster.name` metadata to all metrics for identification in New Relic | None (Optional) |
| `-Dotel.exporter.otlp.endpoint` | Points to the OpenTelemetry Collector service in your cluster | `http://localhost:4317` |
| `-Dotel.exporter.otlp.protocol` | Protocol used for OTLP communication with the collector | `grpc` |
| `-Dotel.metrics.exporter` | Specifies the exporter to use for sending metrics | `otlp` |
| `-Dotel.metric.export.interval` | Interval in milliseconds for exporting metrics to the collector | `60000` |
| `-Dotel.instrumentation.runtime-telemetry.enabled` | Enables or disables JVM runtime telemetry collection | `true` |

For complete configuration options, see the [Java agent configuration guide](https://opentelemetry.io/docs/zero-code/java/agent/configuration/).

Apply your updated StatefulSet and wait for the pods to roll:

```bash
kubectl apply -f kafka-statefulset.yaml
kubectl rollout status statefulset/kafka -n kafka
```
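Once the rollout completes, you can sanity-check that the agent attached. This is a sketch: the pod name `kafka-0` and namespace `kafka` assume the StatefulSet naming used throughout this guide, and the OpenTelemetry Java agent prints a startup banner tagged `otel.javaagent` when it loads:

```bash
# Look for the Java agent startup banner in a broker's logs
kubectl logs kafka-0 -n kafka | grep -i "otel.javaagent" | head

# Confirm the collector reports no export errors (use
# -l app.kubernetes.io/name=opentelemetry-collector for the Helm install)
kubectl logs -n newrelic -l app=otel-collector --tail=100 | grep -i error
```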
+ + + +### (Optional) Instrument producer or consumer applications [#instrument-apps] + + + **Language support**: Currently, only Java applications are supported for Kafka client instrumentation using the OpenTelemetry Java agent. + + +To collect application-level telemetry from your Kafka producer and consumer applications running in Kubernetes, add the OpenTelemetry Java agent to those application pods. + +Add an init container and environment variables to your application's deployment: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kafka-producer-app +spec: + template: + spec: + initContainers: + - name: download-otel-agent + image: busybox:latest + command: + - sh + - -c + - wget -O /otel-agent/opentelemetry-javaagent.jar https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar + volumeMounts: + - name: otel-agent + mountPath: /otel-agent + + containers: + - name: app + image: your-kafka-app:latest + env: + - name: JAVA_TOOL_OPTIONS + value: >- + -javaagent:/otel-agent/opentelemetry-javaagent.jar + -Dotel.service.name=order-process-service + -Dotel.resource.attributes=kafka.cluster.name=my-kafka-cluster + -Dotel.exporter.otlp.endpoint=http://otel-collector.newrelic.svc.cluster.local:4317 + -Dotel.exporter.otlp.protocol=grpc + -Dotel.metrics.exporter=otlp + -Dotel.traces.exporter=otlp + -Dotel.logs.exporter=otlp + -Dotel.instrumentation.kafka.experimental-span-attributes=true + -Dotel.instrumentation.messaging.experimental.receive-telemetry.enabled=true + -Dotel.instrumentation.kafka.producer-propagation.enabled=true + -Dotel.instrumentation.kafka.enabled=true + -Dotel.instrumentation.runtime-telemetry.enabled=false + volumeMounts: + - name: otel-agent + mountPath: /otel-agent + + volumes: + - name: otel-agent + emptyDir: {} +``` + +#### Configuration parameters + +The following table describes the key Java agent configuration parameters for Kafka producer/consumer applications: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Parameter | Description | Default Value |
|---|---|---|
| `-javaagent` | Attaches the OpenTelemetry Java agent to the application JVM | None (Required) |
| `-Dotel.service.name` | Unique name to identify your producer or consumer application in traces | `unknown_service:java` |
| `-Dotel.resource.attributes` | Adds `kafka.cluster.name` metadata to correlate with broker metrics | None (Optional) |
| `-Dotel.exporter.otlp.endpoint` | Points to the OpenTelemetry Collector service in your cluster | `http://localhost:4317` |
| `-Dotel.exporter.otlp.protocol` | Protocol used for OTLP communication with the collector | `grpc` |
| `-Dotel.metrics.exporter` | Exporter for application metrics | `otlp` |
| `-Dotel.traces.exporter` | Exporter for distributed traces | `otlp` |
| `-Dotel.logs.exporter` | Exporter for application logs | `otlp` |
| `-Dotel.instrumentation.kafka.experimental-span-attributes` | Enables experimental Kafka span attributes for enhanced tracing | `false` |
| `-Dotel.instrumentation.messaging.experimental.receive-telemetry.enabled` | Enables experimental receive telemetry for messaging systems | `false` |
| `-Dotel.instrumentation.kafka.producer-propagation.enabled` | Enables trace context propagation for Kafka producers | `true` |
| `-Dotel.instrumentation.kafka.enabled` | Enables Kafka client instrumentation | `true` |
| `-Dotel.instrumentation.runtime-telemetry.enabled` | Enables or disables JVM runtime telemetry collection | `true` |

The Java agent provides [out-of-the-box Kafka instrumentation](https://opentelemetry.io/docs/zero-code/java/spring-boot-starter/out-of-the-box-instrumentation/) with zero code changes, capturing request latencies, throughput metrics, error rates, and distributed traces. For advanced configuration, see the [Kafka instrumentation documentation](https://github.com/open-telemetry/opentelemetry-java-instrumentation/tree/main/instrumentation/kafka).
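As a quick check that the agent loaded in your application pods, look for the same `otel.javaagent` startup banner. The deployment name `kafka-producer-app` below is the hypothetical one from the example manifest above:

```bash
kubectl logs deployment/kafka-producer-app | grep -i "otel.javaagent" | head
```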
+ +
+ +
+ + + +Follow these steps to set up comprehensive Kafka monitoring. You'll install the Prometheus JMX Exporter on your broker pods and deploy a Collector to gather and send metrics to New Relic. + + + + + +### Before you begin [#prerequisites] + +Ensure you have: + +* A [New Relic account](https://newrelic.com/signup) with a +* A Kubernetes cluster with `kubectl` access +* A Kafka cluster deployed as a StatefulSet with a headless service (for stable pod DNS names) +* Ability to modify and redeploy the Kafka StatefulSet + + + + + + +### Create JMX metrics ConfigMap [#jmx-config] + +Create a ConfigMap containing the JMX Exporter configuration that defines which Kafka metrics to collect. This ConfigMap will be mounted into each Kafka broker pod. + +Save as `kafka-jmx-config.yaml`. Apply it to the namespace where Kafka is deployed: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: kafka-jmx-metrics + namespace: kafka +data: + kafka-metrics-config.yml: | + startDelaySeconds: 0 + lowercaseOutputName: true + lowercaseOutputLabelNames: true + + rules: + # Cluster-level controller metrics + - pattern: 'kafka.controller<>Value' + name: kafka_cluster_topic_count + type: GAUGE + + - pattern: 'kafka.controller<>Value' + name: kafka_cluster_partition_count + type: GAUGE + + - pattern: 'kafka.controller<>Value' + name: kafka_broker_fenced_count + type: GAUGE + + - pattern: 'kafka.controller<>Value' + name: kafka_partition_non_preferred_leader + type: GAUGE + + - pattern: 'kafka.controller<>Value' + name: kafka_partition_offline + type: GAUGE + + - pattern: 'kafka.controller<>Value' + name: kafka_controller_active_count + type: GAUGE + + # Broker-level replica metrics + - pattern: 'kafka.server<>Value' + name: kafka_partition_under_min_isr + type: GAUGE + + - pattern: 'kafka.server<>Value' + name: kafka_broker_leader_count + type: GAUGE + + - pattern: 'kafka.server<>Value' + name: kafka_partition_count + type: GAUGE + + - pattern: 'kafka.server<>Value' + name: kafka_partition_under_replicated + type: GAUGE + + - pattern: 'kafka.server<>Count' + name: kafka_isr_operation_count + type: COUNTER + labels: + operation: "shrink" + + - pattern: 'kafka.server<>Count' + name: kafka_isr_operation_count + type: COUNTER + labels: + operation: "expand" + + - pattern: 'kafka.server<>Value' + name: kafka_max_lag + type: GAUGE + + # Broker topic metrics (totals) + - pattern: 'kafka.server<>Count' + name: kafka_message_count + type: COUNTER + + - pattern: 'kafka.server<>Count' + name: kafka_request_count + type: COUNTER + labels: + type: "fetch" + + - pattern: 'kafka.server<>Count' + name: kafka_request_count + type: COUNTER + labels: + type: "produce" + + - pattern: 'kafka.server<>Count' + name: kafka_request_failed + type: COUNTER + labels: + type: "fetch" + + - pattern: 'kafka.server<>Count' + name: kafka_request_failed + type: COUNTER + labels: + type: "produce" + + - pattern: 'kafka.server<>Count' + name: kafka_network_io + type: COUNTER + labels: + direction: "in" + + - pattern: 'kafka.server<>Count' + name: kafka_network_io + type: COUNTER + labels: + direction: "out" + + # Per-topic metrics (only appear after traffic flows) + - pattern: 'kafka.server<>Count' + name: kafka_prod_msg_count + type: COUNTER + labels: + topic: "$1" + + - pattern: 'kafka.server<>Count' + name: kafka_topic_io + type: COUNTER + labels: + topic: "$1" + direction: "in" + + - pattern: 'kafka.server<>Count' + name: kafka_topic_io + type: COUNTER + labels: + topic: "$1" + direction: "out" + + # Request metrics + - pattern: 
'kafka.network<>99thPercentile' + name: kafka_request_time_99p + type: GAUGE + labels: + type: "$1" + + - pattern: 'kafka.network<>Value' + name: kafka_request_queue + type: GAUGE + + - pattern: 'kafka.server<>Value' + name: kafka_purgatory_size + type: GAUGE + labels: + type: "$1" + + # Controller stats + - pattern: 'kafka.controller<>Count' + name: kafka_leader_election_rate + type: COUNTER + + - pattern: 'kafka.controller<>Count' + name: kafka_unclean_election_rate + type: COUNTER + + # JVM Garbage Collection + - pattern: 'java.lang<>CollectionCount' + name: jvm_gc_collections_count + type: COUNTER + labels: + name: "$1" + + # JVM Memory + - pattern: 'java.langmax' + name: jvm_memory_heap_max + type: GAUGE + + - pattern: 'java.langused' + name: jvm_memory_heap_used + type: GAUGE + + # JVM Threading and System + - pattern: 'java.lang<>ThreadCount' + name: jvm_thread_count + type: GAUGE + + - pattern: 'java.lang<>SystemCpuLoad' + name: jvm_system_cpu_utilization + type: GAUGE + + # Broker uptime + - pattern: 'java.lang<>Uptime' + name: kafka_broker_uptime + type: GAUGE + + # Additional metrics — remove this section to reduce data ingest + + # Request latency: total count, 50th percentile, and average (99p kept above) + - pattern: 'kafka.network<>Count' + name: kafka_request_time_total + type: COUNTER + labels: + type: "$1" + + - pattern: 'kafka.network<>50thPercentile' + name: kafka_request_time_50p + type: GAUGE + labels: + type: "$1" + + - pattern: 'kafka.network<>Mean' + name: kafka_request_time_avg + type: GAUGE + labels: + type: "$1" + + # Log flush metrics + - pattern: 'kafka.log<>Count' + name: kafka_logs_flush_count + type: COUNTER + + - pattern: 'kafka.log<>50thPercentile' + name: kafka_logs_flush_time_50p + type: GAUGE + + - pattern: 'kafka.log<>99thPercentile' + name: kafka_logs_flush_time_99p + type: GAUGE + + # JVM GC elapsed time + - pattern: 'java.lang<>CollectionTime' + name: jvm_gc_collections_elapsed + type: COUNTER + labels: + name: "$1" + + # JVM Memory heap committed + - pattern: 'java.langcommitted' + name: jvm_memory_heap_committed + type: GAUGE + + # JVM class loading + - pattern: 'java.lang<>LoadedClassCount' + name: jvm_class_count + type: GAUGE + + # Additional JVM OS metrics + - pattern: 'java.lang<>SystemLoadAverage' + name: jvm_system_cpu_load_1m + type: GAUGE + + - pattern: 'java.lang<>AvailableProcessors' + name: jvm_cpu_count + type: GAUGE + + - pattern: 'java.lang<>ProcessCpuLoad' + name: jvm_cpu_recent_utilization + type: GAUGE + + - pattern: 'java.lang<>OpenFileDescriptorCount' + name: jvm_file_descriptor_count + type: GAUGE + + # JVM Memory Pool + - pattern: 'java.langused' + name: jvm_memory_pool_used + type: GAUGE + labels: + name: "$1" + + - pattern: 'java.langmax' + name: jvm_memory_pool_max + type: GAUGE + labels: + name: "$1" + + - pattern: 'java.langused' + name: jvm_memory_pool_used_after_last_gc + type: GAUGE + labels: + name: "$1" +``` + +#### Configuration parameters + +The following table describes the key JMX Exporter ConfigMap parameters: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Parameter | Description | Default Value |
| --- | --- | --- |
| `startDelaySeconds` | Delay before starting metrics collection (in seconds) | `0` |
| `lowercaseOutputName` | Convert metric names to lowercase | `false` |
| `lowercaseOutputLabelNames` | Convert label names to lowercase | `false` |
| `rules.pattern` | Regular expression to match JMX bean names | None (Required) |
| `rules.name` | Prometheus metric name for the matched pattern | None (Required) |
| `rules.type` | Prometheus metric type (GAUGE, COUNTER, HISTOGRAM, SUMMARY) | `GAUGE` |
| `rules.labels` | Additional labels to add to the metric | None (Optional) |
| `ConfigMap.name` | Name of the ConfigMap containing the JMX configuration | `kafka-jmx-metrics` |
| `ConfigMap.namespace` | Kubernetes namespace where the Kafka cluster is deployed | `kafka` |
**Customize metrics**: You can add or modify patterns by referencing the [Prometheus JMX Exporter examples](https://github.com/prometheus/jmx_exporter/tree/main/examples) and [Kafka MBean documentation](https://kafka.apache.org/documentation/#monitoring).

Apply the ConfigMap:

```bash
kubectl apply -f kafka-jmx-config.yaml
```
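To confirm the rules file is in place before wiring it into the brokers, you can inspect the ConfigMap; the name and namespace below are the example values from the manifest above:

```bash
# Confirm the ConfigMap exists and contains the rules file
kubectl get configmap kafka-jmx-metrics -n kafka
kubectl get configmap kafka-jmx-metrics -n kafka -o jsonpath='{.data.kafka-metrics-config\.yml}' | head -20
```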
+ + + +### Configure Kafka StatefulSet for JMX Exporter [#configure-statefulset] + +Patch your Kafka StatefulSet to add an init container that downloads the Prometheus JMX Exporter JAR, then attach it to the Kafka broker JVM via `KAFKA_OPTS`. + +Add the following sections to your existing Kafka StatefulSet manifest: + +```yaml +spec: + template: + spec: + # 1. Init container: downloads JMX Exporter JAR before Kafka starts + initContainers: + - name: download-jmx-exporter + image: busybox:latest + command: + - sh + - -c + - | + # Version 1.5.0 is the minimum required version. Check https://github.com/prometheus/jmx_exporter/releases/latest for newer releases. + JMX_EXPORTER_VERSION="1.5.0" + wget -O /prometheus-jmx/jmx_prometheus_javaagent.jar \ + "https://github.com/prometheus/jmx_exporter/releases/download/${JMX_EXPORTER_VERSION}/jmx_prometheus_javaagent-${JMX_EXPORTER_VERSION}.jar" + volumeMounts: + - name: prometheus-jmx + mountPath: /prometheus-jmx + + containers: + - name: kafka + # 2. Attach JMX Exporter as Java agent on port 9404 + env: + - name: KAFKA_OPTS + value: "-javaagent:/prometheus-jmx/jmx_prometheus_javaagent.jar=9404:/jmx-config/kafka-metrics-config.yml" + # 3. Expose port 9404 for Prometheus scraping + ports: + - name: jmx-metrics + containerPort: 9404 + protocol: TCP + volumeMounts: + - name: prometheus-jmx + mountPath: /prometheus-jmx + - name: jmx-config + mountPath: /jmx-config + + # 4. Volumes: emptyDir for JAR, ConfigMap for metrics config + volumes: + - name: prometheus-jmx + emptyDir: {} + - name: jmx-config + configMap: + name: kafka-jmx-metrics # Must match the ConfigMap name from Step 2 +``` + +#### Configuration parameters + +The following table describes the key JMX Exporter configuration parameters for Kafka StatefulSet: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Parameter | Description | Default Value |
| --- | --- | --- |
| `JMX_EXPORTER_VERSION` | Version of the Prometheus JMX Exporter JAR to download | `1.5.0` |
| `jmx_prometheus_javaagent.jar` | JMX Exporter JAR file path within the container | `/prometheus-jmx/jmx_prometheus_javaagent.jar` |
| `KAFKA_OPTS` | JVM options to attach the JMX Exporter as a Java agent with port and config | None (Required) |
| `containerPort` | Port where the JMX Exporter exposes Prometheus metrics | `9404` |
| `jmx-config path` | Path to the JMX metrics configuration file (mounted from the ConfigMap) | `/jmx-config/kafka-metrics-config.yml` |
| `prometheus-jmx volume` | EmptyDir volume for storing the downloaded JMX Exporter JAR | `emptyDir: {}` |
| `jmx-config volume` | ConfigMap volume containing the JMX metrics configuration | `kafka-jmx-metrics` |
| `download command` | Download URL pattern for JMX Exporter releases | `https://github.com/prometheus/jmx_exporter/releases/download/` |
+ +Apply your updated StatefulSet and wait for pods to roll: + +```bash +kubectl apply -f kafka-statefulset.yaml +kubectl rollout status statefulset/kafka -n kafka +``` + + + The init container approach used here follows standard Kubernetes conventions. No elevated host permissions are needed — the init container runs inside the pod and downloads the JAR into an `emptyDir` volume, which is accessible only to containers within that pod. + + +After the rollout completes, verify that metrics are exposed on each broker pod: + +```bash +# Replace kafka-0 and kafka with your pod name and namespace +kubectl exec -n kafka kafka-0 -- curl -s http://localhost:9404/metrics | grep kafka_ | head -20 +``` + + + **Multi-broker clusters**: The init container and `KAFKA_OPTS` configuration applies to all pods in the StatefulSet automatically. Verify each broker pod exposes metrics after the rollout. + + +
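If the metrics endpoint doesn't respond, one quick check is whether the `-javaagent` flag actually reached the broker JVM. This sketch assumes the example pod and namespace names used above, and that the broker JVM runs as PID 1 in the container:

```bash
# Print the broker process command line and look for the JMX Exporter agent flag
kubectl exec -n kafka kafka-0 -- sh -c 'tr "\0" " " < /proc/1/cmdline; echo' | grep -o 'javaagent[^ ]*'
```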
+ + + +### Deploy OpenTelemetry Collector [#deploy-opentelemetry-collector] + +Deploy the OpenTelemetry Collector in your cluster. The Collector scrapes Kafka broker pods using static DNS targets and listens on port `4317` for OTLP data from instrumented applications. + + + **Least-privilege approach**: The default configuration uses static headless service DNS targets and requires no Kubernetes API access — no ClusterRole or RBAC is needed. If your cluster scales dynamically and you want broker pods discovered automatically, an optional Kubernetes pod autodiscovery configuration is provided in each collapser below. Autodiscovery requires a ClusterRole with pod `list`/`watch` permissions. Use static DNS unless you have a specific need for dynamic discovery. + + + + + Helm install (recommended) + Manifest install + + + + + +The Helm installation method is the recommended approach for deploying OpenTelemetry Collector in Kubernetes. + +1. Create New Relic credentials secret + + + + ```bash + kubectl create secret generic newrelic-otlp-secret \ + --namespace newrelic \ + --from-literal=NEW_RELIC_LICENSE_KEY='your-license-key-here' \ + --from-literal=NEW_RELIC_OTLP_ENDPOINT='https://otlp.nr-data.net:4317' + ``` + + + + ```bash + kubectl create secret generic newrelic-otlp-secret \ + --namespace newrelic \ + --from-literal=NEW_RELIC_LICENSE_KEY='your-license-key-here' \ + --from-literal=NEW_RELIC_OTLP_ENDPOINT='https://otlp.eu01.nr-data.net:4317' + ``` + + + + + For other endpoint configurations, see [Configure your OTLP endpoint](/docs/opentelemetry/best-practices/opentelemetry-otlp/#configure-endpoint-port-protocol). + + +2. Create values.yaml with collector configuration + +Both NRDOT and OpenTelemetry Collectors use identical configuration. Choose your preferred collector image: + + + + **NRDOT** is New Relic's supported distribution of the OpenTelemetry Collector, providing full New Relic support. For more information, see the [NRDOT Collector GitHub repository](https://github.com/newrelic/nrdot-collector-releases/tree/main/distributions/nrdot-collector). 
+ + Create `values.yaml` with the following content: + +```yaml +# Deployment mode +mode: deployment +replicaCount: 1 + +# Use NRDOT collector image +image: + repository: newrelic/nrdot-collector + tag: "latest" + pullPolicy: Always + +# Service account (no ClusterRole needed for static scraping) +serviceAccount: + create: true + name: otel-collector + +# Pod security context +podSecurityContext: + runAsNonRoot: true + runAsUser: 10001 + +# Container security context +securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + +# Resource limits +resources: + requests: + memory: 512Mi + cpu: 250m + limits: + memory: 1Gi + cpu: 500m + +# Load environment variables from secret +extraEnvsFrom: + - secretRef: + name: newrelic-otlp-secret + +# Disable unused default ports +ports: + jaeger-compact: + enabled: false + jaeger-thrift: + enabled: false + jaeger-grpc: + enabled: false + zipkin: + enabled: false + +# OpenTelemetry Collector Configuration +config: + receivers: + # Disable default receivers not needed in NRDOT + jaeger: null + zipkin: null + + # OTLP receiver for application traces, metrics, and logs + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + + # Kafka metrics receiver for consumer lag, topic, and partition metrics + kafkametrics/cluster: + brokers: + - "kafka.kafka.svc.cluster.local:9092" + collection_interval: 30s + protocol_version: 2.0.0 + scrapers: + - brokers + - topics + - consumers + topic_match: "^[^_].*$" + metrics: + kafka.topic.min_insync_replicas: + enabled: true + kafka.topic.replication_factor: + enabled: true + kafka.partition.replicas: + enabled: false + kafka.partition.oldest_offset: + enabled: false + kafka.partition.current_offset: + enabled: false + + # Prometheus receiver scrapes JMX metrics from each broker pod via headless service DNS + prometheus/kafka-jmx: + config: + scrape_configs: + - job_name: 'kafka-jmx-metrics' + metrics_path: /metrics + scrape_interval: 30s + static_configs: + - targets: + - 'kafka-0.kafka-headless.kafka.svc.cluster.local:9404' + - 'kafka-1.kafka-headless.kafka.svc.cluster.local:9404' + - 'kafka-2.kafka-headless.kafka.svc.cluster.local:9404' + relabel_configs: + # Extract broker ordinal from pod DNS name as broker.id + - source_labels: [__address__] + target_label: broker.id + regex: '[^-]+-(\d+)\..+:\d+' + replacement: '$1' + + exporters: + otlp/backend: + endpoint: ${NEW_RELIC_OTLP_ENDPOINT} + tls: + insecure: false + sending_queue: + num_consumers: 12 + queue_size: 5000 + retry_on_failure: + enabled: true + headers: + api-key: ${NEW_RELIC_LICENSE_KEY} + + processors: + batch/export: + send_batch_size: 1024 + timeout: 30s + + memory_limiter: + limit_percentage: 80 + spike_limit_percentage: 30 + check_interval: 1s + + transform/metric-naming: + metric_statements: + - context: metric + statements: + - replace_pattern(name, "_", ".") + - replace_pattern(name, "\\.load\\.1", ".load_1") + - replace_pattern(name, "\\.recent\\.util", ".recent_util") + - replace_pattern(name, "file\\.descriptor\\.count", "file_descriptor.count") + - replace_pattern(name, "\\.memory\\.pool\\.used\\.bytes$", ".memory.pool.used") + - replace_pattern(name, "\\.memory\\.pool\\.max\\.bytes$", ".memory.pool.max") + - replace_pattern(name, "\\.memory\\.pool\\.collection\\.used\\.bytes$", ".memory.pool.used_after_last_gc") + - replace_pattern(name, "\\.non\\.preferred\\.leader", ".non_preferred_leader") + - replace_pattern(name, "\\.under\\.min\\.isr", ".under_min_isr") + - replace_pattern(name, 
"\\.under\\.replicated", ".under_replicated") + - replace_pattern(name, "\\.total$", "") where name != "kafka.request.time.total" + - context: datapoint + statements: + - set(attributes["name"], attributes["gc"]) where attributes["gc"] != nil + - delete_key(attributes, "gc") where attributes["gc"] != nil + - set(attributes["name"], attributes["pool"]) where attributes["pool"] != nil + - delete_key(attributes, "pool") where attributes["pool"] != nil + + resource/cluster-name: + attributes: + - key: kafka.cluster.name + value: my-kafka-cluster + action: upsert + + transform/remove_broker_id: + metric_statements: + - context: datapoint + statements: + - delete_key(attributes, "broker.id") + + filter/scrape-overhead: + metrics: + exclude: + match_type: regexp + metric_names: + - "^jmx_.*" + - "^process_.*" + - "^jvm_buffer_pool_.*" + - "^jvm_threads_.*" + - "^jvm_classes_.*" + - "^jvm_memory_(heap|non_heap)_(committed|init|max|used)_bytes$" + - "^jvm_compilation_.*" + - "^jvm_(runtime|info).*" + - "^jvm_memory_pool_(allocated_bytes_total|committed_bytes|init_bytes|collection_(committed|init|max)_bytes)$" + + filter/include_cluster_metrics: + metrics: + include: + match_type: regexp + metric_names: + - "^kafka\\.partition\\.offline$" + - "^kafka\\.(leader|unclean)\\.election\\.rate$" + - "^kafka\\.partition\\.non_preferred_leader$" + - "^kafka\\.broker\\.fenced\\.count$" + - "^kafka\\.cluster\\.partition\\.count$" + - "^kafka\\.cluster\\.topic\\.count$" + + filter/exclude_cluster_metrics: + metrics: + exclude: + match_type: regexp + metric_names: + - "^kafka\\.partition\\.offline$" + - "^kafka\\.(leader|unclean)\\.election\\.rate$" + - "^kafka\\.partition\\.non_preferred_leader$" + - "^kafka\\.broker\\.fenced\\.count$" + - "^kafka\\.cluster\\.partition\\.count$" + - "^kafka\\.cluster\\.topic\\.count$" + + transform/remove_attributes: + metric_statements: + - context: metric + statements: + - set(description, "") where description != "" + - set(unit, "") where unit != "" + - context: resource + statements: + - delete_key(attributes, "server.address") + - delete_key(attributes, "server.port") + - delete_key(attributes, "service.instance.id") + - delete_key(attributes, "host.name") + - delete_key(attributes, "k8s.pod.uid") + - delete_key(attributes, "url.scheme") + + metricstransform/topic-aggregation: + transforms: + - include: kafka.partition.replicas_in_sync + action: insert + new_name: kafka.partition.replicas_in_sync.total + operations: + - action: aggregate_labels + label_set: [topic] + aggregation_type: sum + - include: kafka.partition.replicas + action: insert + new_name: kafka.partition.replicas.total + operations: + - action: aggregate_labels + label_set: [topic] + aggregation_type: sum + + filter/exclude_partition_replicas_metric: + metrics: + exclude: + match_type: strict + metric_names: + - kafka.partition.replicas_in_sync + + filter/internal_topics: + metrics: + datapoint: + - 'attributes["topic"] != nil and IsMatch(attributes["topic"], "^__.*")' + + cumulativetodelta: + + groupbyattrs/cluster: + keys: [kafka.cluster.name] + + metricstransform/cluster_max: + transforms: + - include: "kafka\\.partition\\.offline|kafka\\.leader\\.election\\.rate|kafka\\.unclean\\.election\\.rate|kafka\\.partition\\.non_preferred_leader|kafka\\.broker\\.fenced\\.count|kafka\\.cluster\\.partition\\.count|kafka\\.cluster\\.topic\\.count" + match_type: regexp + action: update + operations: + - action: aggregate_labels + aggregation_type: max + label_set: [] + + service: + pipelines: + # Application traces 
from instrumented Kafka clients and apps + traces: + receivers: [otlp] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] + + # Application metrics from instrumented Kafka clients and apps + metrics: + receivers: [otlp] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] + + # Application logs from instrumented Kafka clients and apps + logs: + receivers: [otlp] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] + + # Broker-level metrics from Prometheus JMX scraping + metrics/broker: + receivers: + - prometheus/kafka-jmx + processors: + - resource/cluster-name + - filter/scrape-overhead + - transform/metric-naming + - transform/remove_attributes + - filter/exclude_cluster_metrics + - memory_limiter + - cumulativetodelta + - batch/export + exporters: + - otlp/backend + + # Cluster-level metrics from Prometheus JMX scraping + metrics/cluster/prometheus: + receivers: + - prometheus/kafka-jmx + processors: + - resource/cluster-name + - filter/scrape-overhead + - transform/metric-naming + - transform/remove_attributes + - filter/include_cluster_metrics + - transform/remove_broker_id + - memory_limiter + - cumulativetodelta + - groupbyattrs/cluster + - metricstransform/cluster_max + - batch/export + exporters: + - otlp/backend + + # Cluster-level metrics from Kafka metrics receiver (consumer lag, topics, partitions) + metrics/cluster/kafkametrics: + receivers: + - kafkametrics/cluster + processors: + - resource/cluster-name + - filter/internal_topics + - transform/remove_attributes + - metricstransform/topic-aggregation + - filter/exclude_partition_replicas_metric + - memory_limiter + - cumulativetodelta + - batch/export + exporters: + - otlp/backend +``` + +#### Configuration parameters + +The following table describes the key configuration parameters for Kafka monitoring with Prometheus JMX Exporter: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Parameter | Description | Default Value |
| --- | --- | --- |
| `NEW_RELIC_OTLP_ENDPOINT` | New Relic OTLP endpoint for your region | None (Required) |
| `NEW_RELIC_LICENSE_KEY` | Your New Relic license key for authentication | None (Required) |
| `kafkametrics.brokers` | Kafka bootstrap service DNS for consumer lag metrics | `kafka.kafka.svc.cluster.local:9092` |
| `prometheus.job_name` | Job name for the Prometheus scrape configuration | `kafka-jmx-metrics` |
| `prometheus.targets` | List of broker pod DNS names with the JMX Exporter port | `<pod>.<headless-service>.<namespace>.svc.cluster.local:9404` |
| `kafka.cluster.name` | Label value that identifies your Kafka cluster in New Relic | `my-kafka-cluster` |
| `resource.kafka.cluster.name` | Resource processor cluster name for metric identification | `my-kafka-cluster` |
| `replicaCount` | Number of collector replicas to deploy | `1` |
| `resources.requests.memory` | Minimum memory allocation for the collector | `512Mi` |
| `resources.requests.cpu` | Minimum CPU allocation for the collector | `250m` |
| `resources.limits.memory` | Maximum memory limit for the collector | `1Gi` |
| `resources.limits.cpu` | Maximum CPU limit for the collector | `500m` |
**Alternative: Kubernetes pod autodiscovery**

Instead of static DNS targets, you can use Kubernetes pod discovery to find broker pods automatically. This is useful for dynamic scaling because you don't need to update the target list.

Add the `clusterRole` section and replace the `prometheus/kafka-jmx` receiver in `values.yaml` with:

```yaml
# Add RBAC for Kubernetes pod discovery
clusterRole:
  create: true
  rules:
    - apiGroups: [""]
      resources: ["pods", "nodes"]
      verbs: ["get", "list", "watch"]

# In config.receivers:
prometheus/kafka-jmx:
  config:
    scrape_configs:
      - job_name: 'kafka-jmx-metrics'
        metrics_path: /metrics
        scrape_interval: 30s
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - kafka
        relabel_configs:
          # Filter for Kafka broker pods by app label
          - source_labels: [__meta_kubernetes_pod_label_app]
            action: keep
            regex: kafka

          # Only scrape running pods
          - source_labels: [__meta_kubernetes_pod_phase]
            action: keep
            regex: Running

          # Extract broker ordinal from pod name as broker.id
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: broker.id
            regex: '.*-(\d+)$'
            replacement: '$1'

          # Set scrape target to pod IP on port 9404
          - source_labels: [__meta_kubernetes_pod_ip]
            target_label: __address__
            replacement: '$1:9404'
```
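If you switch to the autodiscovery variant, you can sanity-check the resulting RBAC before the first scrape. This assumes the example service account (`otel-collector`) and namespaces used in this guide:

```bash
# Both commands should print "yes" once the ClusterRole is created and bound
kubectl auth can-i list pods --as=system:serviceaccount:newrelic:otel-collector -n kafka
kubectl auth can-i watch pods --as=system:serviceaccount:newrelic:otel-collector -n kafka
```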
+ + + Use the community **OpenTelemetry Collector** for maximum flexibility and vendor-neutral deployment. + + Create `values.yaml` with the following content (identical configuration, different image): + +```yaml +# Deployment mode +mode: deployment +replicaCount: 1 + +# Use contrib image for kafkametrics receiver +image: + repository: otel/opentelemetry-collector-contrib + tag: "latest" + pullPolicy: Always + +# Service account (no ClusterRole needed for static scraping) +serviceAccount: + create: true + name: otel-collector + +# Pod security context +podSecurityContext: + runAsNonRoot: true + runAsUser: 10001 + +# Container security context +securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + +# Resource limits +resources: + requests: + memory: 512Mi + cpu: 250m + limits: + memory: 1Gi + cpu: 500m + +# Load environment variables from secret +extraEnvsFrom: + - secretRef: + name: newrelic-otlp-secret + +# Disable unused default ports +ports: + jaeger-compact: + enabled: false + jaeger-thrift: + enabled: false + jaeger-grpc: + enabled: false + zipkin: + enabled: false + +# OpenTelemetry Collector Configuration +config: + receivers: + # OTLP receiver for application traces, metrics, and logs + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + + # Kafka metrics receiver for consumer lag, topic, and partition metrics + kafkametrics/cluster: + brokers: + - "kafka.kafka.svc.cluster.local:9092" + collection_interval: 30s + protocol_version: 2.0.0 + scrapers: + - brokers + - topics + - consumers + topic_match: "^[^_].*$" + metrics: + kafka.topic.min_insync_replicas: + enabled: true + kafka.topic.replication_factor: + enabled: true + kafka.partition.replicas: + enabled: false + kafka.partition.oldest_offset: + enabled: false + kafka.partition.current_offset: + enabled: false + + # Prometheus receiver scrapes JMX metrics from each broker pod via headless service DNS + prometheus/kafka-jmx: + config: + scrape_configs: + - job_name: 'kafka-jmx-metrics' + metrics_path: /metrics + scrape_interval: 30s + static_configs: + - targets: + - 'kafka-0.kafka-headless.kafka.svc.cluster.local:9404' + - 'kafka-1.kafka-headless.kafka.svc.cluster.local:9404' + - 'kafka-2.kafka-headless.kafka.svc.cluster.local:9404' + relabel_configs: + - source_labels: [__address__] + target_label: broker.id + regex: '[^-]+-(\d+)\..+:\d+' + replacement: '$1' + + exporters: + otlp/backend: + endpoint: ${NEW_RELIC_OTLP_ENDPOINT} + tls: + insecure: false + sending_queue: + num_consumers: 12 + queue_size: 5000 + retry_on_failure: + enabled: true + headers: + api-key: ${NEW_RELIC_LICENSE_KEY} + + processors: + batch/export: + send_batch_size: 1024 + timeout: 30s + memory_limiter: + limit_percentage: 80 + spike_limit_percentage: 30 + check_interval: 1s + transform/metric-naming: + metric_statements: + - context: metric + statements: + - replace_pattern(name, "_", ".") + - replace_pattern(name, "\\.load\\.1", ".load_1") + - replace_pattern(name, "\\.recent\\.util", ".recent_util") + - replace_pattern(name, "file\\.descriptor\\.count", "file_descriptor.count") + - replace_pattern(name, "\\.memory\\.pool\\.used\\.bytes$", ".memory.pool.used") + - replace_pattern(name, "\\.memory\\.pool\\.max\\.bytes$", ".memory.pool.max") + - replace_pattern(name, "\\.memory\\.pool\\.collection\\.used\\.bytes$", ".memory.pool.used_after_last_gc") + - replace_pattern(name, "\\.non\\.preferred\\.leader", ".non_preferred_leader") + - replace_pattern(name, "\\.under\\.min\\.isr", 
".under_min_isr") + - replace_pattern(name, "\\.under\\.replicated", ".under_replicated") + - replace_pattern(name, "\\.total$", "") where name != "kafka.request.time.total" + - context: datapoint + statements: + - set(attributes["name"], attributes["gc"]) where attributes["gc"] != nil + - delete_key(attributes, "gc") where attributes["gc"] != nil + - set(attributes["name"], attributes["pool"]) where attributes["pool"] != nil + - delete_key(attributes, "pool") where attributes["pool"] != nil + resource/cluster-name: + attributes: + - key: kafka.cluster.name + value: my-kafka-cluster + action: upsert + transform/remove_broker_id: + metric_statements: + - context: datapoint + statements: + - delete_key(attributes, "broker.id") + filter/scrape-overhead: + metrics: + exclude: + match_type: regexp + metric_names: + - "^jmx_.*" + - "^process_.*" + - "^jvm_buffer_pool_.*" + - "^jvm_threads_.*" + - "^jvm_classes_.*" + - "^jvm_memory_(heap|non_heap)_(committed|init|max|used)_bytes$" + - "^jvm_compilation_.*" + - "^jvm_(runtime|info).*" + - "^jvm_memory_pool_(allocated_bytes_total|committed_bytes|init_bytes|collection_(committed|init|max)_bytes)$" + filter/include_cluster_metrics: + metrics: + include: + match_type: regexp + metric_names: + - "^kafka\\.partition\\.offline$" + - "^kafka\\.(leader|unclean)\\.election\\.rate$" + - "^kafka\\.partition\\.non_preferred_leader$" + - "^kafka\\.broker\\.fenced\\.count$" + - "^kafka\\.cluster\\.partition\\.count$" + - "^kafka\\.cluster\\.topic\\.count$" + filter/exclude_cluster_metrics: + metrics: + exclude: + match_type: regexp + metric_names: + - "^kafka\\.partition\\.offline$" + - "^kafka\\.(leader|unclean)\\.election\\.rate$" + - "^kafka\\.partition\\.non_preferred_leader$" + - "^kafka\\.broker\\.fenced\\.count$" + - "^kafka\\.cluster\\.partition\\.count$" + - "^kafka\\.cluster\\.topic\\.count$" + transform/remove_attributes: + metric_statements: + - context: metric + statements: + - set(description, "") where description != "" + - set(unit, "") where unit != "" + - context: resource + statements: + - delete_key(attributes, "server.address") + - delete_key(attributes, "server.port") + - delete_key(attributes, "service.instance.id") + - delete_key(attributes, "host.name") + - delete_key(attributes, "k8s.pod.uid") + - delete_key(attributes, "url.scheme") + metricstransform/topic-aggregation: + transforms: + - include: kafka.partition.replicas_in_sync + action: insert + new_name: kafka.partition.replicas_in_sync.total + operations: + - action: aggregate_labels + label_set: [topic] + aggregation_type: sum + - include: kafka.partition.replicas + action: insert + new_name: kafka.partition.replicas.total + operations: + - action: aggregate_labels + label_set: [topic] + aggregation_type: sum + filter/exclude_partition_replicas_metric: + metrics: + exclude: + match_type: strict + metric_names: + - kafka.partition.replicas_in_sync + filter/internal_topics: + metrics: + datapoint: + - 'attributes["topic"] != nil and IsMatch(attributes["topic"], "^__.*")' + cumulativetodelta: + groupbyattrs/cluster: + keys: [kafka.cluster.name] + metricstransform/cluster_max: + transforms: + - include: "kafka\\.partition\\.offline|kafka\\.leader\\.election\\.rate|kafka\\.unclean\\.election\\.rate|kafka\\.partition\\.non_preferred_leader|kafka\\.broker\\.fenced\\.count|kafka\\.cluster\\.partition\\.count|kafka\\.cluster\\.topic\\.count" + match_type: regexp + action: update + operations: + - action: aggregate_labels + aggregation_type: max + label_set: [] + + service: + pipelines: + 
traces: + receivers: [otlp] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] + metrics: + receivers: [otlp] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] + logs: + receivers: [otlp] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] + metrics/broker: + receivers: + - prometheus/kafka-jmx + processors: + - resource/cluster-name + - filter/scrape-overhead + - transform/metric-naming + - transform/remove_attributes + - filter/exclude_cluster_metrics + - memory_limiter + - cumulativetodelta + - batch/export + exporters: + - otlp/backend + metrics/cluster/prometheus: + receivers: + - prometheus/kafka-jmx + processors: + - resource/cluster-name + - filter/scrape-overhead + - transform/metric-naming + - transform/remove_attributes + - filter/include_cluster_metrics + - transform/remove_broker_id + - memory_limiter + - cumulativetodelta + - groupbyattrs/cluster + - metricstransform/cluster_max + - batch/export + exporters: + - otlp/backend + metrics/cluster/kafkametrics: + receivers: + - kafkametrics/cluster + processors: + - resource/cluster-name + - filter/internal_topics + - transform/remove_attributes + - metricstransform/topic-aggregation + - filter/exclude_partition_replicas_metric + - memory_limiter + - cumulativetodelta + - batch/export + exporters: + - otlp/backend +``` + + + + **Alternative: Kubernetes pod autodiscovery** + + Instead of static DNS targets, you can use Kubernetes pod discovery to automatically find broker pods. This is useful for dynamic scaling without needing to update the target list. + + Replace the `prometheus/kafka-jmx` section in `values.yaml` with: + + ```yaml + # Add RBAC for Kubernetes pod discovery (add before config:) + clusterRole: + create: true + rules: + - apiGroups: [""] + resources: ["pods", "nodes"] + verbs: ["get", "list", "watch"] + + # In config.receivers: + prometheus/kafka-jmx: + config: + scrape_configs: + - job_name: 'kafka-jmx-metrics' + metrics_path: /metrics + scrape_interval: 30s + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kafka + relabel_configs: + # Filter for Kafka broker pods by app label + - source_labels: [__meta_kubernetes_pod_label_app] + action: keep + regex: kafka + + # Only scrape running pods + - source_labels: [__meta_kubernetes_pod_phase] + action: keep + regex: Running + + # Extract broker ordinal from pod name as broker.id + - source_labels: [__meta_kubernetes_pod_name] + target_label: broker.id + regex: '.*-(\d+)$' + replacement: '$1' + + # Set scrape target to pod IP on port 9404 + - source_labels: [__meta_kubernetes_pod_ip] + target_label: __address__ + replacement: '$1:9404' + ``` + + + +
+ +For advanced configuration options, refer to these receiver documentation pages: +- [Prometheus receiver documentation](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) +- [Kafka metrics receiver documentation](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/kafkametricsreceiver) + +3. Install OpenTelemetry Collector with Helm + +```bash +helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts +helm upgrade kafka-monitoring open-telemetry/opentelemetry-collector \ + --install \ + --namespace newrelic \ + --create-namespace \ + -f values.yaml +``` + +4. Verify the deployment: + +```bash +# Check pod status +kubectl get pods -n newrelic -l app.kubernetes.io/name=opentelemetry-collector + +# View logs to verify metrics collection +kubectl logs -n newrelic -l app.kubernetes.io/name=opentelemetry-collector --tail=50 +``` + +You should see logs indicating successful scraping from Kafka broker pods on port `9404`. + +
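If the scrape targets show as down, verify that the static headless-service DNS names resolve inside the cluster. A minimal check, assuming the example broker and service names used above:

```bash
# Resolve one broker's headless-service DNS name from a throwaway pod
kubectl run dns-check --rm -it --restart=Never --image=busybox:1.36 -- \
  nslookup kafka-0.kafka-headless.kafka.svc.cluster.local
```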
+ + + +The manifest installation method provides direct control over Kubernetes resources without using Helm. + +1. Create New Relic credentials secret + + + + ```bash + kubectl create secret generic newrelic-otlp-secret \ + --namespace newrelic \ + --from-literal=NEW_RELIC_LICENSE_KEY='your-license-key-here' \ + --from-literal=NEW_RELIC_OTLP_ENDPOINT='https://otlp.nr-data.net:4317' + ``` + + + + ```bash + kubectl create secret generic newrelic-otlp-secret \ + --namespace newrelic \ + --from-literal=NEW_RELIC_LICENSE_KEY='your-license-key-here' \ + --from-literal=NEW_RELIC_OTLP_ENDPOINT='https://otlp.eu01.nr-data.net:4317' + ``` + + + + + For other endpoint configurations, see [Configure your OTLP endpoint](/docs/opentelemetry/best-practices/opentelemetry-otlp/#configure-endpoint-port-protocol). + + +2. Create manifest files + +Both NRDOT and OpenTelemetry Collectors use identical configuration. Only the container image differs. + + + + **NRDOT** is New Relic's supported distribution of the OpenTelemetry Collector, providing full New Relic support. For more information, see the [NRDOT Collector GitHub repository](https://github.com/newrelic/nrdot-collector-releases/tree/main/distributions/nrdot-collector). + + **1. Create `collector-configmap.yaml`** - OpenTelemetry Collector configuration: + +```yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: otel-collector-config + namespace: newrelic + labels: + app: otel-collector +data: + otel-collector-config.yaml: | + receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + + kafkametrics/cluster: + brokers: + - "kafka.kafka.svc.cluster.local:9092" + collection_interval: 30s + protocol_version: 2.0.0 + scrapers: + - brokers + - topics + - consumers + topic_match: "^[^_].*$" + metrics: + kafka.topic.min_insync_replicas: + enabled: true + kafka.topic.replication_factor: + enabled: true + kafka.partition.replicas: + enabled: false + kafka.partition.oldest_offset: + enabled: false + kafka.partition.current_offset: + enabled: false + + prometheus/kafka-jmx: + config: + scrape_configs: + - job_name: 'kafka-jmx-metrics' + metrics_path: /metrics + scrape_interval: 30s + static_configs: + - targets: + - 'kafka-0.kafka-headless.kafka.svc.cluster.local:9404' + - 'kafka-1.kafka-headless.kafka.svc.cluster.local:9404' + - 'kafka-2.kafka-headless.kafka.svc.cluster.local:9404' + labels: + kafka.cluster.name: 'my-kafka-cluster' + relabel_configs: + - source_labels: [__address__] + target_label: broker.id + regex: '[^-]+-(\d+)\..+:\d+' + replacement: '$1' + + exporters: + otlp/backend: + endpoint: ${NEW_RELIC_OTLP_ENDPOINT} + tls: + insecure: false + sending_queue: + num_consumers: 12 + queue_size: 5000 + retry_on_failure: + enabled: true + headers: + api-key: ${NEW_RELIC_LICENSE_KEY} + + processors: + batch/export: + send_batch_size: 1024 + timeout: 30s + memory_limiter: + limit_percentage: 80 + spike_limit_percentage: 30 + check_interval: 1s + transform/metric-naming: + metric_statements: + - context: metric + statements: + - replace_pattern(name, "_", ".") + - replace_pattern(name, "\\.load\\.1", ".load_1") + - replace_pattern(name, "\\.recent\\.util", ".recent_util") + - replace_pattern(name, "file\\.descriptor\\.count", "file_descriptor.count") + - replace_pattern(name, "\\.memory\\.pool\\.used\\.bytes$", ".memory.pool.used") + - replace_pattern(name, "\\.memory\\.pool\\.max\\.bytes$", ".memory.pool.max") + - replace_pattern(name, "\\.memory\\.pool\\.collection\\.used\\.bytes$", ".memory.pool.used_after_last_gc") + - 
replace_pattern(name, "\\.non\\.preferred\\.leader", ".non_preferred_leader") + - replace_pattern(name, "\\.under\\.min\\.isr", ".under_min_isr") + - replace_pattern(name, "\\.under\\.replicated", ".under_replicated") + - replace_pattern(name, "\\.total$", "") where name != "kafka.request.time.total" + - context: datapoint + statements: + - set(attributes["name"], attributes["gc"]) where attributes["gc"] != nil + - delete_key(attributes, "gc") where attributes["gc"] != nil + - set(attributes["name"], attributes["pool"]) where attributes["pool"] != nil + - delete_key(attributes, "pool") where attributes["pool"] != nil + resource/cluster-name: + attributes: + - key: kafka.cluster.name + value: my-kafka-cluster + action: upsert + transform/remove_broker_id: + metric_statements: + - context: datapoint + statements: + - delete_key(attributes, "broker.id") + filter/scrape-overhead: + metrics: + exclude: + match_type: regexp + metric_names: + - "^jmx_.*" + - "^process_.*" + - "^jvm_buffer_pool_.*" + - "^jvm_threads_.*" + - "^jvm_classes_.*" + - "^jvm_memory_(heap|non_heap)_(committed|init|max|used)_bytes$" + - "^jvm_compilation_.*" + - "^jvm_(runtime|info).*" + - "^jvm_memory_pool_(allocated_bytes_total|committed_bytes|init_bytes|collection_(committed|init|max)_bytes)$" + filter/include_cluster_metrics: + metrics: + include: + match_type: regexp + metric_names: + - "^kafka\\.partition\\.offline$" + - "^kafka\\.(leader|unclean)\\.election\\.rate$" + - "^kafka\\.partition\\.non_preferred_leader$" + - "^kafka\\.broker\\.fenced\\.count$" + - "^kafka\\.cluster\\.partition\\.count$" + - "^kafka\\.cluster\\.topic\\.count$" + filter/exclude_cluster_metrics: + metrics: + exclude: + match_type: regexp + metric_names: + - "^kafka\\.partition\\.offline$" + - "^kafka\\.(leader|unclean)\\.election\\.rate$" + - "^kafka\\.partition\\.non_preferred_leader$" + - "^kafka\\.broker\\.fenced\\.count$" + - "^kafka\\.cluster\\.partition\\.count$" + - "^kafka\\.cluster\\.topic\\.count$" + transform/remove_attributes: + metric_statements: + - context: metric + statements: + - set(description, "") where description != "" + - set(unit, "") where unit != "" + - context: resource + statements: + - delete_key(attributes, "server.address") + - delete_key(attributes, "server.port") + - delete_key(attributes, "service.instance.id") + - delete_key(attributes, "host.name") + - delete_key(attributes, "k8s.pod.uid") + - delete_key(attributes, "url.scheme") + metricstransform/topic-aggregation: + transforms: + - include: kafka.partition.replicas_in_sync + action: insert + new_name: kafka.partition.replicas_in_sync.total + operations: + - action: aggregate_labels + label_set: [topic] + aggregation_type: sum + - include: kafka.partition.replicas + action: insert + new_name: kafka.partition.replicas.total + operations: + - action: aggregate_labels + label_set: [topic] + aggregation_type: sum + filter/exclude_partition_replicas_metric: + metrics: + exclude: + match_type: strict + metric_names: + - kafka.partition.replicas_in_sync + filter/internal_topics: + metrics: + datapoint: + - 'attributes["topic"] != nil and IsMatch(attributes["topic"], "^__.*")' + cumulativetodelta: + groupbyattrs/cluster: + keys: [kafka.cluster.name] + metricstransform/cluster_max: + transforms: + - include: "kafka\\.partition\\.offline|kafka\\.leader\\.election\\.rate|kafka\\.unclean\\.election\\.rate|kafka\\.partition\\.non_preferred_leader|kafka\\.broker\\.fenced\\.count|kafka\\.cluster\\.partition\\.count|kafka\\.cluster\\.topic\\.count" + match_type: regexp + 
action: update + operations: + - action: aggregate_labels + aggregation_type: max + label_set: [] + + service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] + metrics: + receivers: [otlp] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] + logs: + receivers: [otlp] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] + metrics/broker: + receivers: [prometheus/kafka-jmx] + processors: + - resource/cluster-name + - filter/scrape-overhead + - transform/metric-naming + - transform/remove_attributes + - filter/exclude_cluster_metrics + - memory_limiter + - cumulativetodelta + - batch/export + exporters: [otlp/backend] + metrics/cluster/prometheus: + receivers: [prometheus/kafka-jmx] + processors: + - resource/cluster-name + - filter/scrape-overhead + - transform/metric-naming + - transform/remove_attributes + - filter/include_cluster_metrics + - transform/remove_broker_id + - memory_limiter + - cumulativetodelta + - groupbyattrs/cluster + - metricstransform/cluster_max + - batch/export + exporters: [otlp/backend] + metrics/cluster/kafkametrics: + receivers: [kafkametrics/cluster] + processors: + - resource/cluster-name + - filter/internal_topics + - transform/remove_attributes + - metricstransform/topic-aggregation + - filter/exclude_partition_replicas_metric + - memory_limiter + - cumulativetodelta + - batch/export + exporters: [otlp/backend] +``` + + **2. Create `collector-deployment.yaml`** - OpenTelemetry Collector deployment with ServiceAccount: + +```yaml +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: newrelic + labels: + app: otel-collector +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otel-collector + namespace: newrelic + labels: + app: otel-collector +spec: + replicas: 1 + selector: + matchLabels: + app: otel-collector + template: + metadata: + labels: + app: otel-collector + spec: + serviceAccountName: otel-collector + containers: + - name: otel-collector + image: newrelic/nrdot-collector:latest + command: + - "/nrdot-collector" + - "--config=/conf/otel-collector-config.yaml" + env: + - name: NEW_RELIC_LICENSE_KEY + valueFrom: + secretKeyRef: + name: newrelic-otlp-secret + key: NEW_RELIC_LICENSE_KEY + - name: NEW_RELIC_OTLP_ENDPOINT + valueFrom: + secretKeyRef: + name: newrelic-otlp-secret + key: NEW_RELIC_OTLP_ENDPOINT + - name: GOGC + value: "80" + ports: + - name: otlp-grpc + containerPort: 4317 + protocol: TCP + resources: + limits: + cpu: "1000m" + memory: "1Gi" + requests: + cpu: "200m" + memory: "512Mi" + volumeMounts: + - name: config + mountPath: /conf + volumes: + - name: config + configMap: + name: otel-collector-config + items: + - key: otel-collector-config.yaml + path: otel-collector-config.yaml +--- +apiVersion: v1 +kind: Service +metadata: + name: otel-collector + namespace: newrelic + labels: + app: otel-collector +spec: + selector: + app: otel-collector + ports: + - name: otlp-grpc + port: 4317 + targetPort: 4317 + protocol: TCP +``` + + + + + + Use the community **OpenTelemetry Collector** for vendor-neutral deployment. + + **1. Create `collector-configmap.yaml`** - Same as NRDOT option above (configuration is identical) + + **2. 
Create `collector-deployment.yaml`** - Only the container image differs: + +```yaml +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: newrelic + labels: + app: otel-collector +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otel-collector + namespace: newrelic + labels: + app: otel-collector +spec: + replicas: 1 + selector: + matchLabels: + app: otel-collector + template: + metadata: + labels: + app: otel-collector + spec: + serviceAccountName: otel-collector + containers: + - name: otel-collector + image: otel/opentelemetry-collector-contrib:latest + command: + - "/otelcol-contrib" + - "--config=/conf/otel-collector-config.yaml" + env: + - name: NEW_RELIC_LICENSE_KEY + valueFrom: + secretKeyRef: + name: newrelic-otlp-secret + key: NEW_RELIC_LICENSE_KEY + - name: NEW_RELIC_OTLP_ENDPOINT + valueFrom: + secretKeyRef: + name: newrelic-otlp-secret + key: NEW_RELIC_OTLP_ENDPOINT + - name: GOGC + value: "80" + ports: + - name: otlp-grpc + containerPort: 4317 + protocol: TCP + resources: + limits: + cpu: "1000m" + memory: "1Gi" + requests: + cpu: "200m" + memory: "512Mi" + volumeMounts: + - name: config + mountPath: /conf + volumes: + - name: config + configMap: + name: otel-collector-config + items: + - key: otel-collector-config.yaml + path: otel-collector-config.yaml +--- +apiVersion: v1 +kind: Service +metadata: + name: otel-collector + namespace: newrelic + labels: + app: otel-collector +spec: + selector: + app: otel-collector + ports: + - name: otlp-grpc + port: 4317 + targetPort: 4317 + protocol: TCP +``` + + + + + +For advanced configuration options, refer to these receiver documentation pages: +- [Prometheus receiver documentation](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) +- [Kafka metrics receiver documentation](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/kafkametricsreceiver) + +3. Deploy the manifests + +```bash +# Create namespace if it doesn't exist +kubectl create namespace newrelic --dry-run=client -o yaml | kubectl apply -f - + +# Apply ConfigMap +kubectl apply -f collector-configmap.yaml + +# Apply Deployment (includes ServiceAccount) +kubectl apply -f collector-deployment.yaml +``` + +4. Verify the deployment: + +```bash +# Check pod status +kubectl get pods -n newrelic -l app=otel-collector + +# View logs to verify metrics collection +kubectl logs -n newrelic -l app=otel-collector --tail=50 +``` + +You should see logs indicating successful scraping from Kafka broker pods on port `9404`. + + +
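Before pointing applications at the collector, you can also confirm its OTLP gRPC port is reachable in-cluster. A minimal TCP check, assuming the service name and namespace from the manifests above:

```bash
# TCP connectivity check against the collector's OTLP gRPC port
kubectl run otlp-check --rm -it --restart=Never --image=busybox:1.36 -- \
  nc -zv otel-collector.newrelic.svc.cluster.local 4317
```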
+
+ +
+ + + +### (Optional) Instrument producer or consumer applications [#instrument-apps] + + + **Language support**: Java applications support out-of-the-box Kafka client instrumentation using the OpenTelemetry Java agent. + + +To collect application-level telemetry from your Kafka producer and consumer applications, use the OpenTelemetry Java agent with an init container: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kafka-producer-app +spec: + template: + spec: + initContainers: + - name: download-java-agent + image: busybox:latest + command: + - sh + - -c + - | + wget -O /otel-auto-instrumentation/opentelemetry-javaagent.jar \ + https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar + volumeMounts: + - name: otel-auto-instrumentation + mountPath: /otel-auto-instrumentation + + containers: + - name: app + image: your-kafka-app:latest + env: + - name: JAVA_TOOL_OPTIONS + value: >- + -javaagent:/otel-auto-instrumentation/opentelemetry-javaagent.jar + -Dotel.service.name=my-kafka-app + -Dotel.resource.attributes=kafka.cluster.name=my-kafka-cluster + -Dotel.exporter.otlp.endpoint=http://otel-collector.newrelic.svc.cluster.local:4317 + -Dotel.exporter.otlp.protocol=grpc + -Dotel.metrics.exporter=otlp + -Dotel.traces.exporter=otlp + -Dotel.logs.exporter=otlp + -Dotel.instrumentation.kafka.experimental-span-attributes=true + -Dotel.instrumentation.messaging.experimental.receive-telemetry.enabled=true + -Dotel.instrumentation.kafka.producer-propagation.enabled=true + -Dotel.instrumentation.kafka.enabled=true + -Dotel.instrumentation.runtime-telemetry.enabled=false + volumeMounts: + - name: otel-auto-instrumentation + mountPath: /otel-auto-instrumentation + + volumes: + - name: otel-auto-instrumentation + emptyDir: {} +``` + +#### Configuration parameters + +The following table describes the key Java agent configuration parameters for Kafka producer/consumer applications: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Parameter | Description | Default Value |
| --- | --- | --- |
| `-javaagent` | Attaches the OpenTelemetry Java agent to the application JVM | None (Required) |
| `-Dotel.service.name` | Unique name to identify your producer or consumer application in traces | `unknown_service:java` |
| `-Dotel.resource.attributes` | Adds `kafka.cluster.name` metadata to correlate with broker metrics | None (Optional) |
| `-Dotel.exporter.otlp.endpoint` | Points to the OpenTelemetry Collector service in your cluster | `http://localhost:4317` |
| `-Dotel.exporter.otlp.protocol` | Protocol used for OTLP communication with the collector | `grpc` |
| `-Dotel.metrics.exporter` | Exporter for application metrics | `otlp` |
| `-Dotel.traces.exporter` | Exporter for distributed traces | `otlp` |
| `-Dotel.logs.exporter` | Exporter for application logs | `otlp` |
| `-Dotel.instrumentation.kafka.experimental-span-attributes` | Enables experimental Kafka span attributes for enhanced tracing | `false` |
| `-Dotel.instrumentation.messaging.experimental.receive-telemetry.enabled` | Enables experimental receive telemetry for messaging systems | `false` |
| `-Dotel.instrumentation.kafka.producer-propagation.enabled` | Enables trace context propagation for Kafka producers | `true` |
| `-Dotel.instrumentation.kafka.enabled` | Enables Kafka client instrumentation | `true` |
| `-Dotel.instrumentation.runtime-telemetry.enabled` | Enables or disables JVM runtime telemetry collection | `true` |
+ + +The Java agent provides [out-of-the-box Kafka instrumentation](https://opentelemetry.io/docs/zero-code/java/spring-boot-starter/out-of-the-box-instrumentation/) with zero code changes, capturing request latencies, throughput metrics, error rates, and distributed traces. For advanced configuration, see the [Kafka instrumentation documentation](https://github.com/open-telemetry/opentelemetry-java-instrumentation/tree/main/instrumentation/kafka). + +
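Once an instrumented producer or consumer is handling traffic, a quick NRQL query confirms that spans are arriving; `my-kafka-app` is the example service name from the manifest above:

```sql
FROM Span SELECT count(*) WHERE service.name = 'my-kafka-app' SINCE 30 minutes ago
```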
+ + + +### (Optional) Forward Kafka broker logs [#forward-logs] + +To collect Kafka broker logs and send them to New Relic, add a filelog receiver to your collector configuration. + + + + Add the following to your `values.yaml` collector configuration: + + 1. Add to receivers section: + ```yaml + receivers: + # ... existing receivers ... + + # File log receiver for Kafka broker logs + filelog/kafka_broker_0: + include: + - /var/log/kafka/server.log + start_at: end + multiline: + line_start_pattern: '^\[' + resource: + broker.id: "0" + kafka.cluster.name: ${env:KAFKA_CLUSTER_NAME} + ``` + + 2. Add logs pipeline to service section: + ```yaml + service: + pipelines: + # ... existing pipelines ... + + logs/broker: + receivers: [filelog/kafka_broker_0] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] + ``` + +#### Configuration parameters + +The following table describes the key filelog receiver configuration parameters for Kafka broker log collection: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Parameter | Description | Default Value |
| --- | --- | --- |
| `include` | File path(s) to Kafka broker log files within the container | `/var/log/kafka/server.log` |
| `start_at` | Where to start reading when first encountering a file | `end` |
| `multiline.line_start_pattern` | Regex pattern to identify the start of a new log entry | `^\[` |
| `resource.broker.id` | Broker ID to correlate logs with specific broker metrics | `"0"` |
| `resource.kafka.cluster.name` | Cluster name to group logs with broker metrics in New Relic | `${env:KAFKA_CLUSTER_NAME}` |
| `receivers` | List of filelog receivers in the logs pipeline | `[filelog/kafka_broker_0]` |
| `processors` | Processing steps for log entries before export | `[memory_limiter, batch/export]` |
| `exporters` | Where to send the collected log data | `[otlp/backend]` |
**Configuration notes:**
* Update `/var/log/kafka/server.log` to your actual Kafka log path inside the broker pod
* The `broker.id` resource attribute correlates logs with specific broker metrics and entities
* For multiple brokers, create separate `filelog` receivers (e.g., `filelog/kafka_broker_1`, `filelog/kafka_broker_2`) with their respective broker IDs, as shown in the sketch below
* The `multiline` pattern assumes logs start with `[` — adjust if your log format differs
* Consider log volume and collection costs before enabling log forwarding
* For complete configuration options, see the [filelog receiver documentation](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver)

After updating `values.yaml`, upgrade the Helm release (use the release name from the install step, `kafka-monitoring` in this guide):

```bash
helm upgrade kafka-monitoring open-telemetry/opentelemetry-collector \
  --namespace newrelic \
  --values values.yaml
```
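As a sketch of the per-broker receivers mentioned in the notes above, a second receiver for broker `1` would look like this (repeat per broker):

```yaml
# Hypothetical second receiver for broker 1; mirror the broker-0 receiver
filelog/kafka_broker_1:
  include:
    - /var/log/kafka/server.log
  start_at: end
  multiline:
    line_start_pattern: '^\['
  resource:
    broker.id: "1"
    kafka.cluster.name: ${env:KAFKA_CLUSTER_NAME}
```

Then list every receiver in the logs pipeline, for example `receivers: [filelog/kafka_broker_0, filelog/kafka_broker_1]`.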
+ + + Your Kafka broker logs will appear in two places: + * **Broker entities**: Navigate to the Kafka broker entity in New Relic to see logs correlated with that specific broker + * **Logs UI**: Query all Kafka logs using the [Logs UI](/docs/logs/ui-data/use-logs-ui/) with filters like `kafka.cluster.name = 'my-cluster'` + + You can also query your logs with NRQL: + + ```sql + FROM Log SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' + ``` + +
+ +
+ +
+ +
+
+
+ +## Find your data [#find-data] + +After a few minutes, your Kafka data should appear in New Relic. See [Find your data](/docs/opentelemetry/integrations/kafka/find-and-query-data) for detailed instructions on exploring your Kafka data across different views in the New Relic UI. + +**Metrics** + +Broker, topic, partition, consumer group, and JVM metrics are stored in the `Metric` event type. Replace `my-kafka-cluster` with your `KAFKA_CLUSTER_NAME` value: + +```sql +FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' SINCE 30 minutes ago +``` + + +**Logs** + +Application logs from producer and consumer services instrumented with the OpenTelemetry Java agent are stored in the `Log` event type: + +```sql +FROM Log SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' SINCE 30 minutes ago +``` + +**Traces** + +Producer and consumer spans, including per-message `publish` and `receive` operations across topics, are stored in the `Span` event type: + +```sql +FROM Span SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' SINCE 30 minutes ago +``` + + +## Example [#example] + +A complete working example with Kafka StatefulSet manifests, Helm values, OTel Collector configuration, and sample producer/consumer applications is available in the [New Relic OpenTelemetry Examples repository](https://github.com/newrelic/newrelic-opentelemetry-examples/tree/main/other-examples/collector/kafka/k8s-self-managed). + +## Troubleshooting [#troubleshooting] + + + + Run these commands first to verify your setup. Use the results to identify which specific troubleshooting section to follow. + + **Check if collector pod is running**: + + For manifest installs: + ```bash + kubectl get pods -n newrelic -l app=otel-collector + kubectl logs -n newrelic -l app=otel-collector --tail=50 + ``` + + For Helm installs (`helm upgrade ... kafka-monitoring`): + ```bash + kubectl get pods -n newrelic -l app.kubernetes.io/name=opentelemetry-collector + kubectl logs -n newrelic -l app.kubernetes.io/name=opentelemetry-collector --tail=50 + ``` + + **Check if Kafka broker pods are running with the Java agent**: + ```bash + # List broker pods + kubectl get pods -n kafka -l app=kafka + + # Check env vars on a broker pod (should see KAFKA_OPTS with javaagent) + kubectl exec -n kafka kafka-0 -- env | grep KAFKA_OPTS + + # Check if init container completed successfully + kubectl describe pod -n kafka kafka-0 | grep -A5 "Init Containers" + ``` + + **Verify the otel-agent volume is populated**: + ```bash + kubectl exec -n kafka kafka-0 -- ls -lh /otel-agent/ + ``` + + **Test connectivity from broker pod to collector service**: + ```bash + kubectl exec -n kafka kafka-0 -- nc -zv otel-collector.newrelic.svc.cluster.local 4317 && echo "Port reachable" || echo "Cannot reach collector" + ``` + + + + **Enable collector debug logs**: Add detailed logging to troubleshoot configuration issues. + + In your ConfigMap (`collector-configmap.yaml`), add to the `service` section: + ```yaml + service: + telemetry: + logs: + level: "debug" + ``` + + Then apply the updated ConfigMap and restart the collector deployment: + ```bash + kubectl apply -f collector-configmap.yaml + kubectl rollout restart deployment/otel-collector -n newrelic + ``` + + **Add debug exporter**: View metrics in collector logs before sending to New Relic. 
The processor and exporter names differ by monitoring method: + + **Java agent method**: + ```yaml + exporters: + debug: + verbosity: detailed + + otlp/newrelic: + endpoint: https://otlp.nr-data.net:4317 + headers: + api-key: ${env:NEW_RELIC_LICENSE_KEY} + compression: gzip + timeout: 30s + + service: + pipelines: + metrics/broker: + receivers: [otlp, kafkametrics] + processors: [resource, filter/exclude_cluster_metrics, filter/internal_topics, transform/remove_extra_attributes, transform/des_units, cumulativetodelta, metricstransform/kafka_topic_sum_aggregation, filter/remove_partition_level_replicas, batch/aggregation] + exporters: [debug, otlp/newrelic] + + metrics/cluster: + receivers: [otlp] + processors: [resource, filter/include_cluster_metrics, transform/remove_broker_id, transform/remove_extra_attributes, transform/des_units, cumulativetodelta, groupbyattrs/cluster, metricstransform/cluster_max, batch/aggregation] + exporters: [debug, otlp/newrelic] + ``` + + **Prometheus JMX Exporter method**: + ```yaml + exporters: + debug: + verbosity: detailed + + otlp/backend: + endpoint: ${NEW_RELIC_OTLP_ENDPOINT} + headers: + api-key: ${NEW_RELIC_LICENSE_KEY} + + service: + pipelines: + metrics/broker: + receivers: [prometheus/kafka-jmx] + processors: [resource/cluster-name, filter/scrape-overhead, transform/metric-naming, transform/remove_attributes, filter/exclude_cluster_metrics, memory_limiter, cumulativetodelta, batch/export] + exporters: [debug, otlp/backend] + + metrics/cluster/prometheus: + receivers: [prometheus/kafka-jmx] + processors: [resource/cluster-name, filter/scrape-overhead, transform/metric-naming, transform/remove_attributes, filter/include_cluster_metrics, transform/remove_broker_id, memory_limiter, cumulativetodelta, groupbyattrs/cluster, metricstransform/cluster_max, batch/export] + exporters: [debug, otlp/backend] + + metrics/cluster/kafkametrics: + receivers: [kafkametrics/cluster] + processors: [resource/cluster-name, filter/internal_topics, transform/remove_attributes, metricstransform/topic-aggregation, filter/exclude_partition_replicas_metric, memory_limiter, cumulativetodelta, batch/export] + exporters: [debug, otlp/backend] + ``` + + **Important**: Remove the debug exporter in production to avoid log overflow. + + + + **First, run the [Initial system checks](#initial-checks)** to verify the collector pod and broker pods are healthy. + + **Check collector logs for errors** (use the label matching your install method — see [Initial system checks](#initial-checks)): + ```bash + # Manifest + kubectl logs -n newrelic -l app=otel-collector --tail=100 | grep -i "error\|fail\|refuse" + + # Helm + kubectl logs -n newrelic -l app.kubernetes.io/name=opentelemetry-collector --tail=100 | grep -i "error\|fail\|refuse" + ``` + + **Verify the collector Service exists and has the right port**: + ```bash + # Manifest + kubectl get svc otel-collector -n newrelic + + # Helm + kubectl get svc -n newrelic -l app.kubernetes.io/name=opentelemetry-collector + ``` + + Ensure port `4317` is exposed as a ClusterIP service. + + + + **First, run the [Initial system checks](#initial-checks)** to verify the Java agent is attached to broker pods. 
+ + **Check broker pod logs for Java agent initialization**: + ```bash + kubectl logs -n kafka kafka-0 --tail=100 | grep -i "otel\|jmx" + ``` + + **Verify KAFKA_OPTS is set correctly on broker pods**: + ```bash + kubectl exec -n kafka kafka-0 -- env | grep KAFKA_OPTS + ``` + + This should show `-javaagent:/otel-agent/opentelemetry-javaagent.jar` and all `-Dotel.*` parameters. Verify: + - `-Dotel.jmx.enabled=true` + - `-Dotel.jmx.config=/jmx-config/kafka-jmx-config.yaml` + - `-Dotel.exporter.otlp.endpoint=http://otel-collector.newrelic.svc.cluster.local:4317` + + **Verify JMX ConfigMap is mounted**: + ```bash + kubectl exec -n kafka kafka-0 -- ls -lh /jmx-config/ + kubectl exec -n kafka kafka-0 -- cat /jmx-config/kafka-jmx-config.yaml + ``` + + **Check collector logs for incoming JMX metrics**: + ```bash + kubectl logs -n newrelic -l app=otel-collector --tail=100 | grep -i "broker.id\|kafka\|jmx" + ``` + + + + **First, run the [Initial system checks](#initial-checks)** to verify the collector Service is reachable from broker pods. + + **Check for DNS resolution**: + ```bash + kubectl exec -n kafka kafka-0 -- nslookup otel-collector.newrelic.svc.cluster.local + ``` + + **Check collector logs for OTLP errors**: + ```bash + kubectl logs -n newrelic -l app=otel-collector --tail=100 | grep -i "connection refused\|context deadline exceeded\|failed to connect" + ``` + + **Verify OTLP receiver is listening on all interfaces**: + + Ensure the ConfigMap has `endpoint: "0.0.0.0:4317"` (not `127.0.0.1`) in the `otlp` receiver: + ```yaml + receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + ``` + + + + **Monitor collector pod memory**: + ```bash + # Manifest + kubectl top pod -n newrelic -l app=otel-collector + + # Helm + kubectl top pod -n newrelic -l app.kubernetes.io/name=opentelemetry-collector + ``` + + **Reduce monitored topics**: + ```yaml + receivers: + kafkametrics: + brokers: ["kafka-0.kafka-headless.kafka.svc.cluster.local:9092"] + collection_interval: 30s + scrapers: + - brokers + - topics + - consumers + topic_match: "^(important-topic-1|important-topic-2)$" + ``` + + **Reduce collection frequency**: Increase intervals to collect less often + ```yaml + receivers: + kafkametrics: + collection_interval: 60s + ``` + + For JMX metrics from the Java agent, update `KAFKA_OPTS` in the StatefulSet: + ```yaml + - name: KAFKA_OPTS + value: >- + ... + -Dotel.metric.export.interval=60000 + ``` + + **Add a memory limiter**: + + Java agent method: + ```yaml + processors: + memory_limiter: + check_interval: 1s + limit_mib: 512 + spike_limit_mib: 128 + + service: + pipelines: + metrics/broker: + processors: [memory_limiter, resource, filter/exclude_cluster_metrics, filter/internal_topics, transform/remove_extra_attributes, transform/des_units, cumulativetodelta, metricstransform/kafka_topic_sum_aggregation, filter/remove_partition_level_replicas, batch/aggregation] + ... + ``` + + Prometheus JMX Exporter method: + ```yaml + processors: + memory_limiter: + check_interval: 1s + limit_mib: 512 + spike_limit_mib: 128 + + service: + pipelines: + metrics/broker: + processors: [memory_limiter, resource/cluster-name, filter/scrape-overhead, transform/metric-naming, transform/remove_attributes, filter/exclude_cluster_metrics, cumulativetodelta, batch/export] + ... 
+ ``` + + After changes, apply the updated ConfigMap and restart the collector: + ```bash + kubectl apply -f collector-configmap.yaml + kubectl rollout restart deployment/otel-collector -n newrelic + ``` + + + +## Next steps [#next-steps] + +* [Explore Kafka metrics](/docs/opentelemetry/integrations/kafka/metrics-reference) - View the complete metrics reference +* [Create custom dashboards](/docs/query-your-data/explore-query-data/dashboards/introduction-dashboards) - Build visualizations for your Kafka data +* [Set up alerts](/docs/opentelemetry/integrations/kafka/find-and-query-data#alerts) - Monitor critical metrics like consumer lag and under-replicated partitions + diff --git a/src/content/docs/opentelemetry/integrations/kafka/kubernetes-strimzi.mdx b/src/content/docs/opentelemetry/integrations/kafka/kubernetes-strimzi.mdx index 34cc1759f2a..3b430b9189b 100644 --- a/src/content/docs/opentelemetry/integrations/kafka/kubernetes-strimzi.mdx +++ b/src/content/docs/opentelemetry/integrations/kafka/kubernetes-strimzi.mdx @@ -30,7 +30,7 @@ Follow these steps to set up monitoring for your Kafka cluster: -### Before you begin +### Before you begin [#prerequisites] Ensure you have: * A [New Relic account](https://newrelic.com/signup) with a @@ -47,7 +47,7 @@ Configure your Strimzi Kafka cluster to expose Kafka JMX metrics via the Prometh **Create JMX metrics ConfigMap** -Create a ConfigMap with JMX Exporter patterns that define which Kafka metrics to collect. Save as `kafka-jmx-metrics-config.yaml`: +Create a ConfigMap with JMX Exporter patterns that define which Kafka metrics to collect. Save as `kafka-jmx-config.yaml`: ```yaml apiVersion: v1 @@ -337,7 +337,7 @@ data: Apply the ConfigMap: ```bash -kubectl apply -f kafka-jmx-metrics-config.yaml +kubectl apply -f kafka-jmx-config.yaml ``` **Update Kafka cluster to use JMX Exporter** @@ -414,7 +414,7 @@ Create a Kubernetes secret containing your New Relic license key and OTLP endpoi kubectl create secret generic newrelic-otlp-secret \ --namespace newrelic \ --from-literal=NEW_RELIC_LICENSE_KEY='your-license-key-here' \ - --from-literal=NEW_RELIC_OTLP_ENDPOINT='https://eu01-otlp.nr-data.net:4317' + --from-literal=NEW_RELIC_OTLP_ENDPOINT='https://otlp.eu01.nr-data.net:4317' ``` @@ -487,7 +487,7 @@ extraEnvsFrom: - secretRef: name: newrelic-otlp-secret -# Disable default ports +# Disable unused default ports ports: jaeger-compact: enabled: false @@ -501,7 +501,7 @@ ports: # OpenTelemetry Collector Configuration config: receivers: - # Disable default receivers not available in NRDOT experimental + # Disable default receivers not needed in NRDOT jaeger: null zipkin: null @@ -735,11 +735,10 @@ config: service: pipelines: - # Override default traces pipeline to only use receivers that exist in NRDOT - traces: - receivers: [otlp] - processors: [memory_limiter, batch] - exporters: [debug] + # Suppress default pipelines — only custom Kafka metrics pipelines are used + traces: null + logs: null + metrics: null # Broker-level metrics from Prometheus JMX scraping metrics/broker: @@ -793,7 +792,7 @@ config: - otlp/backend ``` - **Customize for your cluster**: Update the TODO items in the above helm configure file: + **Customize for your cluster**: Update the TODO items in `values.yaml`: * TODO#1: Replace with your Kafka bootstrap service * TODO#2: Replace with the namespace where your Kafka cluster is deployed * TODO#3: Replace with your Strimzi Kafka cluster name followed by `-kafka` @@ -1153,7 +1152,7 @@ config: - otlp/backend ``` - **Customize for your 
cluster**: Update the TODO items in the above helm configure file: + **Customize for your cluster**: Update the TODO items in `values.yaml`: * TODO#1: Replace with your Kafka bootstrap service * TODO#2: Replace with the namespace where your Kafka cluster is deployed * TODO#3: Replace with your Strimzi Kafka cluster name followed by `-kafka` @@ -1223,7 +1222,7 @@ Create a Kubernetes secret containing your New Relic license key and OTLP endpoi kubectl create secret generic newrelic-otlp-secret \ --namespace newrelic \ --from-literal=NEW_RELIC_LICENSE_KEY='your-license-key-here' \ - --from-literal=NEW_RELIC_OTLP_ENDPOINT='https://eu01-otlp.nr-data.net:4317' + --from-literal=NEW_RELIC_OTLP_ENDPOINT='https://otlp.eu01.nr-data.net:4317' ``` @@ -1732,14 +1731,14 @@ You should see logs indicating successful scraping from Kafka brokers on port 94 ### (Optional) Instrument producer or consumer applications [#instrument-apps] - **Language support**: Java applications support out-of-the-box Kafka client instrumentation using the OpenTelemetry Java Agent. + **Language support**: Java applications support out-of-the-box Kafka client instrumentation using the OpenTelemetry Java agent. -To collect application-level telemetry from your Kafka producer and consumer applications, use the OpenTelemetry Java Agent. +To collect application-level telemetry from your Kafka producer and consumer applications, use the OpenTelemetry Java agent. #### Instrument your Kafka application -Use an init container to download the OpenTelemetry Java Agent at runtime: +Use an init container to download the OpenTelemetry Java agent at runtime: ```yaml apiVersion: apps/v1 @@ -1850,7 +1849,7 @@ spec: -The Java Agent provides [out-of-the-box Kafka instrumentation](https://opentelemetry.io/docs/zero-code/java/spring-boot-starter/out-of-the-box-instrumentation/) with zero code changes, capturing: +The Java agent provides [out-of-the-box Kafka instrumentation](https://opentelemetry.io/docs/zero-code/java/spring-boot-starter/out-of-the-box-instrumentation/) with zero code changes, capturing: * Request latencies * Throughput metrics * Error rates @@ -1862,16 +1861,40 @@ For advanced configuration, see the [Kafka instrumentation documentation](https: -## Find your data +## Find your data [#find-data] + +After a few minutes, your Kafka data should appear in New Relic. See [Find your data](/docs/opentelemetry/integrations/kafka/find-and-query-data) for detailed instructions on exploring your Kafka data across different views in the New Relic UI. + +**Metrics** + +Broker, topic, partition, consumer group, and JVM metrics are stored in the `Metric` event type. Replace `my-kafka-cluster` with your `KAFKA_CLUSTER_NAME` value: + +```sql +FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' SINCE 30 minutes ago +``` + -After a few minutes, your Kafka metrics should appear in New Relic. See [Find your data](/docs/opentelemetry/integrations/kafka/find-and-query-data) for detailed instructions on exploring your Kafka metrics across different views in the New Relic UI. +**Logs** -You can also query your data with NRQL: +Strimzi does not inject an application log exporter by default. 
If you deploy producer or consumer applications instrumented with the OpenTelemetry Java agent, their logs are stored in the `Log` event type: ```sql -FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' +FROM Log SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' SINCE 30 minutes ago ``` +**Traces** + +If you deploy producer or consumer applications instrumented with the OpenTelemetry Java agent, producer and consumer spans are stored in the `Span` event type: + +```sql +FROM Span SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' SINCE 30 minutes ago +``` + + +## Example [#example] + +A complete working example with Strimzi Kafka custom resources, JMX Exporter configuration, OTel Collector setup, and sample producer/consumer applications is available in the [New Relic OpenTelemetry Examples repository](https://github.com/newrelic/newrelic-opentelemetry-examples/tree/main/other-examples/collector/kafka/k8s-strimzi). + ## Troubleshooting [#troubleshooting] @@ -1912,8 +1935,6 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' exporters: debug: verbosity: detailed - sampling_initial: 5 # Log first 5 metrics - sampling_thereafter: 200 # Then log every 200th metric otlp/backend: endpoint: ${NEW_RELIC_OTLP_ENDPOINT} @@ -2323,7 +2344,6 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' exporters: debug: verbosity: detailed - sampling_initial: 100 # Log first 100 metrics to see what's available service: pipelines: @@ -2347,7 +2367,7 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' **1. Remove the Additional metrics section from the JMX ConfigMap** - In your `kafka-jmx-metrics-config.yaml` ConfigMap, delete everything below this comment (through the end of the `rules:` list): + In your `kafka-jmx-config.yaml` ConfigMap, delete everything below this comment (through the end of the `rules:` list): ```yaml # Additional metrics — remove this section to reduce data ingest @@ -2360,7 +2380,7 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' After editing the ConfigMap, apply it and restart the Kafka brokers to pick up the change: ```bash - kubectl apply -f kafka-jmx-metrics-config.yaml + kubectl apply -f kafka-jmx-config.yaml kubectl rollout restart statefulset -n kafka -kafka ``` @@ -2387,4 +2407,4 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' * [Explore Kafka metrics](/docs/opentelemetry/integrations/kafka/metrics-reference) - View the complete metrics reference * [Create custom dashboards](/docs/query-your-data/explore-query-data/dashboards/introduction-dashboards) - Build visualizations for your Kafka data -* [Set up alerts](/docs/opentelemetry/integrations/kafka/metrics-reference/#alerting) - Monitor critical metrics like consumer lag and under-replicated partitions +* [Set up alerts](/docs/opentelemetry/integrations/kafka/find-and-query-data#alerts) - Monitor critical metrics like consumer lag and under-replicated partitions diff --git a/src/content/docs/opentelemetry/integrations/kafka/metrics-reference.mdx b/src/content/docs/opentelemetry/integrations/kafka/metrics-reference.mdx index d017dff69b3..1bd2e1cd8ac 100644 --- a/src/content/docs/opentelemetry/integrations/kafka/metrics-reference.mdx +++ b/src/content/docs/opentelemetry/integrations/kafka/metrics-reference.mdx @@ -162,7 +162,7 @@ These metrics are collected from Kafka brokers using the Kafka protocol (bootstr `kafka.partition.replicas_in_sync` - Number of in-sync replicas for a partition + Number of in-sync 
replicas for a partition. This metric is filtered out by the collector pipeline — only the aggregated `kafka.partition.replicas_in_sync.total` per topic is retained. Sum (int) @@ -299,7 +299,8 @@ These metrics are collected from Kafka brokers using the Kafka protocol (bootstr JMX metrics provide detailed Kafka broker and JVM telemetry. These metrics are collected using: -* **Self-hosted Kafka**: [OpenTelemetry Java Agent](/docs/opentelemetry/integrations/kafka/self-hosted#jmx-config) with custom JMX configuration +* **Self-hosted Kafka**: [OpenTelemetry Java Agent](/docs/opentelemetry/integrations/kafka/self-hosted#java-agent) or [Prometheus JMX Exporter](/docs/opentelemetry/integrations/kafka/self-hosted#prometheus) +* **Kubernetes (self-managed)**: [OpenTelemetry Java Agent](/docs/opentelemetry/integrations/kafka/kubernetes-self-managed#java-agent) or [Prometheus JMX Exporter](/docs/opentelemetry/integrations/kafka/kubernetes-self-managed#prometheus) * **Kubernetes (Strimzi)**: [Prometheus JMX Exporter](/docs/opentelemetry/integrations/kafka/kubernetes-strimzi#configure-jmx-exporter) with New Relic custom configuration Both methods collect the same set of Kafka broker and JVM metrics documented below: @@ -349,7 +350,7 @@ These metrics are collected from the controller broker and provide cluster-wide `kafka.leader.election.rate` - The leader election count + The leader election count. Only appears when a leader election occurs; not emitted on stable clusters. Counter @@ -582,7 +583,7 @@ These metrics are collected from the controller broker and provide cluster-wide - `kafka.lag.max` + `kafka.max.lag` Maximum lag between follower and leader replicas @@ -675,6 +676,18 @@ These metrics are collected from the controller broker and provide cluster-wide Gauge + + + + `kafka.request.queue` + + + Size of the request queue on the broker + + + Gauge + + @@ -943,7 +956,7 @@ These metrics are collected from the controller broker and provide cluster-wide -## Kafka client metrics (OpenTelemetry Java agent) [#kafka-client-metrics] +## Kafka client metrics [#kafka-client-metrics] These metrics are collected from Kafka producer and consumer applications instrumented with the [OpenTelemetry Java agent](https://opentelemetry.io/docs/languages/java/automatic/) with Kafka instrumentation enabled. These provide client-side visibility into application interactions with Kafka brokers and complement the broker-side metrics by providing the application perspective. 
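To confirm client metrics are arriving, you can chart producer throughput per client — a sample NRQL sketch using the placeholder cluster name from the setup guides and `kafka.producer.outgoing_byte_rate` from the producer table below:

```sql
FROM Metric SELECT average(kafka.producer.outgoing_byte_rate) WHERE kafka.cluster.name = 'my-kafka-cluster' FACET `client-id` TIMESERIES SINCE 30 minutes ago
```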
@@ -965,6 +978,8 @@ These metrics are collected from Kafka producer and consumer applications instru | `kafka.producer.network_io_total` | Total network operations | client-id | | `kafka.producer.outgoing_byte_rate` | Rate of outgoing bytes | client-id, node-id | | `kafka.producer.outgoing_byte_total` | Total outgoing bytes | client-id, node-id | +| `kafka.producer.incoming_byte_rate` | Rate of incoming bytes (responses from brokers) | client-id, node-id | +| `kafka.producer.incoming_byte_total` | Total incoming bytes (responses from brokers) | client-id, node-id | ### Request and response metrics @@ -1163,12 +1178,12 @@ These metrics are collected from Kafka producer and consumer applications instru | `kafka.consumer.failed_rebalance_total` | Total failed rebalances | client-id | | `kafka.consumer.failed_rebalance_rate_per_hour` | Failed rebalances per hour | client-id | | `kafka.consumer.last_rebalance_seconds_ago` | Seconds since last rebalance | client-id | -| `kafka.consumer.partition_assigned_latency_avg` | Average partition assignment latency (ms) | client-id | -| `kafka.consumer.partition_assigned_latency_max` | Maximum partition assignment latency (ms) | client-id | -| `kafka.consumer.partition_revoked_latency_avg` | Average partition revocation latency (ms) | client-id | -| `kafka.consumer.partition_revoked_latency_max` | Maximum partition revocation latency (ms) | client-id | -| `kafka.consumer.partition_lost_latency_avg` | Average partition loss latency (ms) | client-id | -| `kafka.consumer.partition_lost_latency_max` | Maximum partition loss latency (ms) | client-id | +| `kafka.consumer.partition_assigned_latency_avg` | Average partition assignment latency (ms). Only emitted during consumer group rebalances. | client-id | +| `kafka.consumer.partition_assigned_latency_max` | Maximum partition assignment latency (ms). Only emitted during consumer group rebalances. | client-id | +| `kafka.consumer.partition_revoked_latency_avg` | Average partition revocation latency (ms). Only emitted during consumer group rebalances. | client-id | +| `kafka.consumer.partition_revoked_latency_max` | Maximum partition revocation latency (ms). Only emitted during consumer group rebalances. | client-id | +| `kafka.consumer.partition_lost_latency_avg` | Average partition loss latency (ms). Only emitted during consumer group rebalances. | client-id | +| `kafka.consumer.partition_lost_latency_max` | Maximum partition loss latency (ms). Only emitted during consumer group rebalances. | client-id | ### Sync group metrics diff --git a/src/content/docs/opentelemetry/integrations/kafka/overview.mdx b/src/content/docs/opentelemetry/integrations/kafka/overview.mdx index 1f2938498a4..8116cf46033 100644 --- a/src/content/docs/opentelemetry/integrations/kafka/overview.mdx +++ b/src/content/docs/opentelemetry/integrations/kafka/overview.mdx @@ -9,7 +9,7 @@ metaDescription: "Monitor Kafka with OpenTelemetry Collector for comprehensive m freshnessValidatedDate: never --- -Kafka monitoring provides real-time visibility into your Apache Kafka clusters to ensure reliable data streaming and prevent costly downtime in distributed systems. Using a collector-based approach, you get comprehensive monitoring through a flexible, vendor-neutral solution that works across self-hosted environments and Kubernetes with Strimzi. +Monitor Apache Kafka clusters with OpenTelemetry Collector for real-time visibility and reliable data streaming. 
This vendor-neutral solution prevents costly downtime across self-hosted and Kubernetes environments. ## Collector options [#collector-options] @@ -31,14 +31,14 @@ Choose the collector that best fits your support and operational requirements, t ## Why Kafka monitoring? -- Prevent outages - Get alerts for broker failures, under-replicated partitions, and offline topics before they cause downtime -- Optimize performance - Identify consumer lag, slow producers, and network bottlenecks that affect data processing speed -- Plan capacity - Track resource usage, message rates, and connection counts to scale proactively -- Ensure data integrity - Monitor replication health and partition balance to prevent data loss +- **Prevent outages**: Get alerts for broker failures, under-replicated partitions, and offline topics before they cause downtime +- **Optimize performance**: Identify consumer lag, slow producers, and network bottlenecks that affect data processing speed +- **Plan capacity**: Track resource usage, message rates, and connection counts to scale proactively +- **Ensure data integrity**: Monitor replication health and partition balance to prevent data loss ### Common use case -Whether you're streaming financial transactions, processing IoT sensor data, or handling microservices communication, Kafka monitoring helps you catch issues before they impact your business. Get alerted when consumer lag spikes threaten real-time dashboards, when broker failures risk data loss, or when network bottlenecks slow down critical data pipelines. This monitoring is essential for e-commerce platforms, real-time analytics systems, and any application where message delivery delays or failures can affect user experience or business operations. +Kafka monitoring helps you catch issues before they impact your business. Get alerted when consumer lag spikes threaten real-time dashboards, broker failures risk data loss, or network bottlenecks slow critical data pipelines. Essential for financial transactions, IoT data processing, microservices communication, e-commerce platforms, and real-time analytics. ## Get started [#get-started] @@ -51,6 +51,12 @@ Choose your Kafka environment to begin monitoring. Each setup guide includes pre icon="logo-linux" /> + + ## Installation steps [#installation-steps] -Follow these steps to set up comprehensive Kafka monitoring by installing the OpenTelemetry Java Agent on your brokers and deploying a collector to gather and send metrics to New Relic. + + + Via OTel Java Agent + Via Prometheus JMX Exporter + + + + +Follow these steps to set up comprehensive Kafka monitoring by installing the OpenTelemetry Java Agent on your brokers and deploying a collector to gather and send metrics and logs to New Relic. @@ -40,8 +48,368 @@ Ensure you have: +### Create collector configuration [#collector-config] + +Create the main OpenTelemetry Collector configuration at `~/opentelemetry/collector-kafka-config.yaml` **on a monitoring host**. 
+ +```yaml +receivers: + # OTLP receiver for Kafka and JMX metrics from Java agents and application telemetry + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + + # Kafka metrics receiver for cluster-level metrics + kafkametrics: + brokers: ${env:KAFKA_BOOTSTRAP_BROKER_ADDRESSES} + protocol_version: 2.0.0 + scrapers: + - brokers + - topics + - consumers + collection_interval: 30s + # Exclude internal Kafka topics (prefixed with __) at the source + topic_match: "^[^_].*$" + metrics: + kafka.topic.min_insync_replicas: + enabled: true + kafka.topic.replication_factor: + enabled: true + kafka.partition.replicas: + enabled: false + kafka.partition.oldest_offset: + enabled: false + kafka.partition.current_offset: + enabled: false + +processors: + batch/aggregation: + send_batch_size: 1024 + timeout: 30s + + resourcedetection: + detectors: [env, ec2, system] + system: + resource_attributes: + host.name: + enabled: true + host.id: + enabled: true + + resource: + attributes: + - action: insert + key: kafka.cluster.name + value: ${env:KAFKA_CLUSTER_NAME} + + transform/remove_broker_id: + metric_statements: + # Remove broker.id for cluster-level metrics — these represent the whole cluster, + # not a specific broker. broker.id is retained on broker-level metrics pipelines. + - context: resource + statements: + - delete_key(attributes, "broker.id") + + transform/remove_extra_attributes: + metric_statements: + - context: resource + statements: + # Delete all attributes starting with "process." + - delete_matching_keys(attributes, "^process\\..*") + # Delete all attributes starting with "telemetry." + - delete_matching_keys(attributes, "^telemetry\\..*") + - delete_key(attributes, "host.arch") + - delete_key(attributes, "os.description") + - delete_key(attributes, "host.image.id") + - delete_key(attributes, "host.type") + - delete_matching_keys(attributes, "^cloud\\..*") + - delete_key(attributes, "service.instance.id") where IsMatch(attributes["service.name"], "^unknown_service:") + - delete_key(attributes, "service.name") where IsMatch(attributes["service.name"], "^unknown_service:") + + # Filter internal Kafka topics as a safety net (kafkametrics topic_match handles the receiver side) + filter/internal_topics: + metrics: + datapoint: + - 'attributes["topic"] != nil and IsMatch(attributes["topic"], "^__.*")' + + filter/include_cluster_metrics: + metrics: + include: + match_type: regexp + metric_names: + - "kafka\\.partition\\.offline" + - "kafka\\.(leader|unclean)\\.election\\.rate" + - "kafka\\.partition\\.non_preferred_leader" + - "kafka\\.broker\\.fenced\\.count" + - "kafka\\.cluster\\.partition\\.count" + - "kafka\\.cluster\\.topic\\.count" + + filter/exclude_cluster_metrics: + metrics: + exclude: + match_type: regexp + metric_names: + - "kafka\\.partition\\.offline" + - "kafka\\.(leader|unclean)\\.election\\.rate" + - "kafka\\.partition\\.non_preferred_leader" + - "kafka\\.broker\\.fenced\\.count" + - "kafka\\.cluster\\.partition\\.count" + - "kafka\\.cluster\\.topic\\.count" + + transform/des_units: + metric_statements: + - context: metric + statements: + - set(description, "") where description != "" + - set(unit, "") where unit != "" + + cumulativetodelta: + + metricstransform/kafka_topic_sum_aggregation: + transforms: + - include: kafka.partition.replicas_in_sync + action: insert + new_name: kafka.partition.replicas_in_sync.total + operations: + - action: aggregate_labels + label_set: [topic] + aggregation_type: sum + + - include: kafka.partition.replicas + action: insert + new_name: 
kafka.partition.replicas.total + operations: + - action: aggregate_labels + label_set: [topic] + aggregation_type: sum + + filter/remove_partition_level_replicas: + metrics: + exclude: + match_type: strict + metric_names: + - kafka.partition.replicas_in_sync + + groupbyattrs/cluster: + keys: [kafka.cluster.name] + + metricstransform/cluster_max: + transforms: + - include: "kafka\\.partition\\.offline|kafka\\.leader\\.election\\.rate|kafka\\.unclean\\.election\\.rate|kafka\\.partition\\.non_preferred_leader|kafka\\.broker\\.fenced\\.count|kafka\\.cluster\\.partition\\.count|kafka\\.cluster\\.topic\\.count" + match_type: regexp + action: update + operations: + - action: aggregate_labels + aggregation_type: max + label_set: [] + +exporters: + otlp/newrelic: + endpoint: ${env:NEW_RELIC_OTLP_ENDPOINT} + headers: + api-key: ${env:NEW_RELIC_LICENSE_KEY} + compression: gzip + timeout: 30s + +service: + pipelines: + # Broker metrics pipeline (excludes cluster-level metrics) + metrics/broker: + receivers: [otlp, kafkametrics] + processors: + - resourcedetection + - resource + - filter/exclude_cluster_metrics + - filter/internal_topics + - transform/remove_extra_attributes + - transform/des_units + - cumulativetodelta + - metricstransform/kafka_topic_sum_aggregation + - filter/remove_partition_level_replicas + - batch/aggregation + exporters: [otlp/newrelic] + + # Cluster metrics pipeline (controller-emitted metrics like offline partitions, topic/partition counts — no broker.id) + metrics/cluster: + receivers: [otlp] + processors: + - resourcedetection + - resource + - filter/include_cluster_metrics + - transform/remove_broker_id + - transform/remove_extra_attributes + - transform/des_units + - cumulativetodelta + - groupbyattrs/cluster + - metricstransform/cluster_max + - batch/aggregation + exporters: [otlp/newrelic] + + # APM traces pipeline (producer + consumer spans via OTel Java Agent) + traces/apps: + receivers: [otlp] + processors: [resourcedetection, resource, batch/aggregation] + exporters: [otlp/newrelic] + + # APM logs pipeline (producer + consumer logs via OTel Java Agent) + logs/apps: + receivers: [otlp] + processors: [resourcedetection, resource, batch/aggregation] + exporters: [otlp/newrelic] +``` + + + + **Architecture highlights:** + + * **OTLP receiver**: Receives Kafka and JMX metrics from OpenTelemetry Java Agent running on Kafka brokers via gRPC on port 4317 + * **Two pipelines approach**: Cluster-level metrics are sent without broker.id to map to cluster entity + * **Metric filtering**: Separates broker-specific metrics from cluster-level metrics to avoid duplication + * **Aggregation**: Automatically aggregates partition-level metrics by topic + + + + For advanced configuration options, refer to these receiver documentation pages: + + * [OTLP receiver documentation](https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver/otlpreceiver) - OTLP receiver configuration options + * [Kafka metrics receiver documentation](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/kafkametricsreceiver) - Additional Kafka metrics configuration + + + + + + + +### Set environment variables [#env-vars] + +Set the required environment variables **on the monitoring host** before installing the collector: + +```bash +export NEW_RELIC_LICENSE_KEY="YOUR_LICENSE_KEY" +export KAFKA_CLUSTER_NAME="my-kafka-cluster" +export KAFKA_BOOTSTRAP_BROKER_ADDRESSES="broker1-host:9092,broker2-host:9092,broker3-host:9092" +export 
NEW_RELIC_OTLP_ENDPOINT="https://otlp.nr-data.net:4317" # US region +``` + +Replace: +* `YOUR_LICENSE_KEY` with your New Relic license key +* `my-kafka-cluster` with a unique name for your Kafka cluster +* `broker1-host:9092,broker2-host:9092,broker3-host:9092` with your Kafka bootstrap broker addresses +* OTLP endpoint: Uses `https://otlp.nr-data.net:4317` (US region) or `https://otlp.eu01.nr-data.net:4317` (EU region). +For other endpoint configurations, see [Configure your OTLP endpoint](/docs/opentelemetry/best-practices/opentelemetry-otlp/#configure-endpoint-port-protocol) + + + + + +### Install and start the collector [#install-start-collector] + +Install and run the collector **on the monitoring host**. Choose between NRDOT Collector (New Relic's distribution) or OpenTelemetry Collector: + + + + NRDOT Collector + OpenTelemetry Collector + + + + + + **NRDOT Collector** is New Relic's distribution of OpenTelemetry Collector with New Relic support for assistance. + + + **Download and install the binary** + + Download and install the NRDOT Collector binary for your host operating system. The example below is for **linux_amd64** architecture: + + ```bash + # Set version and architecture + NRDOT_VERSION="1.9.0" + ARCH="amd64" # or arm64 + + # Download and extract + curl "https://github.com/newrelic/nrdot-collector-releases/releases/download/${NRDOT_VERSION}/nrdot-collector_${NRDOT_VERSION}_linux_${ARCH}.tar.gz" \ + --location --output collector.tar.gz + tar -xzf collector.tar.gz + + # Move to a location in PATH (optional) + sudo mv nrdot-collector /usr/local/bin/ + + # Verify installation + nrdot-collector --version + ``` + + For other operating systems and architectures, visit [NRDOT Collector releases](https://github.com/newrelic/nrdot-collector-releases/releases/latest) and download the appropriate binary for your system. + + + **Start the collector** + + Run the collector with your configuration file to begin monitoring: + + ```bash + nrdot-collector --config ~/opentelemetry/collector-kafka-config.yaml + ``` + + The collector will start sending Kafka metrics to New Relic within a few minutes. + + + + **Download and install the binary** + + Download and install the OpenTelemetry Collector Contrib binary for your host operating system. The example below is for **linux_amd64** architecture: + + ```bash + # Set version and architecture + # Check https://github.com/open-telemetry/opentelemetry-collector-releases/releases/latest for the latest version + OTEL_VERSION="" + ARCH="amd64" + + # Download the collector + curl -L -o otelcol-contrib.tar.gz \ + "https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v${OTEL_VERSION}/otelcol-contrib_${OTEL_VERSION}_linux_${ARCH}.tar.gz" + + # Extract the binary + tar -xzf otelcol-contrib.tar.gz + + # Move to a location in PATH (optional) + sudo mv otelcol-contrib /usr/local/bin/ + + # Verify installation + otelcol-contrib --version + ``` + + For other operating systems, visit the [OpenTelemetry Collector releases](https://github.com/open-telemetry/opentelemetry-collector-releases/releases/latest) page. + + **Start the collector** + + Run the collector with your configuration file to begin monitoring: + + ```bash + otelcol-contrib --config ~/opentelemetry/collector-kafka-config.yaml + ``` + + The collector will start sending Kafka metrics to New Relic within a few minutes. 
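  To keep the collector running after the shell session ends, you can launch it with `nohup`, mirroring the Kafka startup examples in this guide — a sketch assuming the environment variables from the previous step are exported in the current shell (substitute `nrdot-collector` if you installed NRDOT):

  ```bash
  # Run the collector in the background and capture its output for troubleshooting
  nohup otelcol-contrib --config ~/opentelemetry/collector-kafka-config.yaml \
    > ~/opentelemetry/collector.log 2>&1 &

  # Confirm the OTLP gRPC port is listening, then watch the startup logs
  ss -ltn | grep 4317
  tail -f ~/opentelemetry/collector.log
  ```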
+ + + + + + + + ### Download the OpenTelemetry Java Agent [#download-java-agent] + + Ensure your OpenTelemetry Collector is running before you (re)start Kafka brokers with the Java Agent attached. The agent begins sending metrics immediately on broker startup, so the collector must be available to receive them. + + The [OpenTelemetry Java Agent](https://opentelemetry.io/docs/zero-code/java/agent/) runs as a Java agent attached to your Kafka brokers, collecting Kafka and JMX metrics and sending them via OTLP to the collector: ```bash @@ -61,7 +429,7 @@ curl -L -o ~/opentelemetry/opentelemetry-javaagent.jar \ Create an OpenTelemetry Java Agent JMX configuration file to collect Kafka metrics from JMX MBeans. -Create the file `~/opentelemetry/jmx-custom-config.yaml` with the following configuration: +Create the file `~/opentelemetry/kafka-jmx-config.yaml` on each broker host with the following configuration: ```yaml --- @@ -499,55 +867,472 @@ rules: -### Configure Kafka broker [#configure-broker] +### Configure Kafka broker [#configure-broker] + +Attach the OpenTelemetry Java Agent to your Kafka broker by setting the `KAFKA_OPTS` environment variable before starting Kafka. + +**Single broker example**: + +```bash +OTEL_AGENT="$HOME/opentelemetry/opentelemetry-javaagent.jar" +JMX_CONFIG="$HOME/opentelemetry/kafka-jmx-config.yaml" + +nohup env KAFKA_OPTS="-javaagent:$OTEL_AGENT \ + -Dotel.jmx.enabled=true \ + -Dotel.jmx.config=$JMX_CONFIG \ + -Dotel.resource.attributes=broker.id=1,kafka.cluster.name=my-kafka-cluster \ + -Dotel.exporter.otlp.endpoint=http://collector-host-ip:4317 \ + -Dotel.exporter.otlp.protocol=grpc \ + -Dotel.metrics.exporter=otlp \ + -Dotel.instrumentation.runtime-telemetry.enabled=false \ + -Dotel.metric.export.interval=30000" \ + bin/kafka-server-start.sh config/server.properties & +``` + + + **Multi-broker clusters**: For multiple brokers, use the same configuration with unique `broker.id` values (e.g., `broker.id=1`, `broker.id=2`, `broker.id=3`) in the `-Dotel.resource.attributes` parameter for each broker. + + +Replace: +* `collector-host-ip:4317` with the IP or hostname of the host running your OpenTelemetry Collector +* `broker.id=1` with the unique broker ID for each broker (e.g., `broker.id=1`, `broker.id=2`, `broker.id=3`) +* `kafka.cluster.name=my-kafka-cluster` with your Kafka cluster name (must match the value set in the collector configuration) + +For complete configuration options, see the [Java Agent configuration guide](https://opentelemetry.io/docs/zero-code/java/agent/configuration/). + + + + + +### (Optional) Instrument producer or consumer applications [#instrument-apps] + + + **Language support**: Currently, only Java applications are supported for Kafka client instrumentation using the OpenTelemetry Java Agent. + + +To collect application-level telemetry from your Kafka producer and consumer applications, download the OpenTelemetry Java Agent from the [Download the OpenTelemetry Java Agent](#download-java-agent) step above. 
+ +Start your application with the agent: + +```bash +OTEL_AGENT="$HOME/opentelemetry/opentelemetry-javaagent.jar" + +java \ + -javaagent:$OTEL_AGENT \ + -Dotel.service.name="order-process-service" \ + -Dotel.resource.attributes="kafka.cluster.name=my-kafka-cluster" \ + -Dotel.exporter.otlp.endpoint=http://collector-host-ip:4317 \ + -Dotel.exporter.otlp.protocol="grpc" \ + -Dotel.metrics.exporter="otlp" \ + -Dotel.traces.exporter="otlp" \ + -Dotel.logs.exporter="otlp" \ + -Dotel.instrumentation.kafka.experimental-span-attributes="true" \ + -Dotel.instrumentation.messaging.experimental.receive-telemetry.enabled="true" \ + -Dotel.instrumentation.kafka.producer-propagation.enabled="true" \ + -Dotel.instrumentation.kafka.enabled="true" \ + -Dotel.instrumentation.runtime-telemetry.enabled="false" \ + -jar your-kafka-application.jar +``` + +Replace: +* `order-process-service` with a unique name for your producer or consumer application +* `my-kafka-cluster` with the same cluster name used in your collector configuration +* `collector-host-ip:4317` with the hostname or IP of the host running your OpenTelemetry Collector + + + The configuration above sends telemetry to an OpenTelemetry Collector running on `collector-host-ip:4317`. If you want a separate collector dedicated to application telemetry, create one with the following configuration: + + + + ```yaml + receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + + exporters: + otlp/newrelic: + endpoint: https://otlp.nr-data.net:4317 + headers: + api-key: "${NEW_RELIC_LICENSE_KEY}" + compression: gzip + timeout: 30s + + service: + pipelines: + traces: + receivers: [otlp] + exporters: [otlp/newrelic] + metrics: + receivers: [otlp] + exporters: [otlp/newrelic] + logs: + receivers: [otlp] + exporters: [otlp/newrelic] + ``` + + + + +The Java Agent provides [out-of-the-box Kafka instrumentation](https://opentelemetry.io/docs/zero-code/java/spring-boot-starter/out-of-the-box-instrumentation/) with zero code changes, capturing request latencies, throughput metrics, error rates, and distributed traces. + +For advanced configuration, see the [Kafka instrumentation documentation](https://github.com/open-telemetry/opentelemetry-java-instrumentation/tree/main/instrumentation/kafka). + + + + + + + + + +Follow these steps to set up comprehensive Kafka monitoring by installing the Prometheus JMX Exporter on your brokers and deploying a collector to gather and send metrics to New Relic. + + + + + +### Before you begin [#prerequisites] + +Ensure you have: + +* A [New Relic account](https://newrelic.com/signup) with a +* Network access from the collector host to each broker on port `9404` +* Network access from the collector to Kafka bootstrap port (typically `9092`) + + + + + +### Download the Prometheus JMX Exporter [#download-jmx-exporter] + +Download the Prometheus JMX Exporter JAR on each Kafka broker host: + +```bash +# Create directory for Prometheus components +mkdir -p ~/opentelemetry + +# Download the Prometheus JMX Exporter agent JAR +# Version 1.5.0 is the minimum required version. Check https://github.com/prometheus/jmx_exporter/releases/latest for newer releases. 
+JMX_EXPORTER_VERSION="1.5.0" +curl -L -o ~/opentelemetry/jmx_prometheus_javaagent.jar \ + "https://github.com/prometheus/jmx_exporter/releases/download/${JMX_EXPORTER_VERSION}/jmx_prometheus_javaagent-${JMX_EXPORTER_VERSION}.jar" +``` + + + + + + +### Create JMX metrics configuration [#jmx-config] + +Create the JMX Exporter configuration file that defines which Kafka metrics to collect. Save as `~/opentelemetry/kafka-jmx-config.yaml` on each broker host: + +```yaml +startDelaySeconds: 0 +lowercaseOutputName: true +lowercaseOutputLabelNames: true + +rules: + # Cluster-level controller metrics + - pattern: 'kafka.controller<>Value' + name: kafka_cluster_topic_count + type: GAUGE + + - pattern: 'kafka.controller<>Value' + name: kafka_cluster_partition_count + type: GAUGE + + - pattern: 'kafka.controller<>Value' + name: kafka_broker_fenced_count + type: GAUGE + + - pattern: 'kafka.controller<>Value' + name: kafka_partition_non_preferred_leader + type: GAUGE + + - pattern: 'kafka.controller<>Value' + name: kafka_partition_offline + type: GAUGE + + - pattern: 'kafka.controller<>Value' + name: kafka_controller_active_count + type: GAUGE + + # Broker-level replica metrics + - pattern: 'kafka.server<>Value' + name: kafka_partition_under_min_isr + type: GAUGE + + - pattern: 'kafka.server<>Value' + name: kafka_broker_leader_count + type: GAUGE + + - pattern: 'kafka.server<>Value' + name: kafka_partition_count + type: GAUGE + + - pattern: 'kafka.server<>Value' + name: kafka_partition_under_replicated + type: GAUGE + + - pattern: 'kafka.server<>Count' + name: kafka_isr_operation_count + type: COUNTER + labels: + operation: "shrink" + + - pattern: 'kafka.server<>Count' + name: kafka_isr_operation_count + type: COUNTER + labels: + operation: "expand" + + - pattern: 'kafka.server<>Value' + name: kafka_max_lag + type: GAUGE + + # Broker topic metrics (totals) + - pattern: 'kafka.server<>Count' + name: kafka_message_count + type: COUNTER + + - pattern: 'kafka.server<>Count' + name: kafka_request_count + type: COUNTER + labels: + type: "fetch" + + - pattern: 'kafka.server<>Count' + name: kafka_request_count + type: COUNTER + labels: + type: "produce" + + - pattern: 'kafka.server<>Count' + name: kafka_request_failed + type: COUNTER + labels: + type: "fetch" + + - pattern: 'kafka.server<>Count' + name: kafka_request_failed + type: COUNTER + labels: + type: "produce" + + - pattern: 'kafka.server<>Count' + name: kafka_network_io + type: COUNTER + labels: + direction: "in" + + - pattern: 'kafka.server<>Count' + name: kafka_network_io + type: COUNTER + labels: + direction: "out" + + # Per-topic metrics (only appear after traffic flows) + - pattern: 'kafka.server<>Count' + name: kafka_prod_msg_count + type: COUNTER + labels: + topic: "$1" + + - pattern: 'kafka.server<>Count' + name: kafka_topic_io + type: COUNTER + labels: + topic: "$1" + direction: "in" + + - pattern: 'kafka.server<>Count' + name: kafka_topic_io + type: COUNTER + labels: + topic: "$1" + direction: "out" + + # Request metrics + - pattern: 'kafka.network<>99thPercentile' + name: kafka_request_time_99p + type: GAUGE + labels: + type: "$1" + + - pattern: 'kafka.network<>Value' + name: kafka_request_queue + type: GAUGE + + - pattern: 'kafka.server<>Value' + name: kafka_purgatory_size + type: GAUGE + labels: + type: "$1" + + # Controller stats + - pattern: 'kafka.controller<>Count' + name: kafka_leader_election_rate + type: COUNTER + + - pattern: 'kafka.controller<>Count' + name: kafka_unclean_election_rate + type: COUNTER + + # JVM Garbage Collection 
+ - pattern: 'java.lang<>CollectionCount' + name: jvm_gc_collections_count + type: COUNTER + labels: + name: "$1" + + # JVM Memory + - pattern: 'java.langmax' + name: jvm_memory_heap_max + type: GAUGE + + - pattern: 'java.langused' + name: jvm_memory_heap_used + type: GAUGE + + # JVM Threading and System + - pattern: 'java.lang<>ThreadCount' + name: jvm_thread_count + type: GAUGE + + - pattern: 'java.lang<>SystemCpuLoad' + name: jvm_system_cpu_utilization + type: GAUGE + + # Broker uptime + - pattern: 'java.lang<>Uptime' + name: kafka_broker_uptime + type: GAUGE + + # Additional metrics — remove this section to reduce data ingest + + # Request latency: total count, 50th percentile, and average (99p kept above) + - pattern: 'kafka.network<>Count' + name: kafka_request_time_total + type: COUNTER + labels: + type: "$1" + + - pattern: 'kafka.network<>50thPercentile' + name: kafka_request_time_50p + type: GAUGE + labels: + type: "$1" + + - pattern: 'kafka.network<>Mean' + name: kafka_request_time_avg + type: GAUGE + labels: + type: "$1" + + # Log flush metrics + - pattern: 'kafka.log<>Count' + name: kafka_logs_flush_count + type: COUNTER + + - pattern: 'kafka.log<>50thPercentile' + name: kafka_logs_flush_time_50p + type: GAUGE + + - pattern: 'kafka.log<>99thPercentile' + name: kafka_logs_flush_time_99p + type: GAUGE + + # JVM GC elapsed time + - pattern: 'java.lang<>CollectionTime' + name: jvm_gc_collections_elapsed + type: COUNTER + labels: + name: "$1" + + # JVM Memory heap committed + - pattern: 'java.langcommitted' + name: jvm_memory_heap_committed + type: GAUGE + + # JVM class loading + - pattern: 'java.lang<>LoadedClassCount' + name: jvm_class_count + type: GAUGE + + # Additional JVM OS metrics + - pattern: 'java.lang<>SystemLoadAverage' + name: jvm_system_cpu_load_1m + type: GAUGE + + - pattern: 'java.lang<>AvailableProcessors' + name: jvm_cpu_count + type: GAUGE + + - pattern: 'java.lang<>ProcessCpuLoad' + name: jvm_cpu_recent_utilization + type: GAUGE + + - pattern: 'java.lang<>OpenFileDescriptorCount' + name: jvm_file_descriptor_count + type: GAUGE + + # JVM Memory Pool + - pattern: 'java.langused' + name: jvm_memory_pool_used + type: GAUGE + labels: + name: "$1" + + - pattern: 'java.langmax' + name: jvm_memory_pool_max + type: GAUGE + labels: + name: "$1" + + - pattern: 'java.langused' + name: jvm_memory_pool_used_after_last_gc + type: GAUGE + labels: + name: "$1" +``` + + + **Customize metrics**: You can add or modify patterns by referencing the [Prometheus JMX Exporter examples](https://github.com/prometheus/jmx_exporter/tree/main/examples) and [Kafka MBean documentation](https://kafka.apache.org/documentation/#monitoring). Refer to the [JMX Exporter rules documentation](https://github.com/prometheus/jmx_exporter/blob/main/docs/content/1.5.0/http-mode/rules.md) for additional configurations. + + + + + + +### Configure Kafka brokers to use the JMX Exporter [#configure-brokers] -Attach the OpenTelemetry Java Agent to your Kafka broker by setting the `KAFKA_OPTS` environment variable before starting Kafka. +Attach the Prometheus JMX Exporter as a Java agent to each Kafka broker by adding it to your Kafka startup options. 
**Single broker example**: ```bash -OTEL_AGENT="$HOME/opentelemetry/opentelemetry-javaagent.jar" -JMX_CONFIG="$HOME/opentelemetry/jmx-custom-config.yaml" +JMX_JAR="$HOME/opentelemetry/jmx_prometheus_javaagent.jar" +JMX_CONFIG="$HOME/opentelemetry/kafka-jmx-config.yaml" -nohup env KAFKA_OPTS="-javaagent:$OTEL_AGENT \ - -Dotel.jmx.enabled=true \ - -Dotel.jmx.config=$JMX_CONFIG \ - -Dotel.resource.attributes=broker.id=1,kafka.cluster.name=my-kafka-cluster \ - -Dotel.exporter.otlp.endpoint=http://localhost:4317 \ - -Dotel.exporter.otlp.protocol=grpc \ - -Dotel.metrics.exporter=otlp \ - -Dotel.metric.export.interval=30000" \ - bin/kafka-server-start.sh config/server.properties & +nohup env KAFKA_OPTS="-javaagent:${JMX_JAR}=9404:${JMX_CONFIG}" \ + bin/kafka-server-start.sh config/server.properties & ``` - - **Multi-broker clusters**: For multiple brokers, use the same configuration with unique `broker.id` values (e.g., `broker.id=1`, `broker.id=2`, `broker.id=3`) in the `-Dotel.resource.attributes` parameter for each broker. - +Each broker will now expose Prometheus metrics on port `9404`. Verify: - - - * `nohup` - Runs the Kafka broker in the background, continuing even if the shell session ends - * `-javaagent` - Attaches the OpenTelemetry Java Agent to the Kafka broker JVM - * `-Dotel.jmx.enabled=true` enables JMX metrics collection - * `-Dotel.jmx.config` specifies your custom JMX metrics configuration file - * `-Dotel.resource.attributes` adds metadata: unique `broker.id` and `kafka.cluster.name` - * `-Dotel.exporter.otlp.endpoint` points to your OpenTelemetry Collector (default: localhost:4317) - * `-Dotel.exporter.otlp.protocol=grpc` uses gRPC protocol for OTLP - * `-Dotel.metrics.exporter=otlp` sends metrics via OTLP - * `-Dotel.metric.export.interval=30000` exports metrics every 30 seconds - * `&` - Runs the command in the background - - **For remote collectors** (different host): - ```bash - -Dotel.exporter.otlp.endpoint=http://collector-host:4317 - ``` +```bash +curl http://localhost:9404/metrics | grep kafka_ +``` - For complete configuration options, see the [Java Agent configuration guide](https://opentelemetry.io/docs/zero-code/java/agent/configuration/). - - + + **Multi-broker clusters**: Apply the same `KAFKA_OPTS` configuration to every broker. Each broker exposes metrics on port `9404` from its own host IP. + @@ -555,18 +1340,39 @@ nohup env KAFKA_OPTS="-javaagent:$OTEL_AGENT \ ### Create collector configuration [#collector-config] -Create the main OpenTelemetry Collector configuration at `~/opentelemetry/kafka-config.yaml`. +Create the OpenTelemetry Collector configuration at `~/opentelemetry/collector-kafka-config.yaml` **on a monitoring host**. + +The Prometheus receiver scrapes all broker endpoints. The collector listens on `0.0.0.0:4317` for any OTLP data (application traces, logs) in addition to scraping Prometheus endpoints. 
```yaml receivers: - # OTLP receiver for Kafka and JMX metrics from Java agents and application telemetry + # OTLP receiver for application traces, metrics, and logs (listens on port 4317) otlp: protocols: grpc: endpoint: "0.0.0.0:4317" - # Kafka metrics receiver for cluster-level metrics - kafkametrics: + # Prometheus receiver scrapes JMX metrics from Kafka brokers + prometheus/kafka-jmx: + config: + scrape_configs: + - job_name: 'kafka-jmx-metrics' + metrics_path: /metrics + scrape_interval: 30s + static_configs: + # TODO: Replace each target with your broker hostname or IP, and set a unique broker.id per broker + - targets: ['broker1-host:9404'] + labels: + broker.id: '0' + - targets: ['broker2-host:9404'] + labels: + broker.id: '1' + - targets: ['broker3-host:9404'] + labels: + broker.id: '2' + + # Kafka metrics receiver for cluster-level consumer lag, topic, and partition metrics + kafkametrics/cluster: brokers: ${env:KAFKA_BOOTSTRAP_BROKER_ADDRESSES} protocol_version: 2.0.0 scrapers: @@ -588,89 +1394,128 @@ receivers: kafka.partition.current_offset: enabled: false +exporters: + otlp/backend: + endpoint: ${env:NEW_RELIC_OTLP_ENDPOINT} + headers: + api-key: ${env:NEW_RELIC_LICENSE_KEY} + tls: + insecure: false + sending_queue: + num_consumers: 12 + queue_size: 5000 + retry_on_failure: + enabled: true + processors: - batch/aggregation: + # Batch processor for efficient export + batch/export: send_batch_size: 1024 timeout: 30s - resourcedetection: - detectors: [env, ec2, system] - system: - resource_attributes: - host.name: - enabled: true - host.id: - enabled: true + # Memory limiter to prevent OOM + memory_limiter: + limit_percentage: 80 + spike_limit_percentage: 30 + check_interval: 1s - resource: + # Transform metric naming conventions (underscore to dot, normalize special names) + transform/metric-naming: + metric_statements: + - context: metric + statements: + - replace_pattern(name, "_", ".") + - replace_pattern(name, "\\.load\\.1", ".load_1") + - replace_pattern(name, "\\.recent\\.util", ".recent_util") + - replace_pattern(name, "file\\.descriptor\\.count", "file_descriptor.count") + - replace_pattern(name, "\\.memory\\.pool\\.used\\.bytes$", ".memory.pool.used") + - replace_pattern(name, "\\.memory\\.pool\\.max\\.bytes$", ".memory.pool.max") + - replace_pattern(name, "\\.memory\\.pool\\.collection\\.used\\.bytes$", ".memory.pool.used_after_last_gc") + - replace_pattern(name, "\\.non\\.preferred\\.leader", ".non_preferred_leader") + - replace_pattern(name, "\\.under\\.min\\.isr", ".under_min_isr") + - replace_pattern(name, "\\.under\\.replicated", ".under_replicated") + - replace_pattern(name, "\\.total$", "") where name != "kafka.request.time.total" + - context: datapoint + statements: + - set(attributes["name"], attributes["gc"]) where attributes["gc"] != nil + - delete_key(attributes, "gc") where attributes["gc"] != nil + - set(attributes["name"], attributes["pool"]) where attributes["pool"] != nil + - delete_key(attributes, "pool") where attributes["pool"] != nil + + # Add cluster name to all metrics + resource/cluster-name: attributes: - - action: insert - key: kafka.cluster.name + - key: kafka.cluster.name + # TODO: Replace with your Kafka cluster name value: ${env:KAFKA_CLUSTER_NAME} + action: upsert + # Remove broker.id for cluster-level metrics transform/remove_broker_id: metric_statements: - # Remove broker.id from resource attributes for cluster-level metrics - - context: resource + - context: datapoint statements: - delete_key(attributes, "broker.id") - 
transform/remove_extra_attributes: - metric_statements: - - context: resource - statements: - # Delete all attributes starting with "process." - - delete_matching_keys(attributes, "^process\\..*") - # Delete all attributes starting with "telemetry." - - delete_matching_keys(attributes, "^telemetry\\..*") - - delete_key(attributes, "host.arch") - - delete_key(attributes, "os.description") - - delete_key(attributes, "host.image.id") - - delete_key(attributes, "host.type") - - delete_matching_keys(attributes, "^cloud\\..*") - - delete_key(attributes, "service.instance.id") where IsMatch(attributes["service.name"], "^unknown_service:") - - delete_key(attributes, "service.name") where IsMatch(attributes["service.name"], "^unknown_service:") - - # Filter internal Kafka topics as a safety net (kafkametrics topic_match handles the receiver side) - filter/internal_topics: + # Filter out scrape overhead metrics + filter/scrape-overhead: metrics: - datapoint: - - 'attributes["topic"] != nil and IsMatch(attributes["topic"], "^__.*")' - + exclude: + match_type: regexp + metric_names: + - "^jmx_.*" + - "^process_.*" + - "^jvm_buffer_pool_.*" + - "^jvm_threads_.*" + - "^jvm_classes_.*" + - "^jvm_memory_(heap|non_heap)_(committed|init|max|used)_bytes$" + - "^jvm_compilation_.*" + - "^jvm_(runtime|info).*" + - "^jvm_memory_pool_(allocated_bytes_total|committed_bytes|init_bytes|collection_(committed|init|max)_bytes)$" + + # Include only cluster-level metrics for the cluster pipeline filter/include_cluster_metrics: metrics: include: match_type: regexp metric_names: - - "kafka\\.partition\\.offline" - - "kafka\\.(leader|unclean)\\.election\\.rate" - - "kafka\\.partition\\.non_preferred_leader" - - "kafka\\.broker\\.fenced\\.count" - - "kafka\\.cluster\\.partition\\.count" - - "kafka\\.cluster\\.topic\\.count" - + - "^kafka\\.partition\\.offline$" + - "^kafka\\.(leader|unclean)\\.election\\.rate$" + - "^kafka\\.partition\\.non_preferred_leader$" + - "^kafka\\.broker\\.fenced\\.count$" + - "^kafka\\.cluster\\.partition\\.count$" + - "^kafka\\.cluster\\.topic\\.count$" + + # Exclude cluster-level metrics from the broker pipeline filter/exclude_cluster_metrics: metrics: exclude: match_type: regexp metric_names: - - "kafka\\.partition\\.offline" - - "kafka\\.(leader|unclean)\\.election\\.rate" - - "kafka\\.partition\\.non_preferred_leader" - - "kafka\\.broker\\.fenced\\.count" - - "kafka\\.cluster\\.partition\\.count" - - "kafka\\.cluster\\.topic\\.count" - - transform/des_units: + - "^kafka\\.partition\\.offline$" + - "^kafka\\.(leader|unclean)\\.election\\.rate$" + - "^kafka\\.partition\\.non_preferred_leader$" + - "^kafka\\.broker\\.fenced\\.count$" + - "^kafka\\.cluster\\.partition\\.count$" + - "^kafka\\.cluster\\.topic\\.count$" + + # Remove unnecessary attributes + transform/remove_attributes: metric_statements: - context: metric statements: - set(description, "") where description != "" - set(unit, "") where unit != "" - - cumulativetodelta: - - metricstransform/kafka_topic_sum_aggregation: + - context: resource + statements: + - delete_key(attributes, "server.address") + - delete_key(attributes, "server.port") + - delete_key(attributes, "service.instance.id") + - delete_key(attributes, "host.name") + - delete_key(attributes, "url.scheme") + + # Aggregate partition metrics to topic level + metricstransform/topic-aggregation: transforms: - include: kafka.partition.replicas_in_sync action: insert @@ -679,7 +1524,6 @@ processors: - action: aggregate_labels label_set: [topic] aggregation_type: sum - - include: 
kafka.partition.replicas action: insert new_name: kafka.partition.replicas.total @@ -688,13 +1532,23 @@ processors: label_set: [topic] aggregation_type: sum - filter/remove_partition_level_replicas: + # Filter out original partition replicas metric + filter/exclude_partition_replicas_metric: metrics: exclude: match_type: strict metric_names: - kafka.partition.replicas_in_sync + # Filter internal Kafka topics as a safety net + filter/internal_topics: + metrics: + datapoint: + - 'attributes["topic"] != nil and IsMatch(attributes["topic"], "^__.*")' + + # Convert cumulative to delta metrics + cumulativetodelta: + groupbyattrs/cluster: keys: [kafka.cluster.name] @@ -708,59 +1562,76 @@ processors: aggregation_type: max label_set: [] -exporters: - otlp/newrelic: - endpoint: ${env:NEW_RELIC_OTLP_ENDPOINT} - headers: - api-key: ${env:NEW_RELIC_LICENSE_KEY} - compression: gzip - timeout: 30s - service: pipelines: - # Broker metrics pipeline (excludes cluster-level metrics) + # Application traces from instrumented Kafka clients and apps + traces: + receivers: [otlp] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] + + # Application metrics from instrumented Kafka clients and apps + metrics: + receivers: [otlp] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] + + # Application logs from instrumented Kafka clients and apps + logs: + receivers: [otlp] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] + + # Broker-level metrics from Prometheus JMX scraping metrics/broker: - receivers: [otlp, kafkametrics] + receivers: + - prometheus/kafka-jmx processors: - - resourcedetection - - resource + - resource/cluster-name + - filter/scrape-overhead + - transform/metric-naming + - transform/remove_attributes - filter/exclude_cluster_metrics - - filter/internal_topics - - transform/remove_extra_attributes - - transform/des_units + - memory_limiter - cumulativetodelta - - metricstransform/kafka_topic_sum_aggregation - - filter/remove_partition_level_replicas - - batch/aggregation - exporters: [otlp/newrelic] + - batch/export + exporters: + - otlp/backend - # Cluster metrics pipeline (only cluster-level metrics, no broker.id) - metrics/cluster: - receivers: [otlp] + # Cluster-level metrics from Prometheus JMX scraping + metrics/cluster/prometheus: + receivers: + - prometheus/kafka-jmx processors: - - resourcedetection - - resource + - resource/cluster-name + - filter/scrape-overhead + - transform/metric-naming + - transform/remove_attributes - filter/include_cluster_metrics - transform/remove_broker_id - - transform/remove_extra_attributes - - transform/des_units + - memory_limiter - cumulativetodelta - groupbyattrs/cluster - metricstransform/cluster_max - - batch/aggregation - exporters: [otlp/newrelic] - - # APM traces pipeline (producer + consumer spans via OTel Java Agent) - traces/apps: - receivers: [otlp] - processors: [resourcedetection, resource, batch/aggregation] - exporters: [otlp/newrelic] + - batch/export + exporters: + - otlp/backend - # APM logs pipeline (producer + consumer logs via OTel Java Agent) - logs/apps: - receivers: [otlp] - processors: [resourcedetection, resource, batch/aggregation] - exporters: [otlp/newrelic] + # Cluster-level metrics from Kafka metrics receiver (consumer lag, topics, partitions) + metrics/cluster/kafkametrics: + receivers: + - kafkametrics/cluster + processors: + - resource/cluster-name + - filter/internal_topics + - transform/remove_attributes + - metricstransform/topic-aggregation + - 
filter/exclude_partition_replicas_metric + - memory_limiter + - cumulativetodelta + - batch/export + exporters: + - otlp/backend ``` @@ -770,19 +1641,18 @@ service: > **Architecture highlights:** - * **OTLP receiver**: Receives Kafka and JMX metrics from OpenTelemetry Java Agent running on Kafka brokers via gRPC on port 4317 - * **Two pipelines approach**: Cluster-level metrics are sent without broker.id to map to cluster entity - * **Metric filtering**: Separates broker-specific metrics from cluster-level metrics to avoid duplication - * **Aggregation**: Automatically aggregates partition-level metrics by topic + * **OTLP receiver**: Listens on `0.0.0.0:4317` for application traces, metrics, and logs from instrumented Kafka clients + * **Prometheus receiver**: Scrapes each broker's `/metrics` endpoint on port `9404` using static host targets + * **Kafka metrics receiver**: Connects to the Kafka bootstrap port for consumer lag, topic, and partition metrics not available via JMX + * **Six pipelines**: Application traces/metrics/logs (OTLP), broker metrics, cluster-level JMX metrics (aggregated), and cluster-level Kafka metrics (consumer lag) + * **Metric naming**: Transforms Prometheus `_`-separated names to `.`-separated names matching New Relic dashboards - For advanced configuration options, refer to these receiver documentation pages: - - * [OTLP receiver documentation](https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver/otlpreceiver) - OTLP receiver configuration options + * [Prometheus receiver documentation](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) - Additional receiver configuration options * [Kafka metrics receiver documentation](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/kafkametricsreceiver) - Additional Kafka metrics configuration @@ -793,21 +1663,22 @@ service: ### Set environment variables [#env-vars] -Set the required environment variables before installing the collector: +Set the required environment variables **on the monitoring host** before starting the collector: ```bash export NEW_RELIC_LICENSE_KEY="YOUR_LICENSE_KEY" export KAFKA_CLUSTER_NAME="my-kafka-cluster" -export KAFKA_BOOTSTRAP_BROKER_ADDRESSES="localhost:9092" +export KAFKA_BOOTSTRAP_BROKER_ADDRESSES="broker1-host:9092,broker2-host:9092,broker3-host:9092" export NEW_RELIC_OTLP_ENDPOINT="https://otlp.nr-data.net:4317" # US region +# EU region: https://otlp.eu01.nr-data.net:4317 ``` Replace: * `YOUR_LICENSE_KEY` with your New Relic license key * `my-kafka-cluster` with a unique name for your Kafka cluster -* `localhost:9092` with your Kafka bootstrap broker address(es). For multiple brokers, use comma-separated list: `broker1:9092,broker2:9092,broker3:9092` -* OTLP endpoint: Uses `https://otlp.nr-data.net:4317` (US region) or `https://otlp.eu01.nr-data.net:4317` (EU region). -For other endpoint configurations, see [Configure your OTLP endpoint](/docs/opentelemetry/best-practices/opentelemetry-otlp/#configure-endpoint-port-protocol) +* `broker1-host:9092,broker2-host:9092,broker3-host:9092` with your Kafka bootstrap broker addresses + +For other endpoint configurations, see [Configure your OTLP endpoint](/docs/opentelemetry/best-practices/opentelemetry-otlp/#configure-endpoint-port-protocol). 
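Before installing the collector, you can confirm that each broker's JMX Exporter endpoint is reachable from the monitoring host — a sketch using the placeholder hostnames above; adjust to your broker addresses:

```bash
# Expect HTTP 200 from every broker's metrics endpoint on port 9404
for host in broker1-host broker2-host broker3-host; do
  curl -s -o /dev/null -w "$host:9404 -> HTTP %{http_code}\n" "http://$host:9404/metrics"
done
```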
@@ -815,7 +1686,7 @@ For other endpoint configurations, see [Configure your OTLP endpoint](/docs/open ### Install and start the collector [#install-start-collector] -Choose between NRDOT Collector (New Relic's distribution) or OpenTelemetry Collector: +Install and run the collector **on the monitoring host**. Choose between the NRDOT Collector (New Relic's distribution) and the OpenTelemetry Collector: @@ -826,13 +1697,11 @@ Choose between NRDOT Collector (New Relic's distribution) or OpenTelemetry Colle - **NRDOT Collector** is New Relic's distribution of OpenTelemetry Collector with New Relic support for assistance. + **NRDOT Collector** is New Relic's distribution of the OpenTelemetry Collector and is fully supported by New Relic. For more information, see the [NRDOT Collector GitHub repository](https://github.com/newrelic/nrdot-collector-releases/tree/main/distributions/nrdot-collector). **Download and install the binary** - Download and install the NRDOT Collector binary for your host operating system. The example below is for linux_amd64 architecture: - ```bash # Set version and architecture NRDOT_VERSION="1.9.0" @@ -849,44 +1718,32 @@ Choose between NRDOT Collector (New Relic's distribution) or OpenTelemetry Colle # Verify installation nrdot-collector --version ``` - For other operating systems and architectures, visit [NRDOT Collector releases](https://github.com/newrelic/nrdot-collector-releases/releases/latest) and download the appropriate binary for your system. **Start the collector** - Run the collector with your configuration file to begin monitoring: - ```bash - nrdot-collector --config ~/opentelemetry/kafka-config.yaml + nrdot-collector --config ~/opentelemetry/collector-kafka-config.yaml ``` - The collector will start sending Kafka metrics to New Relic within a few minutes. + The collector will start scraping Kafka metrics and sending them to New Relic within a few minutes. **Download and install the binary** - Download and install the OpenTelemetry Collector Contrib binary for your host operating system. The example below is for linux_amd64 architecture: - ```bash - # Set version and architecture # Check https://github.com/open-telemetry/opentelemetry-collector-releases/releases/latest for the latest version OTEL_VERSION="" ARCH="amd64" - # Download the collector curl -L -o otelcol-contrib.tar.gz \ "https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v${OTEL_VERSION}/otelcol-contrib_${OTEL_VERSION}_linux_${ARCH}.tar.gz" - # Extract the binary tar -xzf otelcol-contrib.tar.gz - - # Move to a location in PATH (optional) sudo mv otelcol-contrib /usr/local/bin/ - - # Verify installation otelcol-contrib --version ``` @@ -894,13 +1751,11 @@ Choose between NRDOT Collector (New Relic's distribution) or OpenTelemetry Colle **Start the collector** - Run the collector with your configuration file to begin monitoring: - ```bash - otelcol-contrib --config ~/opentelemetry/kafka-config.yaml + otelcol-contrib --config ~/opentelemetry/collector-kafka-config.yaml ``` - The collector will start sending Kafka metrics to New Relic within a few minutes. + The collector will start scraping Kafka metrics and sending them to New Relic within a few minutes. @@ -915,16 +1770,24 @@ Choose between NRDOT Collector (New Relic's distribution) or OpenTelemetry Colle **Language support**: Currently, only Java applications are supported for Kafka client instrumentation using the OpenTelemetry Java Agent.
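If you started the collector in the foreground, you may want it running in the background before you instrument your applications. One minimal approach (assuming the config path from the earlier steps) is `nohup`, whose `nohup.out` output file is also referenced in the troubleshooting section below:

```bash
# Keep the collector running after the shell session ends; output goes to nohup.out
nohup nrdot-collector --config ~/opentelemetry/collector-kafka-config.yaml &

# Or, if you installed the OpenTelemetry Collector Contrib distribution instead:
# nohup otelcol-contrib --config ~/opentelemetry/collector-kafka-config.yaml &
```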
-To collect application-level telemetry from your Kafka producer and consumer applications, use the OpenTelemetry Java Agent you downloaded in [Step 1](#download-java-agent). +To collect application-level telemetry from your Kafka producer and consumer applications, download the OpenTelemetry Java Agent if you haven't already: + +```bash +mkdir -p ~/opentelemetry +curl -L -o ~/opentelemetry/opentelemetry-javaagent.jar \ + https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar +``` Start your application with the agent: ```bash +OTEL_AGENT="$HOME/opentelemetry/opentelemetry-javaagent.jar" + java \ - -javaagent:$HOME/opentelemetry/opentelemetry-javaagent.jar \ + -javaagent:$OTEL_AGENT \ -Dotel.service.name="order-process-service" \ -Dotel.resource.attributes="kafka.cluster.name=my-kafka-cluster" \ - -Dotel.exporter.otlp.endpoint=http://localhost:4317 \ + -Dotel.exporter.otlp.endpoint=http://collector-host-ip:4317 \ -Dotel.exporter.otlp.protocol="grpc" \ -Dotel.metrics.exporter="otlp" \ -Dotel.traces.exporter="otlp" \ @@ -933,19 +1796,21 @@ java \ -Dotel.instrumentation.messaging.experimental.receive-telemetry.enabled="true" \ -Dotel.instrumentation.kafka.producer-propagation.enabled="true" \ -Dotel.instrumentation.kafka.enabled="true" \ + -Dotel.instrumentation.runtime-telemetry.enabled="false" \ -jar your-kafka-application.jar ``` Replace: * `order-process-service` with a unique name for your producer or consumer application * `my-kafka-cluster` with the same cluster name used in your collector configuration +* `collector-host-ip:4317` with the hostname or IP of the host running your OpenTelemetry Collector - The configuration above sends telemetry to an OpenTelemetry Collector running on localhost:4317. + The configuration above sends telemetry to an OpenTelemetry Collector running on `collector-host-ip:4317`. If you want a separate collector dedicated to application telemetry, create one with the following configuration: ```yaml @@ -977,17 +1842,9 @@ Replace: ``` - - This allows you to customize processing, add filters, or route to multiple backends. For other endpoint configurations, see [Configure your OTLP endpoint](/docs/opentelemetry/best-practices/opentelemetry-otlp/#configure-endpoint-port-protocol). -The Java Agent provides [out-of-the-box Kafka instrumentation](https://opentelemetry.io/docs/zero-code/java/spring-boot-starter/out-of-the-box-instrumentation/) with zero code changes, capturing: -* Request latencies -* Throughput metrics -* Error rates -* Distributed traces - -For advanced configuration, see the [Kafka instrumentation documentation](https://github.com/open-telemetry/opentelemetry-java-instrumentation/tree/main/instrumentation/kafka). +The Java Agent provides [out-of-the-box Kafka instrumentation](https://opentelemetry.io/docs/zero-code/java/spring-boot-starter/out-of-the-box-instrumentation/) with zero code changes, capturing request latencies, throughput metrics, error rates, and distributed traces. For advanced configuration, see the [Kafka instrumentation documentation](https://github.com/open-telemetry/opentelemetry-java-instrumentation/tree/main/instrumentation/kafka). @@ -1002,12 +1859,12 @@ To collect Kafka broker logs and send them to New Relic, configure the filelog r id="configure-log-collection" title="Configure log collection" > - Update your collector configuration at `~/opentelemetry/kafka-config.yaml` to add the filelog receiver. 
+ Update your collector configuration at `~/opentelemetry/collector-kafka-config.yaml` to add the filelog receiver. **Add to receivers section**: ```yaml receivers: - # ... existing receivers (otlp, kafkametrics) ... + # ... existing receivers (otlp, prometheus/kafka-jmx, kafkametrics/cluster) ... # File log receiver for Kafka broker logs filelog/kafka_broker_1: @@ -1025,20 +1882,20 @@ To collect Kafka broker logs and send them to New Relic, configure the filelog r ```yaml service: pipelines: - # ... existing pipelines (metrics/broker, metrics/cluster) ... + # ... existing pipelines ... # Logs pipeline for Kafka broker logs - logs: + logs/broker: receivers: [filelog/kafka_broker_1] - processors: [batch/aggregation, resourcedetection] - exporters: [otlp/newrelic] + processors: [memory_limiter, batch/export] + exporters: [otlp/backend] ``` **Configuration notes:** - * Update `/path/to/kafka/logs/server.log` to your actual Kafka log file path (e.g., `~/kafka/logs/server.log`) + * Update `/path/to/kafka/logs/server.log` to your actual Kafka log file path * The `broker.id` resource attribute correlates logs with specific broker metrics and entities * For multiple brokers, create separate `filelog` receivers (e.g., `filelog/kafka_broker_2`, `filelog/kafka_broker_3`) with their respective broker IDs - * The `multiline` pattern assumes logs start with `[` - adjust if your log format differs + * The `multiline` pattern assumes logs start with `[` — adjust if your log format differs * Consider log volume and collection costs before enabling log forwarding * For complete configuration options, see the [filelog receiver documentation](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver) @@ -1046,9 +1903,9 @@ To collect Kafka broker logs and send them to New Relic, configure the filelog r ```bash # If running in foreground, stop with Ctrl+C and restart - nrdot-collector --config ~/opentelemetry/kafka-config.yaml + nrdot-collector --config ~/opentelemetry/collector-kafka-config.yaml # Or for OpenTelemetry Collector - otelcol-contrib --config ~/opentelemetry/kafka-config.yaml + otelcol-contrib --config ~/opentelemetry/collector-kafka-config.yaml ``` @@ -1072,25 +1929,44 @@ To collect Kafka broker logs and send them to New Relic, configure the filelog r -## Advanced: Customize metrics collection [#customize-metrics] + + + + +## Find your data [#find-data] -You can add more Kafka metrics by extending the rules in `jmx-custom-config.yaml`: +After a few minutes, your Kafka data should appear in New Relic. See [Find your data](/docs/opentelemetry/integrations/kafka/find-and-query-data) for detailed instructions on exploring your Kafka data across different views in the New Relic UI. -- Learn about [OpenTelemetry JMX Metrics configuration syntax](https://github.com/open-telemetry/opentelemetry-java-instrumentation/tree/main/instrumentation/jmx-metrics#jmx-metric-insight---custom-yaml-definitions) -- Find available MBean names in the [Kafka monitoring documentation](https://kafka.apache.org/documentation/#monitoring) +**Metrics** -This allows you to collect any JMX metric exposed by Kafka brokers based on your specific monitoring needs. +Broker, topic, partition, consumer group, and JVM metrics are stored in the `Metric` event type. 
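If you're not sure which metrics have arrived yet, you can list the Kafka metric names reported to your account first. This exploratory query assumes the `kafka.` metric prefix used throughout this integration:

```sql
FROM Metric SELECT uniques(metricName) WHERE metricName LIKE 'kafka.%' SINCE 30 minutes ago
```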
Replace `my-kafka-cluster` with your `KAFKA_CLUSTER_NAME` value: -## Find your data [#find-data] +```sql +FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' SINCE 30 minutes ago +``` + + +**Logs** + +Application logs from producer and consumer services instrumented with the OpenTelemetry Java Agent are stored in the `Log` event type: + +```sql +FROM Log SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' SINCE 30 minutes ago +``` -After a few minutes, your Kafka metrics should appear in New Relic. See [Find your data](/docs/opentelemetry/integrations/kafka/find-and-query-data) for detailed instructions on exploring your Kafka metrics across different views in the New Relic UI. +**Traces** -You can also query your data with NRQL: +Producer and consumer spans, including per-message `publish` and `receive` operations across topics, are stored in the `Span` event type: ```sql -FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' +FROM Span SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' SINCE 30 minutes ago ``` + +## Example [#example] + +A complete working example with Docker Compose setup, OTel Collector config, OTel Java Agent configuration, and sample producer/consumer applications is available in the [New Relic OpenTelemetry Examples repository](https://github.com/newrelic/newrelic-opentelemetry-examples/tree/main/other-examples/collector/kafka/self-host-kafka). + ## Troubleshooting [#troubleshooting] @@ -1112,7 +1988,7 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' ps aux | grep -E "[n]rdot-collector|[o]telcol" ``` - If no results appear, the collector is not running. Start it following Step 6. + If no results appear, the collector is not running. Start it following the [Install and start the collector](#install-start-collector) step. **Check if Java Agent is attached to Kafka brokers**: ```bash @@ -1159,13 +2035,13 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' level: "debug" # Enable detailed collector internal logs ``` - **Add debug exporter**: View metrics in collector logs before sending to New Relic + **Add debug exporter**: View metrics in collector logs before sending to New Relic. 
The processor and exporter names differ by monitoring method: + + **Java Agent method**: ```yaml exporters: debug: verbosity: detailed - sampling_initial: 5 # Log first 5 metrics - sampling_thereafter: 200 # Then log every 200th metric otlp/newrelic: endpoint: https://otlp.nr-data.net:4317 @@ -1178,15 +2054,44 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' pipelines: metrics/broker: receivers: [otlp, kafkametrics] - processors: [resourcedetection, resource, filter/exclude_cluster_metrics, transform/des_units, cumulativetodelta, metricstransform/kafka_topic_sum_aggregation, batch/aggregation] + processors: [resourcedetection, resource, filter/exclude_cluster_metrics, filter/internal_topics, transform/remove_extra_attributes, transform/des_units, cumulativetodelta, metricstransform/kafka_topic_sum_aggregation, filter/remove_partition_level_replicas, batch/aggregation] exporters: [debug, otlp/newrelic] # Add debug exporter metrics/cluster: receivers: [otlp] - processors: [resourcedetection, resource, filter/include_cluster_metrics, transform/remove_broker_id, transform/des_units, cumulativetodelta, groupbyattrs/cluster, metricstransform/cluster_max, batch/aggregation] + processors: [resourcedetection, resource, filter/include_cluster_metrics, transform/remove_broker_id, transform/remove_extra_attributes, transform/des_units, cumulativetodelta, groupbyattrs/cluster, metricstransform/cluster_max, batch/aggregation] exporters: [debug, otlp/newrelic] # Add debug exporter ``` + **Prometheus JMX Exporter method**: + ```yaml + exporters: + debug: + verbosity: detailed + + otlp/backend: + endpoint: ${env:NEW_RELIC_OTLP_ENDPOINT} + headers: + api-key: ${env:NEW_RELIC_LICENSE_KEY} + + service: + pipelines: + metrics/broker: + receivers: [prometheus/kafka-jmx] + processors: [resource/cluster-name, filter/scrape-overhead, transform/metric-naming, transform/remove_attributes, filter/exclude_cluster_metrics, memory_limiter, cumulativetodelta, batch/export] + exporters: [debug, otlp/backend] # Add debug exporter + + metrics/cluster/prometheus: + receivers: [prometheus/kafka-jmx] + processors: [resource/cluster-name, filter/scrape-overhead, transform/metric-naming, transform/remove_attributes, filter/include_cluster_metrics, transform/remove_broker_id, memory_limiter, cumulativetodelta, groupbyattrs/cluster, metricstransform/cluster_max, batch/export] + exporters: [debug, otlp/backend] # Add debug exporter + + metrics/cluster/kafkametrics: + receivers: [kafkametrics/cluster] + processors: [resource/cluster-name, filter/internal_topics, transform/remove_attributes, metricstransform/topic-aggregation, filter/exclude_partition_replicas_metric, memory_limiter, cumulativetodelta, batch/export] + exporters: [debug, otlp/backend] # Add debug exporter + ``` + Then restart the collector and check logs: ```bash # Check collector output log @@ -1233,7 +2138,7 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' ls -lh nohup.out 2>/dev/null && tail -100 nohup.out | grep -i "otel\|jmx" || echo "No nohup.out file found" ``` - **Verify Java Agent configuration**: Ensure the startup command matches Step 3 + **Verify Java Agent configuration**: Ensure the startup command matches the [Configure Kafka broker](#configure-broker) step ```bash # Check if broker was started with correct Java agent parameters ps aux | grep "[o]pentelemetry-javaagent" | grep -o "Dotel\.[^ ]*" @@ -1323,6 +2228,8 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' ``` **4. 
Optimize batch processing**: Reduce in-memory batch size + + Java Agent method — update `batch/aggregation`: ```yaml processors: batch/aggregation: @@ -1330,7 +2237,17 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' timeout: 60s ``` + Prometheus JMX Exporter method — update `batch/export`: + ```yaml + processors: + batch/export: + send_batch_size: 512 # Reduce from 1024 + timeout: 30s + ``` + **5. Add a memory limiter**: Prevent the collector from exceeding a memory threshold + + Java Agent method: ```yaml processors: memory_limiter: @@ -1346,7 +2263,7 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' pipelines: metrics/broker: receivers: [otlp, kafkametrics] - processors: [memory_limiter, resourcedetection, resource, filter/exclude_cluster_metrics, transform/remove_extra_attributes, transform/des_units, cumulativetodelta, metricstransform/kafka_topic_sum_aggregation, filter/remove_partition_level_replicas, batch/aggregation] + processors: [memory_limiter, resourcedetection, resource, filter/exclude_cluster_metrics, filter/internal_topics, transform/remove_extra_attributes, transform/des_units, cumulativetodelta, metricstransform/kafka_topic_sum_aggregation, filter/remove_partition_level_replicas, batch/aggregation] exporters: [otlp/newrelic] metrics/cluster: receivers: [otlp] @@ -1354,16 +2271,44 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' exporters: [otlp/newrelic] ``` + Prometheus JMX Exporter method: + ```yaml + processors: + memory_limiter: + check_interval: 1s + limit_mib: 512 + spike_limit_mib: 128 + + batch/export: + send_batch_size: 512 + timeout: 30s + + service: + pipelines: + metrics/broker: + receivers: [prometheus/kafka-jmx] + processors: [memory_limiter, resource/cluster-name, filter/scrape-overhead, transform/metric-naming, transform/remove_attributes, filter/exclude_cluster_metrics, cumulativetodelta, batch/export] + exporters: [otlp/backend] + metrics/cluster/prometheus: + receivers: [prometheus/kafka-jmx] + processors: [memory_limiter, resource/cluster-name, filter/scrape-overhead, transform/metric-naming, transform/remove_attributes, filter/include_cluster_metrics, transform/remove_broker_id, cumulativetodelta, groupbyattrs/cluster, metricstransform/cluster_max, batch/export] + exporters: [otlp/backend] + metrics/cluster/kafkametrics: + receivers: [kafkametrics/cluster] + processors: [memory_limiter, resource/cluster-name, filter/internal_topics, transform/remove_attributes, metricstransform/topic-aggregation, filter/exclude_partition_replicas_metric, cumulativetodelta, batch/export] + exporters: [otlp/backend] + ``` + **6. Restart the collector after changes**: ```bash # Find the collector process ID and stop it - pkill -f "kafka-config.yaml" + pkill -f "collector-kafka-config.yaml" # Restart NRDOT Collector - nrdot-collector --config ~/opentelemetry/kafka-config.yaml + nrdot-collector --config ~/opentelemetry/collector-kafka-config.yaml # Or restart OpenTelemetry Collector - otelcol-contrib --config ~/opentelemetry/kafka-config.yaml + otelcol-contrib --config ~/opentelemetry/collector-kafka-config.yaml ``` @@ -1375,7 +2320,7 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' **1. 
Remove the Additional metrics section from the JMX config** - In `~/opentelemetry/jmx-custom-config.yaml`, delete everything below this comment (through the end of the file): + In `~/opentelemetry/kafka-jmx-config.yaml`, delete everything below this comment (through the end of the file): ```yaml # ── Additional metrics — remove this section to reduce data ingest ─────────── @@ -1386,7 +2331,7 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' **2. Disable consumer offset metrics in the kafkametrics receiver** - In `~/opentelemetry/kafka-config.yaml`, add to the `kafkametrics` receiver `metrics` section: + In `~/opentelemetry/collector-kafka-config.yaml`, add to the `kafkametrics` receiver `metrics` section: ```yaml receivers: @@ -1407,4 +2352,4 @@ FROM Metric SELECT * WHERE kafka.cluster.name = 'my-kafka-cluster' * [Explore Kafka metrics](/docs/opentelemetry/integrations/kafka/metrics-reference) - View the complete metrics reference * [Create custom dashboards](/docs/query-your-data/explore-query-data/dashboards/introduction-dashboards) - Build visualizations for your Kafka data -* [Set up alerts](/docs/opentelemetry/integrations/kafka/metrics-reference/#alerting) - Monitor critical metrics like consumer lag and under-replicated partitions +* [Set up alerts](/docs/opentelemetry/integrations/kafka/find-and-query-data#alerts) - Monitor critical metrics like consumer lag and under-replicated partitions diff --git a/src/nav/opentelemetry.yml b/src/nav/opentelemetry.yml index 83ae70147ee..3fc51a6c4e7 100644 --- a/src/nav/opentelemetry.yml +++ b/src/nav/opentelemetry.yml @@ -92,11 +92,13 @@ pages: path: /docs/opentelemetry/integrations/elasticsearch/troubleshooting - title: Kafka integration pages: - - title: OTel Kafka overview + - title: Overview path: /docs/opentelemetry/integrations/kafka/overview - title: Self-hosted Kafka path: /docs/opentelemetry/integrations/kafka/self-hosted - - title: Kubernetes (Strimzi) + - title: Kubernetes self-managed + path: /docs/opentelemetry/integrations/kafka/kubernetes-self-managed + - title: Kubernetes Strimzi path: /docs/opentelemetry/integrations/kafka/kubernetes-strimzi - title: Find and query your data path: /docs/opentelemetry/integrations/kafka/find-and-query-data diff --git a/static/images/otel-kafka-self-host-architecture.webp b/static/images/otel-kafka-self-host-architecture.webp index 89e08f83955..cc40b3365f6 100644 Binary files a/static/images/otel-kafka-self-host-architecture.webp and b/static/images/otel-kafka-self-host-architecture.webp differ diff --git a/static/images/otel-kafka-self-managed-k8s-architecture.webp b/static/images/otel-kafka-self-managed-k8s-architecture.webp new file mode 100644 index 00000000000..e4ae78064cf Binary files /dev/null and b/static/images/otel-kafka-self-managed-k8s-architecture.webp differ