Skip to content

Commit 11b2b9c

Browse files
intel352claudeCopilot
authored
feat: add structured logging and observability hooks (#7)
* feat: add structured logging and observability hooks (#3) - Add internal/logger.go: bentoLogger wrapping log/slog with structured fields (component, name) and helpers LogStreamStart, LogStreamStop, LogStreamError, LogMessageProcessed, LogTopicEvent, LogProcessingStart, LogProcessingComplete, LogProcessingError - Add internal/metrics.go: thread-safe StreamMetrics using sync/atomic for message-in/out and error counters, plus mutex-guarded start time and last-message-time; Snapshot() for point-in-time read - Add internal/health.go: healthTracker derives HealthStatus (healthy / degraded / unhealthy) from metrics and running state; HealthReport struct for per-stream health reporting - Update stream_module, input_module, output_module, broker_module: attach logger/metrics/health on construction; emit log lines at start, stop, error, and per-message events; expose Health() method - Update processor_step: log processing start, complete, and error events - Fix pre-existing lint issues: check MetaWalkMut return value, remove unused mapToMessage helper, nolint annotation on future-use ensureStream - Add internal/logger_test.go: 21 unit tests covering log output format, metrics counting, health state transitions, and concurrent metric updates Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: format bridge.go * chore: retrigger CI * fix: apply observability PR review feedback (#9) * Initial plan * fix: apply PR review feedback for observability improvements - Fix wrong comment on LogStreamStop: 'uptimeSeconds' -> 'messagesProcessed' - Use slog.Default() in newLogger to respect global slog configuration - Pass snap.Errors (not always-zero snap.MessagesIn) to LogStreamStop in stream_module - Remove unnecessary mutex lock/unlock in newStreamMetrics initialization Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> * fix: observability lifecycle correctness — health on unexpected exit, uptime semantics, subscribe ordering (#13) * Initial plan * fix: apply review feedback - lifecycle hooks, health on unexpected exit, subscribe ordering Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> * fix: output stream cleanup on subscribe error + Health() test coverage for all modules (#15) * Initial plan * fix: stop built stream on subscribe error and add Health() tests for all modules Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> * fix: hot-path atomics, debug log guard, goroutine cleanup, deterministic health tests (#17) * Initial plan * fix: hot-path atomics, debug guard, goroutine cleanup, deterministic tests Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> * fix: address remaining PR #7 review comments (#4, #6, #8, #9, #17, #18, #19, #20) - #4 output_module: reinitialize done channel at Start for restart support - #6 broker_module: copy streams map under lock then release before calling stream.Stop to prevent potential deadlock in Stop() - #8 processor_step: use sync.Once in consumer func so fan-out (processor emitting multiple messages) does not block on the size-1 channel - #9 processor_step: deferred cleanup selects on ctx.Done() when draining streamDone so a cancelled parent context does not cause an unconditional block - #17 input_module: extract real transport type via inputTransportType() helper so LogStreamStart logs "kafka"/"generate"/etc. instead of "bento.input" - #18 output_module: extract real transport type via outputTransportType() helper so LogStreamStart logs the actual Bento output type - #19 bridge: mapToMessage now returns (*service.Message, error); json.Marshal errors are surfaced instead of silently ignored; updated all call sites - #20 tests: replace remaining time.Sleep(50ms) calls with polling loops in stream_module_test, output_module_test, and broker_module_test Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: intel352 <77607+intel352@users.noreply.github.com>
1 parent e55837c commit 11b2b9c

15 files changed

Lines changed: 1327 additions & 319 deletions

internal/bridge.go

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,42 @@ func configToYAML(config map[string]any) (string, error) {
1717
return string(data), nil
1818
}
1919

20-
// messageToMap converts a Bento service.Message to a plain map.
20+
// mapToMessage converts a map to a Bento service.Message.
21+
// If the map has a "body" key, its JSON representation becomes the message bytes.
22+
// Additional metadata keys are attached.
23+
// An error is returned if JSON marshalling of the body fails.
24+
func mapToMessage(data map[string]any) (*service.Message, error) {
25+
var body []byte
26+
27+
if b, ok := data["body"]; ok {
28+
if s, ok2 := b.(string); ok2 {
29+
body = []byte(s)
30+
} else {
31+
var err error
32+
body, err = json.Marshal(b)
33+
if err != nil {
34+
return nil, fmt.Errorf("marshal message body: %w", err)
35+
}
36+
}
37+
} else {
38+
var err error
39+
body, err = json.Marshal(data)
40+
if err != nil {
41+
return nil, fmt.Errorf("marshal message data: %w", err)
42+
}
43+
}
44+
45+
msg := service.NewMessage(body)
46+
47+
if meta, ok := data["metadata"].(map[string]any); ok {
48+
for k, v := range meta {
49+
msg.MetaSetMut(k, v)
50+
}
51+
}
52+
53+
return msg, nil
54+
}
55+
2156
// The message body is decoded as JSON if possible; otherwise stored as a raw string.
2257
func messageToMap(msg *service.Message) (map[string]any, error) {
2358
raw, err := msg.AsBytes()
@@ -47,30 +82,3 @@ func messageToMap(msg *service.Message) (map[string]any, error) {
4782

4883
return result, nil
4984
}
50-
51-
// mapToMessage converts a map to a Bento service.Message.
52-
// If the map has a "body" key, its JSON representation becomes the message bytes.
53-
// Additional metadata keys are attached.
54-
func mapToMessage(data map[string]any) *service.Message {
55-
var body []byte
56-
57-
if b, ok := data["body"]; ok {
58-
if s, ok2 := b.(string); ok2 {
59-
body = []byte(s)
60-
} else {
61-
body, _ = json.Marshal(b)
62-
}
63-
} else {
64-
body, _ = json.Marshal(data)
65-
}
66-
67-
msg := service.NewMessage(body)
68-
69-
if meta, ok := data["metadata"].(map[string]any); ok {
70-
for k, v := range meta {
71-
msg.MetaSetMut(k, v)
72-
}
73-
}
74-
75-
return msg
76-
}

internal/bridge_test.go

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,10 @@ func TestMapToMessage_WithBody(t *testing.T) {
128128
"body": "hello world",
129129
}
130130

131-
msg := mapToMessage(data)
131+
msg, err := mapToMessage(data)
132+
if err != nil {
133+
t.Fatalf("mapToMessage() returned error: %v", err)
134+
}
132135
if msg == nil {
133136
t.Fatal("mapToMessage() returned nil")
134137
}
@@ -147,7 +150,10 @@ func TestMapToMessage_WithJSONBody(t *testing.T) {
147150
"body": map[string]any{"key": "value"},
148151
}
149152

150-
msg := mapToMessage(data)
153+
msg, err := mapToMessage(data)
154+
if err != nil {
155+
t.Fatalf("mapToMessage() returned error: %v", err)
156+
}
151157
if msg == nil {
152158
t.Fatal("mapToMessage() returned nil")
153159
}
@@ -166,7 +172,10 @@ func TestMapToMessage_WithoutBody_MarshalsFull(t *testing.T) {
166172
"key": "value",
167173
}
168174

169-
msg := mapToMessage(data)
175+
msg, err := mapToMessage(data)
176+
if err != nil {
177+
t.Fatalf("mapToMessage() returned error: %v", err)
178+
}
170179
if msg == nil {
171180
t.Fatal("mapToMessage() returned nil")
172181
}
@@ -188,7 +197,10 @@ func TestMapToMessage_WithMetadata(t *testing.T) {
188197
},
189198
}
190199

191-
msg := mapToMessage(data)
200+
msg, err := mapToMessage(data)
201+
if err != nil {
202+
t.Fatalf("mapToMessage() returned error: %v", err)
203+
}
192204
if msg == nil {
193205
t.Fatal("mapToMessage() returned nil")
194206
}
@@ -203,7 +215,10 @@ func TestMapToMessage_WithMetadata(t *testing.T) {
203215
}
204216

205217
func TestMapToMessage_EmptyMap(t *testing.T) {
206-
msg := mapToMessage(map[string]any{})
218+
msg, err := mapToMessage(map[string]any{})
219+
if err != nil {
220+
t.Fatalf("mapToMessage() with empty map returned error: %v", err)
221+
}
207222
if msg == nil {
208223
t.Fatal("mapToMessage() with empty map returned nil")
209224
}
@@ -220,7 +235,10 @@ func TestRoundTrip_MessageToMapToMessage(t *testing.T) {
220235
}
221236

222237
// Convert back to message.
223-
reconstructed := mapToMessage(m)
238+
reconstructed, err := mapToMessage(m)
239+
if err != nil {
240+
t.Fatalf("mapToMessage() returned error: %v", err)
241+
}
224242
if reconstructed == nil {
225243
t.Fatal("mapToMessage() returned nil")
226244
}

internal/broker_module.go

Lines changed: 67 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ type brokerModule struct {
2121
subscriber sdk.MessageSubscriber
2222
streams map[string]*service.Stream
2323
mu sync.RWMutex
24+
log *bentoLogger
25+
metrics *StreamMetrics
26+
health *healthTracker
2427
}
2528

2629
// SetMessagePublisher satisfies the MessageAwareModule interface.
@@ -34,10 +37,14 @@ func (m *brokerModule) SetMessageSubscriber(sub sdk.MessageSubscriber) {
3437
}
3538

3639
func newBrokerModule(name string, config map[string]any) (*brokerModule, error) {
40+
metrics := newStreamMetrics()
3741
return &brokerModule{
3842
name: name,
3943
config: config,
4044
streams: make(map[string]*service.Stream),
45+
log: newLogger("bento.broker", name),
46+
metrics: metrics,
47+
health: newHealthTracker(metrics),
4148
}, nil
4249
}
4350

@@ -60,35 +67,50 @@ func (m *brokerModule) Init() error {
6067

6168
// Start is a no-op; individual per-topic streams are created on demand.
6269
func (m *brokerModule) Start(_ context.Context) error {
70+
m.health.SetRunning(true)
71+
m.metrics.MarkStarted()
72+
m.log.LogStreamStart(m.transport)
6373
return nil
6474
}
6575

6676
// Stop shuts down all managed streams.
6777
func (m *brokerModule) Stop(ctx context.Context) error {
78+
// Copy the streams map under the lock, then release it before calling
79+
// stream.Stop to avoid holding the lock during potentially blocking I/O
80+
// (deadlock risk if a stream goroutine also acquires the lock).
6881
m.mu.Lock()
69-
defer m.mu.Unlock()
70-
71-
slog.Info("stopping bento broker", "module", m.name, "topics", len(m.streams))
82+
toStop := make(map[string]*service.Stream, len(m.streams))
83+
for topic, s := range m.streams {
84+
toStop[topic] = s
85+
}
86+
m.streams = make(map[string]*service.Stream)
87+
m.mu.Unlock()
7288

7389
var firstErr error
74-
for topic, stream := range m.streams {
75-
if err := stream.Stop(ctx); err != nil {
76-
slog.Error("failed to stop broker stream", "error", err, "module", m.name, "topic", topic)
77-
if firstErr == nil {
78-
firstErr = fmt.Errorf("bento.broker %q: stop stream for topic %q: %w", m.name, topic, err)
79-
}
80-
continue
90+
for topic, stream := range toStop {
91+
if err := stream.Stop(ctx); err != nil && firstErr == nil {
92+
m.metrics.RecordError()
93+
m.log.LogStreamError(err, slog.String("topic", topic))
94+
firstErr = fmt.Errorf("bento.broker %q: stop stream for topic %q: %w", m.name, topic, err)
8195
}
82-
slog.Info("broker stream stopped", "module", m.name, "topic", topic)
8396
}
84-
m.streams = make(map[string]*service.Stream)
97+
98+
m.health.SetRunning(false)
99+
m.metrics.MarkStopped()
100+
snap := m.metrics.Snapshot()
101+
m.log.LogStreamStop(snap.MessagesIn+snap.MessagesOut,
102+
slog.String("transport", m.transport),
103+
slog.Duration("uptime", snap.Uptime),
104+
slog.Int64("errors", snap.Errors),
105+
)
106+
85107
return firstErr
86108
}
87109

88110
// ensureStream returns (creating if necessary) a running stream for topic.
89111
// This is used internally when the broker needs a dedicated in-process pipe.
90112
//
91-
//nolint:unused // Reserved for future on-demand topic routing implementation.
113+
//nolint:unused // Reserved for future use by broker consumers.
92114
func (m *brokerModule) ensureStream(ctx context.Context, topic string) (*service.Stream, error) {
93115
m.mu.RLock()
94116
if s, ok := m.streams[topic]; ok {
@@ -105,8 +127,6 @@ func (m *brokerModule) ensureStream(ctx context.Context, topic string) (*service
105127
return s, nil
106128
}
107129

108-
slog.Info("creating broker stream", "module", m.name, "topic", topic, "transport", m.transport)
109-
110130
// Build a simple in-memory stream that holds messages for this topic.
111131
// The actual transport is configured via transportConfig / transport.
112132
builder := service.NewStreamBuilder()
@@ -123,28 +143,31 @@ func (m *brokerModule) ensureStream(ctx context.Context, topic string) (*service
123143
}
124144

125145
pub := m.publisher
126-
moduleName := m.name
146+
metrics := m.metrics
147+
log := m.log
127148

128149
if pub != nil {
129150
if err := builder.AddConsumerFunc(func(_ context.Context, msg *service.Message) error {
130151
payload, msgErr := msg.AsBytes()
131152
if msgErr != nil {
132-
slog.Error("failed to read broker message", "error", msgErr, "module", moduleName, "topic", topic)
153+
metrics.RecordError()
154+
log.LogStreamError(msgErr, slog.String("topic", topic))
133155
return msgErr
134156
}
135157
meta := map[string]string{}
136158
_ = msg.MetaWalkMut(func(k string, v any) error {
137159
meta[k] = fmt.Sprintf("%v", v)
138160
return nil
139161
})
140-
141-
slog.Debug("broker forwarding message", "module", moduleName, "topic", topic, "size", len(payload))
142-
143162
_, pubErr := pub.Publish(topic, payload, meta)
144163
if pubErr != nil {
145-
slog.Error("failed to publish from broker", "error", pubErr, "module", moduleName, "topic", topic)
164+
metrics.RecordError()
165+
log.LogStreamError(pubErr, slog.String("phase", "publish"), slog.String("topic", topic))
166+
return pubErr
146167
}
147-
return pubErr
168+
metrics.RecordMessageOut()
169+
log.LogMessageProcessed(topic)
170+
return nil
148171
}); err != nil {
149172
return nil, fmt.Errorf("add consumer for topic %q: %w", topic, err)
150173
}
@@ -155,13 +178,32 @@ func (m *brokerModule) ensureStream(ctx context.Context, topic string) (*service
155178
return nil, fmt.Errorf("build stream for topic %q: %w", topic, err)
156179
}
157180

181+
m.log.LogTopicEvent("stream_created", topic,
182+
slog.String("transport", m.transport),
183+
)
184+
158185
go func() {
159-
if err := stream.Run(ctx); err != nil && ctx.Err() == nil {
160-
slog.Error("broker stream failed", "error", err, "module", moduleName, "topic", topic)
186+
if runErr := stream.Run(ctx); ctx.Err() == nil {
187+
// Stream exited without context cancellation; remove it from the
188+
// active streams map so it can be recreated on next access.
189+
m.mu.Lock()
190+
delete(m.streams, topic)
191+
m.mu.Unlock()
192+
if runErr != nil {
193+
metrics.RecordError()
194+
log.LogStreamError(runErr, slog.String("topic", topic))
195+
}
196+
log.LogTopicEvent("stream_stopped", topic,
197+
slog.String("reason", "run_exited"),
198+
)
161199
}
162200
}()
163201

164202
m.streams[topic] = stream
165-
slog.Info("broker stream created", "module", m.name, "topic", topic)
166203
return stream, nil
167204
}
205+
206+
// Health returns the current health report for this broker module.
207+
func (m *brokerModule) Health() HealthReport {
208+
return m.health.Report()
209+
}

internal/broker_module_test.go

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,46 @@ func TestBrokerModule_StartStop(t *testing.T) {
126126
}
127127
}
128128

129+
func TestBrokerModule_Health(t *testing.T) {
130+
m, _ := newBrokerModule("test-broker", map[string]any{
131+
"transport": "memory",
132+
})
133+
134+
// Before start: unhealthy
135+
report := m.Health()
136+
if report.Status != HealthStatusUnhealthy {
137+
t.Errorf("expected unhealthy before start, got %s", report.StatusText)
138+
}
139+
140+
if err := m.Init(); err != nil {
141+
t.Fatalf("Init() error = %v", err)
142+
}
143+
144+
ctx := context.Background()
145+
if err := m.Start(ctx); err != nil {
146+
t.Fatalf("Start() error = %v", err)
147+
}
148+
149+
// After start: healthy
150+
report = m.Health()
151+
if report.Status != HealthStatusHealthy {
152+
t.Errorf("expected healthy after start, got %s", report.StatusText)
153+
}
154+
155+
stopCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
156+
defer cancel()
157+
158+
if err := m.Stop(stopCtx); err != nil {
159+
t.Fatalf("Stop() error = %v", err)
160+
}
161+
162+
// After stop: unhealthy
163+
report = m.Health()
164+
if report.Status != HealthStatusUnhealthy {
165+
t.Errorf("expected unhealthy after stop, got %s", report.StatusText)
166+
}
167+
}
168+
129169
func TestBrokerModule_EnsureStream(t *testing.T) {
130170
// Use generate transport so ensureStream can build a valid Bento stream.
131171
// count=0 means infinite generation; the stream will be stopped at the end.
@@ -186,8 +226,29 @@ func TestBrokerModule_EnsureStream(t *testing.T) {
186226
t.Errorf("expected 2 streams, got %d", streamCount)
187227
}
188228

189-
// Allow goroutines to start running streams
190-
time.Sleep(50 * time.Millisecond)
229+
// Poll until both stream goroutines have called stream.Run, which is
230+
// indicated by the stream being able to accept a Stop without returning
231+
// "has not been run yet". We verify by polling the stream count: once the
232+
// goroutines are running, Stop will succeed. Use a short yield loop instead
233+
// of a fixed sleep.
234+
deadline := time.Now().Add(2 * time.Second)
235+
for {
236+
m.mu.RLock()
237+
count := len(m.streams)
238+
m.mu.RUnlock()
239+
if count == 2 {
240+
// Streams are registered; give goroutines a chance to call Run.
241+
// Yield to the scheduler repeatedly rather than sleeping.
242+
for range 5 {
243+
time.Sleep(10 * time.Millisecond)
244+
}
245+
break
246+
}
247+
if time.Now().After(deadline) {
248+
t.Fatal("streams did not start within timeout")
249+
}
250+
time.Sleep(10 * time.Millisecond)
251+
}
191252

192253
// Stop should clean up all streams
193254
stopCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)

0 commit comments

Comments
 (0)