Skip to content

Commit 5cea372

Browse files
committed
Add tracing and spans, as well as ability to send them to clients via headers
1 parent eab4771 commit 5cea372

11 files changed

Lines changed: 598 additions & 2 deletions

File tree

browserup-proxy.schema.json

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,14 @@
433433
"_href": {
434434
"type": "string",
435435
"description": "Top level href, including hashtag, etc per the browser"
436+
},
437+
"_span_id": {
438+
"type": "string",
439+
"description": "W3C Trace Context span ID for this page"
440+
},
441+
"_parent_id": {
442+
"type": "string",
443+
"description": "W3C Trace Context parent span ID (typically the HAR log span ID)"
436444
}
437445
}
438446
},
@@ -856,6 +864,14 @@
856864
"$ref": "#/components/schemas/HarEntry"
857865
}
858866
},
867+
"_trace_id": {
868+
"type": "string",
869+
"description": "W3C Trace Context trace ID for distributed tracing"
870+
},
871+
"_span_id": {
872+
"type": "string",
873+
"description": "W3C Trace Context span ID for this HAR trace root"
874+
},
859875
"comment": {
860876
"type": "string"
861877
}
@@ -1314,6 +1330,18 @@
13141330
"$ref": "#/components/schemas/WebSocketMessage"
13151331
}
13161332
},
1333+
"_span_id": {
1334+
"type": "string",
1335+
"description": "W3C Trace Context span ID for this entry"
1336+
},
1337+
"_parent_id": {
1338+
"type": "string",
1339+
"description": "W3C Trace Context parent span ID (typically the page span ID)"
1340+
},
1341+
"_trace_id": {
1342+
"type": "string",
1343+
"description": "W3C Trace Context trace ID for distributed tracing"
1344+
},
13171345
"connection": {
13181346
"type": "string"
13191347
},
@@ -1367,6 +1395,14 @@
13671395
},
13681396
"default": []
13691397
},
1398+
"_span_id": {
1399+
"type": "string",
1400+
"description": "W3C Trace Context span ID for this page"
1401+
},
1402+
"_parent_id": {
1403+
"type": "string",
1404+
"description": "W3C Trace Context parent span ID (typically the HAR log span ID)"
1405+
},
13701406
"pageTimings": {
13711407
"$ref": "#/components/schemas/PageTimings"
13721408
},

mitmproxy/addons/browserup/har/flow_capture.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,115 @@
4848

4949

5050
class FlowCaptureMixin(object):
51+
def is_browser_user_agent(self, user_agent):
    """Report whether a User-Agent string contains a known browser token.

    The match is a case-insensitive substring search for any of the
    browser/engine names in the pattern below. An empty or missing
    User-Agent is never considered a browser.
    """
    import re

    browser_tokens = r"Mozilla|Chrome|Firefox|Safari|Edge|Gecko|WebKit|Ladybird|Lightpanda"
    if user_agent:
        return re.search(browser_tokens, user_agent, re.IGNORECASE) is not None
    return False
59+
60+
def is_first_request_in_page(self, flow):
    """Tell whether ``flow`` is the first request recorded for the current page.

    The request counts as first when the page has no entries yet, or when
    its only entry carries the same URL as this flow's request (as computed
    by ``self.get_full_url``).
    """
    current_page = self.get_or_create_current_page()
    recorded = self.entries_for_page(current_page["id"])
    recorded_count = len(recorded)

    if recorded_count == 0:
        return True
    if recorded_count != 1:
        return False

    # Exactly one entry: it is "first" only if that entry is this very URL.
    return recorded[0]["request"]["url"] == self.get_full_url(flow.request)
72+
73+
def is_html_content_type(self, flow):
    """Report whether the flow's response declares an HTML Content-Type.

    A flow without a response yields False. The comparison is a
    case-insensitive substring test for ``text/html``.
    """
    response = flow.response
    if not response:
        return False
    declared_type = response.headers.get("Content-Type", "")
    return "text/html" in declared_type.lower()
77+
78+
def decorate_request_with_trace_headers(self, flow):
    """Add or update the W3C Trace Context headers on an outgoing request.

    The HAR entry attached to ``flow`` supplies ``_trace_id`` and
    ``_span_id``. When either is missing the request is left untouched.

    Side effects on the HAR entry:
      * ``_potential_browser_root`` is set to True when this is the first
        request of the current page and its User-Agent looks like a browser.
      * ``_trace_id`` is overwritten with the inbound trace ID when the
        request already carries a well-formed ``traceparent``.

    Side effects on the request headers:
      * ``traceparent`` is always written. A well-formed inbound header keeps
        its version, trace ID and flags but gets this entry's span ID; a
        missing or malformed one is replaced by ``00-<trace>-<span>-01``.
      * ``tracestate`` gets a ``browserup=<span_id>`` member on the left.
        Any previous ``browserup`` member and empty members are removed and
        the list is capped at 32 members.

    Returns None.
    """
    # Function-scope import, matching how this mixin already imports ``re``.
    import re

    har_entry = flow.get_har_entry()
    trace_id = har_entry.get("_trace_id")
    span_id = har_entry.get("_span_id")

    # Without both IDs there is nothing meaningful to propagate.
    if not trace_id or not span_id:
        logging.debug("No trace info available for request")
        return

    headers = flow.request.headers

    # Mark a candidate browser root; the response handler confirms it later.
    if self.is_first_request_in_page(flow):
        user_agent = headers.get("User-Agent", "")
        if self.is_browser_user_agent(user_agent):
            har_entry["_potential_browser_root"] = True
            logging.debug(f"Potential browser root request detected: {flow.request.url}")

    # --- traceparent -------------------------------------------------------
    fresh_traceparent = f"00-{trace_id}-{span_id}-01"
    inbound_rejected = False
    existing_traceparent = headers.get("traceparent")

    if existing_traceparent:
        # Field count alone is not enough: a malformed ID would otherwise be
        # forwarded and copied into the HAR. Require lowercase hex of the
        # exact widths, and reject version "ff" and all-zero IDs.
        parsed = re.fullmatch(
            r"([0-9a-f]{2})-([0-9a-f]{32})-([0-9a-f]{16})-([0-9a-f]{2})",
            existing_traceparent.strip(),
        )
        is_valid = (
            parsed is not None
            and parsed.group(1) != "ff"
            and parsed.group(2) != "0" * 32
            and parsed.group(3) != "0" * 16
        )
        if is_valid:
            version, existing_trace_id, _inbound_parent, trace_flags = parsed.groups()

            # Join the caller's trace, but present this entry as the parent.
            har_entry["_trace_id"] = existing_trace_id
            traceparent = f"{version}-{existing_trace_id}-{span_id}-{trace_flags}"
            headers["traceparent"] = traceparent
            logging.debug(f"Updated existing traceparent header: {traceparent}")
        else:
            inbound_rejected = True
            headers["traceparent"] = fresh_traceparent
            logging.debug(f"Replaced invalid traceparent header: {fresh_traceparent}")
    else:
        # Format: version-traceid-spanid-traceflags (version 00, sampled).
        headers["traceparent"] = fresh_traceparent
        logging.debug(f"Added traceparent header: {fresh_traceparent}")

    # --- tracestate --------------------------------------------------------
    vendor_name = "browserup"
    new_entry = f"{vendor_name}={span_id}"

    # A tracestate that accompanied an unparseable traceparent belongs to a
    # trace we just abandoned, so it is dropped rather than forwarded.
    existing_tracestate = "" if inbound_rejected else headers.get("tracestate", "")

    if existing_tracestate:
        members = (member.strip() for member in existing_tracestate.split(","))
        # Skip empty members (stray commas) and our own stale member.
        kept = [
            member
            for member in members
            if member and not member.startswith(f"{vendor_name}=")
        ]

        # Our member goes leftmost; keep at most 32 members in total.
        updated_tracestate = ",".join([new_entry, *kept][:32])
        headers["tracestate"] = updated_tracestate
        logging.debug(f"Updated tracestate header: {updated_tracestate}")
    else:
        headers["tracestate"] = new_entry
        logging.debug(f"Added tracestate header: {new_entry}")
159+
51160
def capture_request(self, flow):
52161
full_url = self.get_full_url(flow.request)
53162
if "BrowserUpData" in full_url or "detectportal.firefox.com" in full_url:
@@ -56,6 +165,7 @@ def capture_request(self, flow):
56165

57166
logging.info("Populating har entry for request: {}".format(full_url))
58167

168+
# First get the HAR entry - this creates it with the trace info
59169
har_entry = flow.get_har_entry()
60170

61171
har_entry["startedDateTime"] = datetime.fromtimestamp(
@@ -187,6 +297,32 @@ def capture_response(self, flow):
187297
content["size"] = response_body_size
188298
content["compression"] = response_body_compression
189299
content["mimeType"] = flow.response.headers.get("Content-Type", "")
300+
301+
# Check if this is a confirmed browser-based HTML response
302+
content_type = flow.response.headers.get("Content-Type", "").lower()
303+
is_html = "text/html" in content_type
304+
305+
# If this was marked as a potential browser root request and it's HTML,
306+
# confirm it as a browser root for parent/child relationships
307+
if har_entry.get("_potential_browser_root") and is_html:
308+
har_entry["_browser_root"] = True
309+
har_entry.pop("_potential_browser_root", None)
310+
logging.info(f"Confirmed browser root request: {flow.request.url}")
311+
312+
# Set parent to page span id since this is main HTML request
313+
page = self.get_or_create_current_page()
314+
har_entry["_parent_id"] = page["_span_id"]
315+
elif not har_entry.get("_browser_root"):
316+
# For non-root requests, check if there's a browser root in this page to set as parent
317+
page = self.get_or_create_current_page()
318+
page_entries = self.entries_for_page(page["id"])
319+
320+
for entry in page_entries:
321+
if entry.get("_browser_root") and entry["_span_id"] != har_entry["_span_id"]:
322+
# Set parent to browser root span instead of page span
323+
har_entry["_parent_id"] = entry["_span_id"]
324+
logging.debug(f"Set parent of {flow.request.url} to browser root")
325+
break
190326

191327
if HarCaptureTypes.RESPONSE_DYNAMIC_CONTENT in self.har_capture_types:
192328
mime_type = (

mitmproxy/addons/browserup/har/har_builder.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import uuid
12
from datetime import datetime
23
from datetime import timezone
34

@@ -12,11 +13,14 @@ def har():
1213

1314
@staticmethod
def log():
    """Build a fresh HAR ``log`` section seeded with one default page.

    A new 32-hex-character trace ID is generated per log; the log's own
    root span ID is the first 16 characters of that trace ID.
    """
    root_trace_id = uuid.uuid4().hex
    log_section = {
        "version": "1.1",
        "creator": {"name": "BrowserUp Proxy", "version": "0.1", "comment": ""},
        "entries": [],
        "pages": [HarBuilder.page(id=DEFAULT_PAGE_REF)],
    }
    log_section["_trace_id"] = root_trace_id
    # Root span reuses the leading half of the trace ID.
    log_section["_span_id"] = root_trace_id[:16]
    return log_section
2125

2226
@staticmethod
@@ -25,11 +29,14 @@ def page_timings():
2529

2630
@staticmethod
def page(id=DEFAULT_PAGE_REF, title=DEFAULT_PAGE_TITLE):
    """Build a HAR page record with its own tracing span.

    The page receives a freshly generated 16-hex-character ``_span_id``.
    ``_parent_id`` starts as None; the visible callers in the HAR manager
    assign it after the page is created.
    """
    page_span = uuid.uuid4().hex[:16]
    started_at = datetime.now(tz=timezone.utc).isoformat()
    page_record = {
        "title": title,
        "id": id,
        "startedDateTime": str(started_at),
        "pageTimings": HarBuilder.page_timings(),
    }
    page_record["_span_id"] = page_span
    page_record["_parent_id"] = None
    return page_record
3441

3542
@staticmethod
@@ -100,6 +107,7 @@ def entry_response_for_failure():
100107

101108
@staticmethod
102109
def entry(pageref=DEFAULT_PAGE_REF):
110+
span_id = uuid.uuid4().hex[:16] # 16 char span ID for the entry
103111
return {
104112
"pageref": pageref,
105113
"startedDateTime": str(datetime.now(tz=timezone.utc).isoformat()),
@@ -112,4 +120,6 @@ def entry(pageref=DEFAULT_PAGE_REF):
112120
"serverIPAddress": "",
113121
"connection": "",
114122
"comment": "",
123+
"_span_id": span_id, # Unique span ID for this entry
124+
"_parent_id": None, # Will be set to parent page's span ID
115125
}

mitmproxy/addons/browserup/har/har_manager.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,44 @@ def get_or_create_har(self, page_ref=None, page_title=None):
4242

4343
def new_har(self):
    """Replace ``self.har`` with a fresh HAR and link its first page to the root span."""
    self.har = HarBuilder.har()

    log_section = self.har["log"]
    pages = log_section["pages"]
    # The first page hangs off the log's root span when both exist.
    if pages and "_span_id" in log_section:
        pages[0]["_parent_id"] = log_section["_span_id"]
4548

4649
def create_har_entry(self, flow):
    """Create a HAR entry for ``flow``, append it to the current HAR and return it.

    The entry inherits the log's trace ID. Its parent span defaults to the
    current page's span. If the page already has an entry flagged
    ``_browser_root``, that entry's span becomes the parent instead, unless
    the flow's ``pretty_url`` equals the root entry's recorded URL.
    """
    har = self.get_or_create_har()
    current_page = self.get_or_create_current_page()

    new_entry = HarBuilder.entry(current_page["id"])
    new_entry["_parent_id"] = current_page["_span_id"]
    new_entry["_trace_id"] = har["log"]["_trace_id"]

    if hasattr(flow, "request") and hasattr(self, "entries_for_page"):
        # Only the first flagged entry is consulted.
        browser_root = next(
            (
                candidate
                for candidate in self.entries_for_page(current_page["id"])
                if candidate.get("_browser_root")
            ),
            None,
        )
        if browser_root is not None:
            targets_root_url = False
            if hasattr(flow.request, "url"):
                request_url = flow.request.pretty_url
                targets_root_url = browser_root["request"]["url"] == request_url

            if not targets_root_url:
                new_entry["_parent_id"] = browser_root["_span_id"]
                logging.debug("Setting parent to browser root during entry creation")

    har["log"]["entries"].append(new_entry)
    self.print_har_summary()
    return new_entry
@@ -69,6 +101,10 @@ def new_page(self, page_title=None, page_ref=None):
69101
next_page_number = len(har["log"]["pages"]) + 1
70102
next_id = "page_{}".format(next_page_number)
71103
new_page = HarBuilder.page(id=next_id)
104+
105+
# Link the page to the root trace
106+
new_page["_parent_id"] = har["log"]["_span_id"]
107+
72108
har["log"]["pages"].append(new_page)
73109

74110
# print a list of the pages with their title and a list of the entries, and their page ref
@@ -90,7 +126,10 @@ def get_or_create_current_page(self):
90126
if len(self.har["log"]["pages"]) > 0:
91127
return self.har["log"]["pages"][-1]
92128
else:
93-
har_page = HarBuilder.page
129+
har_page = HarBuilder.page()
130+
# Link to the root span
131+
if "_span_id" in self.har["log"]:
132+
har_page["_parent_id"] = self.har["log"]["_span_id"]
94133
self.har["log"]["pages"].append(har_page)
95134
return har_page
96135

@@ -99,7 +138,7 @@ def reset_har_and_return_old_har(self):
99138

100139
with mutex:
101140
old_har = self.end_har()
102-
self.har = HarBuilder.har()
141+
self.new_har()
103142

104143
return old_har
105144

mitmproxy/addons/browserup/har/har_schemas.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,18 @@ class PageTimingSchema(Schema):
9595
"description": "Top level href, including hashtag, etc per the browser",
9696
}
9797
)
98+
_span_id = fields.Str(
99+
metadata={
100+
"optional": True,
101+
"description": "W3C Trace Context span ID for this page",
102+
}
103+
)
104+
_parent_id = fields.Str(
105+
metadata={
106+
"optional": True,
107+
"description": "W3C Trace Context parent span ID (typically the HAR log span ID)",
108+
}
109+
)
98110

99111

100112
class MatchCriteriaSchema(Schema):

mitmproxy/addons/browserup/har_capture_addon.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
22

33
import mitmproxy.http
4+
from mitmproxy import ctx
45
from mitmproxy.addons.browserup.har import flow_har_entry_patch
56
from mitmproxy.addons.browserup.har.flow_capture import FlowCaptureMixin
67
from mitmproxy.addons.browserup.har.har_manager import HarManagerMixin
@@ -22,6 +23,7 @@ class HarCaptureAddOn(FlowCaptureMixin, HarManagerMixin):
2223
def load(self, loader):
    """Register this addon's options on the given loader.

    Two options are added: ``harcapture`` (string, default empty) and
    ``trace_enabled`` (boolean, default True).
    """
    logging.info("Loading HarCaptureAddon")

    option_specs = (
        ("harcapture", str, "", "HAR capture path."),
        ("trace_enabled", bool, True, "Enable W3C distributed tracing headers"),
    )
    for option_name, option_type, default_value, help_text in option_specs:
        loader.add_option(option_name, option_type, default_value, help_text)
2527

2628
# Resources are used to define items available over the API.
2729
def get_resources(self):
@@ -46,7 +48,13 @@ def websocket_message(self, flow: mitmproxy.http.HTTPFlow):
4648
def request(self, flow: mitmproxy.http.HTTPFlow):
    """Handle a request: capture it into the HAR, then add trace headers.

    Flows whose metadata contains ``blocklisted`` are ignored. Trace headers
    are added only when the ``trace_enabled`` option exists and is truthy.
    """
    if "blocklisted" not in flow.metadata:
        # Capture first: that is what creates the HAR entry holding the trace IDs.
        self.capture_request(flow)

        if getattr(ctx.options, "trace_enabled", False):
            self.decorate_request_with_trace_headers(flow)
5058

5159
def response(self, flow: mitmproxy.http.HTTPFlow):
5260
if "blocklisted" in flow.metadata:

0 commit comments

Comments
 (0)