4848
4949
5050class FlowCaptureMixin (object ):
def is_browser_user_agent(self, user_agent):
    """Report whether a User-Agent string contains a known browser token.

    The check is a case-insensitive substring search over a fixed list of
    browser/engine names. A missing or empty value is never a browser.

    Args:
        user_agent: Value of the request's User-Agent header, or None.

    Returns:
        True if any known token occurs in ``user_agent``, otherwise False.
    """
    import re

    if user_agent:
        browser_tokens = r"Mozilla|Chrome|Firefox|Safari|Edge|Gecko|WebKit|Ladybird|Lightpanda"
        return re.search(browser_tokens, user_agent, re.IGNORECASE) is not None
    return False
59+
def is_first_request_in_page(self, flow):
    """Tell whether ``flow`` is the first request recorded for the current page.

    The request counts as first when the current page has no entries yet, or
    when its only entry has the same URL as this flow's request.

    Args:
        flow: The flow whose ``request`` is compared against the page entries.

    Returns:
        True if this looks like the page's first request, otherwise False.
    """
    current_page = self.get_or_create_current_page()
    recorded = self.entries_for_page(current_page["id"])

    # Nothing recorded yet: this request is necessarily the first one.
    if len(recorded) == 0:
        return True

    # More than one entry means an earlier request already exists.
    if len(recorded) != 1:
        return False

    # Exactly one entry: it is "first" only if that entry is this request.
    only_entry_url = recorded[0]["request"]["url"]
    return only_entry_url == self.get_full_url(flow.request)
72+
def is_html_content_type(self, flow):
    """Tell whether the flow's response declares an HTML content type.

    Args:
        flow: The flow to inspect; its ``response`` may be absent.

    Returns:
        True if the response's Content-Type header contains ``text/html``
        (case-insensitive); False when there is no response or no match.
    """
    response = flow.response
    if not response:
        # No response yet, so there is no content type to examine.
        return False

    declared_type = response.headers.get("Content-Type", "")
    return "text/html" in declared_type.lower()
77+
def decorate_request_with_trace_headers(self, flow):
    """Add or update W3C trace context headers for distributed tracing.

    Reads ``_trace_id`` and ``_span_id`` from the flow's HAR entry and writes
    ``traceparent`` and ``tracestate`` onto the outgoing request headers.

    * If either ID is missing, the request is left untouched.
    * If this is the first request of the page and the User-Agent looks like
      a browser, the HAR entry is flagged ``_potential_browser_root``.
    * A well-formed incoming ``traceparent`` keeps its version, trace ID and
      flags; only the parent (span) ID is replaced by the HAR entry's span ID,
      and the HAR entry's ``_trace_id`` is updated to the incoming trace ID.
    * A missing or malformed ``traceparent`` is replaced by a fresh one built
      from the HAR entry's IDs with version ``00`` and flags ``01`` (sampled).
    * ``tracestate`` gets a ``browserup=<span_id>`` member on the left; any
      previous ``browserup`` member and empty members are dropped, and the
      list is capped at 32 members.

    Args:
        flow: The flow whose request headers are decorated in place.

    Returns:
        None.
    """
    # Function-scope import, as is_browser_user_agent does in this file.
    import re

    har_entry = flow.get_har_entry()

    trace_id = har_entry.get("_trace_id")
    span_id = har_entry.get("_span_id")

    # Without both IDs there is nothing meaningful to propagate.
    if not trace_id or not span_id:
        logging.debug("No trace info available for request")
        return

    # Flag a possible browser root; it is only marked as "potential" here.
    if self.is_first_request_in_page(flow):
        user_agent = flow.request.headers.get("User-Agent", "")
        if self.is_browser_user_agent(user_agent):
            har_entry["_potential_browser_root"] = True
            logging.debug(f"Potential browser root request detected: {flow.request.url}")

    # Defaults for a freshly started trace: version 00, flags 01 (sampled).
    version, trace_flags = "00", "01"

    existing_traceparent = flow.request.headers.get("traceparent")
    if existing_traceparent:
        # Format: version-traceid-spanid-traceflags, all lowercase hex with
        # fixed widths. Splitting on "-" alone would accept garbage such as
        # "a-b-c-d", so validate the fields strictly.
        parsed = re.fullmatch(
            r"([0-9a-f]{2})-([0-9a-f]{32})-([0-9a-f]{16})-([0-9a-f]{2})",
            existing_traceparent.strip(),
        )
        is_valid = (
            parsed is not None
            and parsed.group(1) != "ff"          # version ff is forbidden
            and set(parsed.group(2)) != {"0"}    # all-zero trace ID is invalid
            and set(parsed.group(3)) != {"0"}    # all-zero parent ID is invalid
        )
        if is_valid:
            version, trace_id, _, trace_flags = parsed.groups()
            # Keep the HAR entry on the same trace as the caller.
            har_entry["_trace_id"] = trace_id
            action = "Updated existing"
        else:
            # Malformed header: ignore it and start from the HAR entry's IDs.
            action = "Replaced invalid"
    else:
        action = "Added"

    traceparent = f"{version}-{trace_id}-{span_id}-{trace_flags}"
    flow.request.headers["traceparent"] = traceparent
    logging.debug(f"{action} traceparent header: {traceparent}")

    # tracestate: our member goes to the left of any existing members.
    vendor_name = "browserup"
    new_entry = f"{vendor_name}={span_id}"
    existing_tracestate = flow.request.headers.get("tracestate", "")

    # Drop empty members and any stale member of ours before re-adding it.
    vendor_prefix = f"{vendor_name}="
    kept_entries = [
        member
        for member in (raw.strip() for raw in existing_tracestate.split(","))
        if member and not member.startswith(vendor_prefix)
    ]

    # Cap at 32 members; ours is first, so the rightmost members are cut.
    updated_tracestate = ",".join([new_entry, *kept_entries][:32])
    flow.request.headers["tracestate"] = updated_tracestate

    if existing_tracestate:
        logging.debug(f"Updated tracestate header: {updated_tracestate}")
    else:
        logging.debug(f"Added tracestate header: {updated_tracestate}")
159+
51160 def capture_request (self , flow ):
52161 full_url = self .get_full_url (flow .request )
53162 if "BrowserUpData" in full_url or "detectportal.firefox.com" in full_url :
@@ -56,6 +165,7 @@ def capture_request(self, flow):
56165
57166 logging .info ("Populating har entry for request: {}" .format (full_url ))
58167
168+ # First get the HAR entry - this creates it with the trace info
59169 har_entry = flow .get_har_entry ()
60170
61171 har_entry ["startedDateTime" ] = datetime .fromtimestamp (
@@ -187,6 +297,32 @@ def capture_response(self, flow):
187297 content ["size" ] = response_body_size
188298 content ["compression" ] = response_body_compression
189299 content ["mimeType" ] = flow .response .headers .get ("Content-Type" , "" )
300+
301+ # Check if this is a confirmed browser-based HTML response
302+ content_type = flow .response .headers .get ("Content-Type" , "" ).lower ()
303+ is_html = "text/html" in content_type
304+
305+ # If this was marked as a potential browser root request and it's HTML,
306+ # confirm it as a browser root for parent/child relationships
307+ if har_entry .get ("_potential_browser_root" ) and is_html :
308+ har_entry ["_browser_root" ] = True
309+ har_entry .pop ("_potential_browser_root" , None )
310+ logging .info (f"Confirmed browser root request: { flow .request .url } " )
311+
312+ # Set parent to page span id since this is main HTML request
313+ page = self .get_or_create_current_page ()
314+ har_entry ["_parent_id" ] = page ["_span_id" ]
315+ elif not har_entry .get ("_browser_root" ):
316+ # For non-root requests, check if there's a browser root in this page to set as parent
317+ page = self .get_or_create_current_page ()
318+ page_entries = self .entries_for_page (page ["id" ])
319+
320+ for entry in page_entries :
321+ if entry .get ("_browser_root" ) and entry ["_span_id" ] != har_entry ["_span_id" ]:
322+ # Set parent to browser root span instead of page span
323+ har_entry ["_parent_id" ] = entry ["_span_id" ]
324+ logging .debug (f"Set parent of { flow .request .url } to browser root" )
325+ break
190326
191327 if HarCaptureTypes .RESPONSE_DYNAMIC_CONTENT in self .har_capture_types :
192328 mime_type = (
0 commit comments