
Commit 93b2169

Week 1 Day 1: Foundation + FWC API discovery
1 parent 7e224af commit 93b2169

3 files changed

Lines changed: 281 additions & 35 deletions


find_selector.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
from pathlib import Path
from bs4 import BeautifulSoup

# Load the saved HTML
fixture = Path("tests/fixtures/fwc_search_qantas.html")
if not fixture.exists():
    print("❌ Fixture not found!")
    print("Save the FWC search page first:")
    print("1. Go to: https://www.fwc.gov.au/document-search?q=qantas")
    print("2. Right-click → Save As → HTML only")
    print(f"3. Save to: {fixture.absolute()}")
    exit(1)

html = fixture.read_text(encoding='utf-8')
soup = BeautifulSoup(html, 'html.parser')

print("Looking for search result elements...\n")

# Find all links (case results should be links)
all_links = soup.find_all('a', href=True)
print(f"Total <a> tags found: {len(all_links)}\n")

# Look for links that might be case results
# They usually contain case titles and go to decision pages
case_links = []
for link in all_links:
    href = link.get('href', '')
    text = link.get_text(strip=True)

    # Case results usually:
    # - Have text content
    # - Link to /document-search/view/ or similar
    # - Contain company names or case info
    if text and len(text) > 10:
        if 'qantas' in text.lower() or 'document-search' in href:
            case_links.append(link)

print(f"Potential case result links: {len(case_links)}\n")

# Show the first few
for i, link in enumerate(case_links[:3], 1):
    print(f"Result {i}:")
    print(f"  Text: {link.get_text(strip=True)[:80]}...")
    print(f"  Href: {link.get('href', '')[:80]}...")
    print(f"  Classes: {link.get('class', [])}")
    print("  HTML preview:")
    print(f"    {str(link)[:200]}...")
    print()

# Show what element contains the case info
if case_links:
    first_result = case_links[0]
    parent = first_result.find_parent()
    print("\nParent element:")
    print(f"  Tag: {parent.name}")
    print(f"  Classes: {parent.get('class', [])}")
test_api.py

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
import asyncio
import aiohttp

async def test_fwc_api():
    url = "https://www.fwc.gov.au/document-search/searchview"

    # Proper headers (mimic browser)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'en-US,en;q=0.9',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Origin': 'https://www.fwc.gov.au',
        'Referer': 'https://www.fwc.gov.au/document-search?q=qantas',
    }

    # Form data (URL encoded)
    data = {
        'q': 'qantas',
        'pageSize': '10',
        'currentPage': '1',
        'extraSearchOptions[0][key]': 'SearchType',
        'extraSearchOptions[0][value]': '1',
        'extraSearchOptions[1][key]': 'SortOrder',
        'extraSearchOptions[1][value]': 'decision-relevance',
    }

    print("Making request to FWC API...")
    print(f"URL: {url}")
    print(f"Data: {data}")
    print()

    async with aiohttp.ClientSession() as session:
        async with session.post(url, data=data, headers=headers) as response:
            print(f"Status: {response.status}")
            print(f"Content-Type: {response.headers.get('Content-Type')}")

            if response.status == 200:
                text = await response.text()
                print(f"\nResponse length: {len(text)} chars")
                print("\nFirst 1000 chars:")
                print(text[:1000])

                # Try to parse as JSON (the body is cached, so this works after .text())
                try:
                    json_data = await response.json(content_type=None)
                    print("\n✓ Response is JSON!")
                    print(f"Keys: {list(json_data.keys())}")
                    if 'results' in json_data:
                        print(f"Number of results: {len(json_data['results'])}")
                except Exception:
                    print("\n⚠️ Response is not JSON")
            else:
                error_text = await response.text()
                print("\n✗ Error response:")
                print(error_text[:500])

asyncio.run(test_fwc_api())
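For quick ad-hoc checks of the same endpoint without the async plumbing, a synchronous equivalent can be handy. This is a sketch, assuming the requests package is installed and the endpoint accepts the same form fields as above:

# Hypothetical synchronous probe of the same searchview endpoint using requests.
import requests

resp = requests.post(
    "https://www.fwc.gov.au/document-search/searchview",
    data={
        "q": "qantas",
        "pageSize": "10",
        "currentPage": "1",
        "extraSearchOptions[0][key]": "SearchType",
        "extraSearchOptions[0][value]": "1",
        "extraSearchOptions[1][key]": "SortOrder",
        "extraSearchOptions[1][value]": "decision-relevance",
    },
    headers={
        "User-Agent": "Mozilla/5.0",
        "Accept": "application/json, text/plain, */*",
    },
    timeout=30,
)
print(resp.status_code, resp.headers.get("Content-Type"))
print(resp.text[:500])
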

tools/fwc.py

Lines changed: 167 additions & 35 deletions
@@ -99,29 +99,40 @@ async def search_decisions(
 
         query = " ".join(search_terms)
 
-        # TODO: Week 1, Day 1-2
-        # 1. Build search URL with params
-        # 2. Fetch search results page
-        # 3. Parse HTML to extract case listings
-        # 4. Return list of basic case info
-
-        # PLACEHOLDER - Replace with real implementation
-        results = []
+        # Build the document search URL (based on your findings!)
+        base_url = "https://www.fwc.gov.au/document-search"
+        search_url = f"{base_url}?q={quote_plus(query)}&options=SearchType_1%2CSortOrder_decision-relevance"
+
+        print(f"  URL: {search_url}")
+
+        try:
+            # Create session if needed
+            if not self.session:
+                self.session = aiohttp.ClientSession()
+
+            # Fetch the search results page
+            async with self.session.get(search_url) as response:
+                if response.status != 200:
+                    print(f"  ✗ HTTP {response.status}")
+                    return []
+
+                html = await response.text()
+
+                # Parse the results using our parser
+                results = self._parse_search_results(html)
+
+                print(f"  ✓ Found {len(results)} cases")
+
+                return results[:max_results]
+
+        except Exception as e:
+            print(f"  ✗ Error: {e}")
+            import traceback
+            print("  Full traceback:")
+            traceback.print_exc()
+            return []
 
-        # Simulate finding cases (remove this when implementing)
-        print(f"  ⚠️ FWC search not yet implemented - returning placeholder")
-        results = [
-            {
-                "case_id": "U2024/EXAMPLE",
-                "date": "2024-01-15",
-                "title": f"Example Case - {company_name}",
-                "url": f"{self.BASE_URL}/decision/example",
-                "confidence": 0.8,
-                "source": "FWC"
-            }
-        ]
-
-        return results[:max_results]
+
 
     async def fetch_decision_details(self, decision_url: str) -> Dict:
         """
@@ -169,24 +180,145 @@ def _parse_search_results(self, html: str) -> List[Dict]:
         Returns:
             List of case dictionaries
         """
-        # TODO: Week 1, Day 2
-        # Use BeautifulSoup to parse search results
-        # Extract: case number, date, title, link
-
         soup = BeautifulSoup(html, 'html.parser')
         results = []
-
-        # Find result elements (inspect FWC website to get correct selectors)
-        # Example structure (this will vary based on actual FWC HTML):
-        # for result in soup.select('.search-result'):
-        #     case_id = result.select_one('.case-number').text
-        #     title = result.select_one('.title').text
-        #     date = result.select_one('.date').text
-        #     link = result.select_one('a')['href']
-        #     results.append({...})
+
+        # Find all result items
+        # Based on your findings: <a class="flex-grow" href="..."><h3>Title</h3></a>
+        result_links = soup.select('a.flex-grow')
+
+        print(f"  Found {len(result_links)} result links in HTML")
+
+        for link in result_links:
+            try:
+                print("  DEBUG: Processing link...")
+                print(f"  DEBUG: Link HTML = {link}")
+
+                # Extract title from <h3>
+                title_elem = link.select_one('h3')
+                print(f"  DEBUG: title_elem = {title_elem}")
+                if not title_elem:
+                    print("  DEBUG: No <h3> found, skipping")
+                    continue
+
+                title = title_elem.text.strip()
+                print(f"  DEBUG: title = {title}")
+
+                # Extract URL
+                href = link.get('href', '')
+                print(f"  DEBUG: href = {href}")
+                if not href:
+                    continue
+
+                # Make absolute URL if relative
+                if href.startswith('/'):
+                    url = f"https://www.fwc.gov.au{href}"
+                else:
+                    url = href
+
+                # Extract case ID from title
+                case_id = self._extract_case_id_from_text(title)
+                if not case_id:
+                    case_id = "UNKNOWN"
+
+                # Try to find date - it's usually in a nearby sibling or parent
+                # We'll look for date patterns in the surrounding HTML
+                date = self._extract_date_near_element(link)
+
+                # Build result
+                result = {
+                    'case_id': case_id,
+                    'title': title,
+                    'url': url,
+                    'date': date,
+                    'source': 'FWC',
+                    'confidence': 0.85  # High confidence for direct matches
+                }
+
+                results.append(result)
+
+            except Exception as e:
+                # Don't let one bad result break everything
+                print(f"  ⚠️ Error parsing result: {e}")
+                import traceback
+                traceback.print_exc()  # Show full error details
+                continue
 
         return results
 
+
+    def _extract_case_id_from_text(self, text: str) -> Optional[str]:
+        """
+        Extract case ID from text.
+
+        Handles patterns like:
+        - U2024/12345 (unfair dismissal)
+        - AG2020/503 (agreement)
+        - C2015/7667 (general protections)
+        - AE423670 (order reference)
+        """
+        # Pattern 1: Letter(s) + Year + Slash + Number
+        # Examples: U2024/12345, AG2020/503, C2015/7667
+        match = re.search(r'\b([A-Z]+\d{4}/\d+)\b', text)
+        if match:
+            return match.group(1)
+
+        # Pattern 2: Two Letters + 6 Digits
+        # Examples: AE423670, PR717347
+        match = re.search(r'\b([A-Z]{2}\d{6})\b', text)
+        if match:
+            return match.group(1)
+
+        # Pattern 3: Just look for any plausible case number
+        match = re.search(r'\b([A-Z]{1,4}\d{4,7})\b', text)
+        if match:
+            return match.group(1)
+
+        return None
+
+    def _extract_date_near_element(self, element) -> Optional[str]:
+        """
+        Try to find a date near the given element.
+        Returns ISO format (YYYY-MM-DD) if found.
+        """
+        try:
+            # Get parent container
+            parent = element.find_parent()
+            if not parent:
+                return None
+
+            # Look for date patterns in text
+            text = parent.get_text()
+            if not text:  # Safety check
+                return None
+
+            # Pattern: DD Month YYYY (e.g., "18 March 2020")
+            month_names = '|'.join([
+                'January', 'February', 'March', 'April', 'May', 'June',
+                'July', 'August', 'September', 'October', 'November', 'December'
+            ])
+            pattern = rf'(\d{{1,2}}) ({month_names}) (\d{{4}})'
+
+            match = re.search(pattern, text)
+            if match:
+                day, month_name, year = match.groups()
+                try:
+                    date_obj = datetime.strptime(f"{day} {month_name} {year}", "%d %B %Y")
+                    return date_obj.strftime("%Y-%m-%d")
+                except ValueError:
+                    pass
+
+            # Pattern: YYYY-MM-DD (already in ISO format)
+            match = re.search(r'(\d{4}-\d{2}-\d{2})', text)
+            if match:
+                return match.group(1)
+
+            return None
+
+        except Exception:
+            # Date parsing failed, that's okay
+            return None
+
     def _parse_decision(self, html: str, decision_url: str) -> Dict:
         """
         Parse a full FWC decision document.
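A quick sanity check of the case-ID patterns added in _extract_case_id_from_text, run standalone against the examples listed in its docstring (a sketch reusing the same regexes, not the class itself):

# Standalone check of the three case-ID regexes against the docstring examples.
import re

samples = ["U2024/12345", "AG2020/503", "C2015/7667", "AE423670", "PR717347"]
patterns = [
    r'\b([A-Z]+\d{4}/\d+)\b',    # Pattern 1: letters + year + slash + number
    r'\b([A-Z]{2}\d{6})\b',      # Pattern 2: two letters + six digits
    r'\b([A-Z]{1,4}\d{4,7})\b',  # Pattern 3: fallback for any plausible case number
]

for s in samples:
    for p in patterns:
        m = re.search(p, s)
        if m:
            print(f"{s} -> {m.group(1)}")
            break

The date helper works the same way for its first pattern: datetime.strptime("18 March 2020", "%d %B %Y") parses the long form, and strftime("%Y-%m-%d") turns it into "2020-03-18".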
