@@ -99,29 +99,40 @@ async def search_decisions(
9999
100100 query = " " .join (search_terms )
101101
102- # TODO: Week 1, Day 1-2
103- # 1. Build search URL with params
104- # 2. Fetch search results page
105- # 3. Parse HTML to extract case listings
106- # 4. Return list of basic case info
107-
108- # PLACEHOLDER - Replace with real implementation
109- results = []
102+ # Build the document search URL (based on your findings!)
103+ base_url = "https://www.fwc.gov.au/document-search"
104+ search_url = f"{ base_url } ?q={ quote_plus (query )} &options=SearchType_1%2CSortOrder_decision-relevance"
105+
106+ print (f" URL: { search_url } " )
107+
108+ try :
109+ # Create session if needed
110+ if not self .session :
111+ self .session = aiohttp .ClientSession ()
112+
113+ # Fetch the search results page
114+ async with self .session .get (search_url ) as response :
115+ if response .status != 200 :
116+ print (f" ✗ HTTP { response .status } " )
117+ return []
118+
119+ html = await response .text ()
120+
121+ # Parse the results using our parser
122+ results = self ._parse_search_results (html )
123+
124+ print (f" ✓ Found { len (results )} cases" )
125+
126+ return results [:max_results ]
127+
128+ except Exception as e :
129+ print (f" ✗ Error: { e } " )
130+ import traceback
131+ print (" Full traceback:" )
132+ traceback .print_exc ()
133+ return []
110134
111- # Simulate finding cases (remove this when implementing)
112- print (f" ⚠️ FWC search not yet implemented - returning placeholder" )
113- results = [
114- {
115- "case_id" : "U2024/EXAMPLE" ,
116- "date" : "2024-01-15" ,
117- "title" : f"Example Case - { company_name } " ,
118- "url" : f"{ self .BASE_URL } /decision/example" ,
119- "confidence" : 0.8 ,
120- "source" : "FWC"
121- }
122- ]
123-
124- return results [:max_results ]
135+
125136
126137 async def fetch_decision_details (self , decision_url : str ) -> Dict :
127138 """
@@ -169,24 +180,145 @@ def _parse_search_results(self, html: str) -> List[Dict]:
169180 Returns:
170181 List of case dictionaries
171182 """
172- # TODO: Week 1, Day 2
173- # Use BeautifulSoup to parse search results
174- # Extract: case number, date, title, link
175-
176183 soup = BeautifulSoup (html , 'html.parser' )
177184 results = []
178-
179- # Find result elements (inspect FWC website to get correct selectors)
180- # Example structure (this will vary based on actual FWC HTML):
181- # for result in soup.select('.search-result'):
182- # case_id = result.select_one('.case-number').text
183- # title = result.select_one('.title').text
184- # date = result.select_one('.date').text
185- # link = result.select_one('a')['href']
186- # results.append({...})
185+
186+ # Find all result items
187+ # Based on your findings: <a class="flex-grow" href="..."><h3>Title</h3></a>
188+ result_links = soup .select ('a.flex-grow' )
189+
190+ print (f" Found { len (result_links )} result links in HTML" )
191+
192+ for link in result_links :
193+ try :
194+ print (f" DEBUG: Processing link..." )
195+ print (f" DEBUG: Link HTML = { link } " )
196+
197+ # Extract title from <h3>
198+ title_elem = link .select_one ('h3' )
199+ print (f" DEBUG: title_elem = { title_elem } " )
200+ if not title_elem :
201+ print (f" DEBUG: No <h3> found, skipping" )
202+ continue
203+
204+ title = title_elem .text .strip ()
205+ print (f" DEBUG: title = { title } " )
206+
207+ # Extract URL
208+ href = link .get ('href' , '' )
209+ print (f" DEBUG: href = { href } " )
210+ if not href :
211+ continue
212+
213+ # Make absolute URL if relative
214+ if href .startswith ('/' ):
215+ url = f"https://www.fwc.gov.au{ href } "
216+ else :
217+ url = href
218+
219+ # Extract case ID from title
220+ case_id = self ._extract_case_id_from_text (title )
221+ if not case_id :
222+ case_id = "UNKNOWN"
223+
224+ # Try to find date - it's usually in a nearby sibling or parent
225+ # We'll look for date patterns in the surrounding HTML
226+ date = self ._extract_date_near_element (link )
227+
228+ # Build result
229+ result = {
230+ 'case_id' : case_id ,
231+ 'title' : title ,
232+ 'url' : url ,
233+ 'date' : date ,
234+ 'source' : 'FWC' ,
235+ 'confidence' : 0.85 # High confidence for direct matches
236+ }
237+
238+ results .append (result )
239+
240+ except Exception as e :
241+ # Don't let one bad result break everything
242+ print (f" ⚠️ Error parsing result: { e } " )
243+ import traceback
244+ traceback .print_exc () # Show full error details
245+ continue
187246
188247 return results
189248
249+
250+ def _extract_case_id_from_text (self , text : str ) -> Optional [str ]:
251+ """
252+ Extract case ID from text.
253+
254+ Handles patterns like:
255+ - U2024/12345 (unfair dismissal)
256+ - AG2020/503 (agreement)
257+ - C2015/7667 (general protections)
258+ - AE423670 (order reference)
259+ """
260+ # Pattern 1: Letter(s) + Year + Slash + Number
261+ # Examples: U2024/12345, AG2020/503, C2015/7667
262+ match = re .search (r'\b([A-Z]+\d{4}/\d+)\b' , text )
263+ if match :
264+ return match .group (1 )
265+
266+ # Pattern 2: Two Letters + 6 Digits
267+ # Examples: AE423670, PR717347
268+ match = re .search (r'\b([A-Z]{2}\d{6})\b' , text )
269+ if match :
270+ return match .group (1 )
271+
272+ # Pattern 3: Just look for any plausible case number
273+ match = re .search (r'\b([A-Z]{1,4}\d{4,7})\b' , text )
274+ if match :
275+ return match .group (1 )
276+
277+ return None
278+
279+ def _extract_date_near_element (self , element ) -> Optional [str ]:
280+ """
281+ Try to find a date near the given element.
282+ Returns ISO format (YYYY-MM-DD) if found.
283+ """
284+ try :
285+ # Get parent container
286+ parent = element .find_parent ()
287+ if not parent :
288+ return None
289+
290+ # Look for date patterns in text
291+ text = parent .get_text ()
292+ if not text : # Safety check
293+ return None
294+
295+ # Pattern: DD Month YYYY (e.g., "18 March 2020")
296+ month_names = '|' .join ([
297+ 'January' , 'February' , 'March' , 'April' , 'May' , 'June' ,
298+ 'July' , 'August' , 'September' , 'October' , 'November' , 'December'
299+ ])
300+ pattern = rf'(\d{{1,2}}) ({ month_names } ) (\d{{4}})'
301+
302+ match = re .search (pattern , text )
303+ if match :
304+ day , month_name , year = match .groups ()
305+ try :
306+ date_obj = datetime .strptime (f"{ day } { month_name } { year } " , "%d %B %Y" )
307+ return date_obj .strftime ("%Y-%m-%d" )
308+ except :
309+ pass
310+
311+ # Pattern: YYYY-MM-DD (already in ISO format)
312+ match = re .search (r'(\d{4}-\d{2}-\d{2})' , text )
313+ if match :
314+ return match .group (1 )
315+
316+ return None
317+
318+ except Exception as e :
319+ # Date parsing failed, that's okay
320+ return None
321+
190322 def _parse_decision (self , html : str , decision_url : str ) -> Dict :
191323 """
192324 Parse a full FWC decision document.
0 commit comments