
Commit 93b2169

Week 1 Day 1: Foundation + FWC API discovery
1 parent 7e224af commit 93b2169

3 files changed

Lines changed: 281 additions & 35 deletions


find_selector.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
from pathlib import Path
from bs4 import BeautifulSoup

# Load the saved HTML
fixture = Path("tests/fixtures/fwc_search_qantas.html")
if not fixture.exists():
    print("❌ Fixture not found!")
    print("Save the FWC search page first:")
    print("1. Go to: https://www.fwc.gov.au/document-search?q=qantas")
    print("2. Right-click → Save As → HTML only")
    print(f"3. Save to: {fixture.absolute()}")
    exit(1)

html = fixture.read_text(encoding='utf-8')
soup = BeautifulSoup(html, 'html.parser')

print("Looking for search result elements...\n")

# Find all links (case results should be links)
all_links = soup.find_all('a', href=True)
print(f"Total <a> tags found: {len(all_links)}\n")

# Look for links that might be case results
# They usually contain case titles and go to decision pages
case_links = []
for link in all_links:
    href = link.get('href', '')
    text = link.get_text(strip=True)

    # Case results usually:
    # - Have text content
    # - Link to /document-search/view/ or similar
    # - Contain company names or case info
    if text and len(text) > 10:
        if 'qantas' in text.lower() or 'document-search' in href:
            case_links.append(link)

print(f"Potential case result links: {len(case_links)}\n")

# Show the first few
for i, link in enumerate(case_links[:3], 1):
    print(f"Result {i}:")
    print(f"  Text: {link.get_text(strip=True)[:80]}...")
    print(f"  Href: {link.get('href', '')[:80]}...")
    print(f"  Classes: {link.get('class', [])}")
    print("  HTML preview:")
    print(f"    {str(link)[:200]}...")
    print()

# Show what element contains the case info
if case_links:
    first_result = case_links[0]
    parent = first_result.find_parent()
    print("\nParent element:")
    print(f"  Tag: {parent.name}")
    print(f"  Classes: {parent.get('class', [])}")
test_api.py

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
import asyncio
import aiohttp

async def test_fwc_api():
    url = "https://www.fwc.gov.au/document-search/searchview"

    # Proper headers (mimic browser)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'en-US,en;q=0.9',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Origin': 'https://www.fwc.gov.au',
        'Referer': 'https://www.fwc.gov.au/document-search?q=qantas',
    }

    # Form data (URL encoded)
    data = {
        'q': 'qantas',
        'pageSize': '10',
        'currentPage': '1',
        'extraSearchOptions[0][key]': 'SearchType',
        'extraSearchOptions[0][value]': '1',
        'extraSearchOptions[1][key]': 'SortOrder',
        'extraSearchOptions[1][value]': 'decision-relevance',
    }

    print("Making request to FWC API...")
    print(f"URL: {url}")
    print(f"Data: {data}")
    print()

    async with aiohttp.ClientSession() as session:
        async with session.post(url, data=data, headers=headers) as response:
            print(f"Status: {response.status}")
            print(f"Content-Type: {response.headers.get('Content-Type')}")

            if response.status == 200:
                text = await response.text()
                print(f"\nResponse length: {len(text)} chars")
                print("\nFirst 1000 chars:")
                print(text[:1000])

                # Try to parse as JSON (the body is cached, so this works after .text())
                try:
                    json_data = await response.json(content_type=None)
                    print("\n✓ Response is JSON!")
                    print(f"Keys: {list(json_data.keys())}")
                    if 'results' in json_data:
                        print(f"Number of results: {len(json_data['results'])}")
                except Exception:
                    print("\n⚠️ Response is not JSON")
            else:
                error_text = await response.text()
                print("\n✗ Error response:")
                print(error_text[:500])

asyncio.run(test_fwc_api())
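For quick ad-hoc checks of the same endpoint without the async plumbing, a synchronous equivalent can be handy. This is a sketch, assuming the requests package is installed and the endpoint accepts the same form fields as above:

# Hypothetical synchronous probe of the same searchview endpoint using requests.
import requests

resp = requests.post(
    "https://www.fwc.gov.au/document-search/searchview",
    data={
        "q": "qantas",
        "pageSize": "10",
        "currentPage": "1",
        "extraSearchOptions[0][key]": "SearchType",
        "extraSearchOptions[0][value]": "1",
        "extraSearchOptions[1][key]": "SortOrder",
        "extraSearchOptions[1][value]": "decision-relevance",
    },
    headers={
        "User-Agent": "Mozilla/5.0",
        "Accept": "application/json, text/plain, */*",
    },
    timeout=30,
)
print(resp.status_code, resp.headers.get("Content-Type"))
print(resp.text[:500])
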

tools/fwc.py

Lines changed: 167 additions & 35 deletions
@@ -99,29 +99,40 @@ async def search_decisions(
 
         query = " ".join(search_terms)
 
-        # TODO: Week 1, Day 1-2
-        # 1. Build search URL with params
-        # 2. Fetch search results page
-        # 3. Parse HTML to extract case listings
-        # 4. Return list of basic case info
-
-        # PLACEHOLDER - Replace with real implementation
-        results = []
+        # Build the document search URL (based on your findings!)
+        base_url = "https://www.fwc.gov.au/document-search"
+        search_url = f"{base_url}?q={quote_plus(query)}&options=SearchType_1%2CSortOrder_decision-relevance"
+
+        print(f"  URL: {search_url}")
+
+        try:
+            # Create session if needed
+            if not self.session:
+                self.session = aiohttp.ClientSession()
+
+            # Fetch the search results page
+            async with self.session.get(search_url) as response:
+                if response.status != 200:
+                    print(f"  ✗ HTTP {response.status}")
+                    return []
+
+                html = await response.text()
+
+                # Parse the results using our parser
+                results = self._parse_search_results(html)
+
+                print(f"  ✓ Found {len(results)} cases")
+
+                return results[:max_results]
+
+        except Exception as e:
+            print(f"  ✗ Error: {e}")
+            import traceback
+            print("  Full traceback:")
+            traceback.print_exc()
+            return []
 
-        # Simulate finding cases (remove this when implementing)
-        print(f"  ⚠️ FWC search not yet implemented - returning placeholder")
-        results = [
-            {
-                "case_id": "U2024/EXAMPLE",
-                "date": "2024-01-15",
-                "title": f"Example Case - {company_name}",
-                "url": f"{self.BASE_URL}/decision/example",
-                "confidence": 0.8,
-                "source": "FWC"
-            }
-        ]
-
-        return results[:max_results]
+
 
     async def fetch_decision_details(self, decision_url: str) -> Dict:
         """
@@ -169,24 +180,145 @@ def _parse_search_results(self, html: str) -> List[Dict]:
         Returns:
             List of case dictionaries
         """
-        # TODO: Week 1, Day 2
-        # Use BeautifulSoup to parse search results
-        # Extract: case number, date, title, link
-
         soup = BeautifulSoup(html, 'html.parser')
         results = []
-
-        # Find result elements (inspect FWC website to get correct selectors)
-        # Example structure (this will vary based on actual FWC HTML):
-        # for result in soup.select('.search-result'):
-        #     case_id = result.select_one('.case-number').text
-        #     title = result.select_one('.title').text
-        #     date = result.select_one('.date').text
-        #     link = result.select_one('a')['href']
-        #     results.append({...})
+
+        # Find all result items
+        # Based on your findings: <a class="flex-grow" href="..."><h3>Title</h3></a>
+        result_links = soup.select('a.flex-grow')
+
+        print(f"  Found {len(result_links)} result links in HTML")
+
+        for link in result_links:
+            try:
+                print("  DEBUG: Processing link...")
+                print(f"  DEBUG: Link HTML = {link}")
+
+                # Extract title from <h3>
+                title_elem = link.select_one('h3')
+                print(f"  DEBUG: title_elem = {title_elem}")
+                if not title_elem:
+                    print("  DEBUG: No <h3> found, skipping")
+                    continue
+
+                title = title_elem.text.strip()
+                print(f"  DEBUG: title = {title}")
+
+                # Extract URL
+                href = link.get('href', '')
+                print(f"  DEBUG: href = {href}")
+                if not href:
+                    continue
+
+                # Make absolute URL if relative
+                if href.startswith('/'):
+                    url = f"https://www.fwc.gov.au{href}"
+                else:
+                    url = href
+
+                # Extract case ID from title
+                case_id = self._extract_case_id_from_text(title)
+                if not case_id:
+                    case_id = "UNKNOWN"
+
+                # Try to find date - it's usually in a nearby sibling or parent
+                # We'll look for date patterns in the surrounding HTML
+                date = self._extract_date_near_element(link)
+
+                # Build result
+                result = {
+                    'case_id': case_id,
+                    'title': title,
+                    'url': url,
+                    'date': date,
+                    'source': 'FWC',
+                    'confidence': 0.85  # High confidence for direct matches
+                }
+
+                results.append(result)
+
+            except Exception as e:
+                # Don't let one bad result break everything
+                print(f"  ⚠️ Error parsing result: {e}")
+                import traceback
+                traceback.print_exc()  # Show full error details
+                continue
 
         return results
 
+
+    def _extract_case_id_from_text(self, text: str) -> Optional[str]:
+        """
+        Extract case ID from text.
+
+        Handles patterns like:
+        - U2024/12345 (unfair dismissal)
+        - AG2020/503 (agreement)
+        - C2015/7667 (general protections)
+        - AE423670 (order reference)
+        """
+        # Pattern 1: Letter(s) + Year + Slash + Number
+        # Examples: U2024/12345, AG2020/503, C2015/7667
+        match = re.search(r'\b([A-Z]+\d{4}/\d+)\b', text)
+        if match:
+            return match.group(1)
+
+        # Pattern 2: Two Letters + 6 Digits
+        # Examples: AE423670, PR717347
+        match = re.search(r'\b([A-Z]{2}\d{6})\b', text)
+        if match:
+            return match.group(1)
+
+        # Pattern 3: Just look for any plausible case number
+        match = re.search(r'\b([A-Z]{1,4}\d{4,7})\b', text)
+        if match:
+            return match.group(1)
+
+        return None
+
+    def _extract_date_near_element(self, element) -> Optional[str]:
+        """
+        Try to find a date near the given element.
+        Returns ISO format (YYYY-MM-DD) if found.
+        """
+        try:
+            # Get parent container
+            parent = element.find_parent()
+            if not parent:
+                return None
+
+            # Look for date patterns in text
+            text = parent.get_text()
+            if not text:  # Safety check
+                return None
+
+            # Pattern: DD Month YYYY (e.g., "18 March 2020")
+            month_names = '|'.join([
+                'January', 'February', 'March', 'April', 'May', 'June',
+                'July', 'August', 'September', 'October', 'November', 'December'
+            ])
+            pattern = rf'(\d{{1,2}}) ({month_names}) (\d{{4}})'
+
+            match = re.search(pattern, text)
+            if match:
+                day, month_name, year = match.groups()
+                try:
+                    date_obj = datetime.strptime(f"{day} {month_name} {year}", "%d %B %Y")
+                    return date_obj.strftime("%Y-%m-%d")
+                except ValueError:
+                    pass
+
+            # Pattern: YYYY-MM-DD (already in ISO format)
+            match = re.search(r'(\d{4}-\d{2}-\d{2})', text)
+            if match:
+                return match.group(1)
+
+            return None
+
+        except Exception:
+            # Date parsing failed, that's okay
+            return None
+
     def _parse_decision(self, html: str, decision_url: str) -> Dict:
         """
         Parse a full FWC decision document.
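A quick sanity check of the case-ID patterns added in _extract_case_id_from_text, run standalone against the examples listed in its docstring (a sketch reusing the same regexes, not the class itself):

# Standalone check of the three case-ID regexes against the docstring examples.
import re

samples = ["U2024/12345", "AG2020/503", "C2015/7667", "AE423670", "PR717347"]
patterns = [
    r'\b([A-Z]+\d{4}/\d+)\b',    # Pattern 1: letters + year + slash + number
    r'\b([A-Z]{2}\d{6})\b',      # Pattern 2: two letters + six digits
    r'\b([A-Z]{1,4}\d{4,7})\b',  # Pattern 3: fallback for any plausible case number
]

for s in samples:
    for p in patterns:
        m = re.search(p, s)
        if m:
            print(f"{s} -> {m.group(1)}")
            break

The date helper works the same way for its first pattern: datetime.strptime("18 March 2020", "%d %B %Y") parses the long form, and strftime("%Y-%m-%d") turns it into "2020-03-18".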
