Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 23 additions & 21 deletions hestia/hestia_utils/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def address(self) -> str:
@address.setter
def address(self, address: str) -> None:
self._address = address

@property
def city(self) -> str:
return self._parsed_city
Expand Down Expand Up @@ -430,17 +430,30 @@ def parse_pararius(self, r: requests.models.Response):
continue
if re.search(r"^[0-9]", address_raw): # Filter "1e Foobarstraat 5", etc.
continue
if not re.search(r"[0-9]", address_raw):

price_text = price_main.get_text(" ", strip=True).replace("\xa0", " ")
# If unable to cast to int, the price is not available so skip the listing
try:
m = re.search(r"(\d[\d\.,]*)", price_text)
if not m:
continue
home.price = int(m.group(1).replace(".", "").replace(",", ""))
except Exception:
continue

# Most Pararius titles are prefixed with a property type; strip when present.
parts = address_raw.split()
if parts and parts[0].lower() in {"appartement", "huis", "studio", "kamer", "woning", "woonhuis", "flat", "house", "room", "apartment"}:
address = " ".join(parts[1:]).strip()
else:
address = address_raw.strip()
ignored = {"appartement", "huis", "studio", "kamer", "woning", "woonhuis", "flat", "house", "room", "apartment"}
address = address_raw.strip()
first, _, address_rest = address.partition(" ")
if first.lower() in ignored:
address = address_rest

if not re.search(r"[0-9]", address):
continue
# Quite a lot of listings on Pararius don't include the house number
# Instead of skipping them, just rely on the amount of rent
# This will still distinguish most houses in the database
address += f" [€{home.price}]"

home.address = address

city_raw = subtitle.get_text(" ", strip=True)
Expand All @@ -453,17 +466,6 @@ def parse_pararius(self, r: requests.models.Response):
continue
home.url = ("https://www.pararius.com" + href) if href.startswith("/") else href

price_text = price_main.get_text(" ", strip=True).replace("\xa0", " ")

# If unable to cast to int, the price is not available so skip the listing
try:
m = re.search(r"(\d[\d\.,]*)", price_text)
if not m:
continue
home.price = int(m.group(1).replace(".", "").replace(",", ""))
except Exception:
continue

sqm_el = res.select_one(".illustrated-features__item--surface-area")
if sqm_el:
sqm_match = re.search(r"(\d+)", sqm_el.get_text(" ", strip=True))
Expand All @@ -484,7 +486,7 @@ def parse_funda(self, r: requests.models.Response):
continue

home = Home(agency="funda")

home.address = f"{res['_source']['address']['street_name']} {res['_source']['address']['house_number']}"
if "house_number_suffix" in res["_source"]["address"].keys():
suffix = res["_source"]["address"]["house_number_suffix"]
Expand Down
3 changes: 2 additions & 1 deletion tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,8 @@ def test_filters_no_house_number(self, mock_response):
html = f"<html>{self._make_listing_html(address='Appartement Kerkstraat')}</html>"
r = mock_response(html)
results = HomeResults("pararius", r)
assert len(results.homes) == 0
assert len(results.homes) == 1
assert results[0].address == "Kerkstraat [€1500]" # < improvised number!

def test_filters_address_starting_with_number(self, mock_response):
"""Addresses where the raw content starts with a digit (no type prefix) are filtered."""
Expand Down
Loading