Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 32 additions & 95 deletions hestia/hestia_utils/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,63 +846,38 @@ def parse_123wonen(self, r: requests.models.Response):
self.homes.append(home)

def parse_roofz(self, r: requests.models.Response):
soup = BeautifulSoup(r.content, "html.parser")

# Find the __NUXT__ script containing all page data
nuxt_data = None
for script in soup.find_all("script"):
if script.string and 'window.__NUXT__' in script.string:
nuxt_data = script.string
break
if not nuxt_data:
return

# Build variable mapping from the Nuxt IIFE: (function(a,b,...){return {...}}(val1,val2,...))
func_start = nuxt_data.index('(function(') + len('(function(')
func_end = nuxt_data.index(')', func_start)
func_args = nuxt_data[func_start:func_end].split(',')

param_end = nuxt_data.rfind('))')
param_start = nuxt_data.rfind('}(', 0, param_end)
params = next(csv.reader([nuxt_data[param_start + 2:param_end]], skipinitialspace=True))

mapping = {}
for i in range(min(len(func_args), len(params))):
mapping[func_args[i]] = params[i]

# Find the rent array in the raw JS
rent_idx = nuxt_data.find('rent:[') + len('rent:')
bracket_count = 0
rent_end = rent_idx
for i, c in enumerate(nuxt_data[rent_idx:]):
if c == '[':
bracket_count += 1
elif c == ']':
bracket_count -= 1
if bracket_count == 0:
rent_end = rent_idx + i + 1
break

# Substitute unquoted variable references with their resolved values
# This preserves quoted strings while resolving bare identifiers
rent_resolved = self._substitute_nuxt_vars(nuxt_data[rent_idx:rent_end], mapping)
results = chompjs.parse_js_object(rent_resolved)
data = r.json()
results = data.get("data", [])

# Fetch remaining pages
meta = data.get("meta", {})
last_page = meta.get("last_page", 1)
if last_page > 1:
for page in range(2, last_page + 1):
url = r.url.split("?")[0] + f"?page={page}"
page_r = requests.get(url, headers=dict(r.request.headers))
if page_r.status_code == 200:
results.extend(page_r.json().get("data", []))

for res in results:
addr = res.get('address', {})
ho = res.get('handover', {})
status = str(res.get('status', ''))

# Filter already-rented and under-option listings
if any(x in status.lower() for x in ['rented', 'option', 'contract']):
addr = res.get("address", {})
ho = res.get("handover", {})
status = res.get("status", {})
status_code = status.get("code", "") if isinstance(status, dict) else str(status)
stage = str(res.get("stage", ""))

# Filter already-rented, under-option and occupied listings
if status_code in ("occupied", "unavailable"):
continue
if stage in ("occupied", "option"):
continue

street = addr.get('street', '')
house_num = str(addr.get('houseNumber', ''))
ext = addr.get('houseNumberExtension', '')
city = addr.get('location', '')
price = ho.get('price', 0)
slug = res.get('slug', '')
street = addr.get("street", "")
house_num = str(addr.get("house_number", ""))
ext = addr.get("house_number_addition", "")
city = addr.get("location", "")
price = ho.get("price", 0)
slug = res.get("slug", "")

if not street or not house_num or not city or not price:
continue
Expand All @@ -912,8 +887,11 @@ def parse_roofz(self, r: requests.models.Response):
if ext:
home.address += f" {ext}"
home.city = city
home.url = f"https://roofz.eu/availability/{slug}"
home.url = f"https://roofz.eu/huur/woningen/{slug}"
home.price = int(float(price))
living_area = res.get("characteristic", {}).get("living_area")
if living_area:
home.sqm = int(living_area)
self.homes.append(home)

def parse_vanderlinden(self, r: requests.models.Response):
Expand Down Expand Up @@ -1374,44 +1352,3 @@ def parse_ld_node(node):
continue

add_home(address, city, link.get("href", ""), price_match.group(1), card_text)

@staticmethod
def _substitute_nuxt_vars(js_text, mapping):
    """Replace unquoted variable references in JS text with quoted string values.

    The text is scanned once, left to right:

    * Single- or double-quoted string literals are copied verbatim
      (backslash escapes are honoured), so their contents are never touched.
    * The literals ``true``, ``false``, ``null`` and ``undefined`` are kept.
    * An identifier in object-key position (preceded by ``{`` or ``,`` and
      followed by ``:``) is kept as-is, even when its name is also present
      in ``mapping``; only values are references to variables.
    * Any other identifier found in ``mapping`` is replaced by its mapped
      value, emitted as a double-quoted, escaped string literal.
    * Everything else (unknown identifiers, punctuation, digits, whitespace)
      is copied unchanged.

    Args:
        js_text: JavaScript source fragment to rewrite.
        mapping: dict of identifier name -> replacement string value.

    Returns:
        The rewritten JavaScript text.
    """
    keywords = ('true', 'false', 'null', 'undefined')
    # Single-pass escaping so the value is safe inside a double-quoted literal.
    escapes = str.maketrans({'\\': '\\\\', '"': '\\"', '\n': '\\n', '\r': '\\r'})

    result = []
    prev = ''  # last non-whitespace character copied from the input so far
    i = 0
    n = len(js_text)
    while i < n:
        c = js_text[i]
        if c in ('"', "'"):
            # Copy the whole string literal, skipping over escaped characters.
            j = i + 1
            while j < n:
                if js_text[j] == '\\' and j + 1 < n:
                    j += 2
                elif js_text[j] == c:
                    j += 1
                    break
                else:
                    j += 1
            result.append(js_text[i:j])
            prev = c
            i = j
        elif c.isalpha() or c in ('_', '$'):
            j = i + 1
            while j < n and (js_text[j].isalnum() or js_text[j] in ('_', '$')):
                j += 1
            ident = js_text[i:j]

            # Peek past whitespace: `{ident:` / `,ident:` is a property key,
            # not a variable reference, and must not be substituted.
            k = j
            while k < n and js_text[k].isspace():
                k += 1
            is_key = prev in ('{', ',') and k < n and js_text[k] == ':'

            if is_key or ident in keywords or ident not in mapping:
                result.append(ident)
            else:
                result.append('"' + mapping[ident].translate(escapes) + '"')
            prev = ident[-1]
            i = j
        else:
            result.append(c)
            if not c.isspace():
                prev = c
            i += 1
    return ''.join(result)
20 changes: 20 additions & 0 deletions misc/sql/20260313_target_roofz_api_migration.sql.enc
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"data": "ENC[AES256_GCM,data:SJlfTkKErbRKW/GZIgBqbOHTFVnVy3ZoW0MBXloTp9/UccnrZhXK2XJOqR+g5jvrJD5UXc9HAIQ+6N7eosV/vyXUwDutUIdi8cT/gAub5vuR9AAp/ZnaYsOcewWvBDujB1iw9XcjkclTyhpcvSqagg7/1aj6HDTlbokuYwRX0mIPEbj9TNP+xqECxf4S7syCoWS5rO2ZQscKliUMDCWt2vPaZDUwdHG4BcNRIoBexTWyIWToX5CxIUmT35b2qagob05TOjgyvDT9ePzRbQmoGt1W0qtawVUQequYm8S5SpOYGD8=,iv:WqWuPnnloDs60v9lfHww1m75CGcem6eL68ux18dLNa4=,tag:ZZ/rmvSr+37WouhchrKf+g==,type:str]",
"sops": {
"kms": null,
"gcp_kms": null,
"azure_kv": null,
"hc_vault": null,
"age": [
{
"recipient": "age1euw2jq4h4m88g377mj39fqpxzsg6hn8gyq72adgzrdgsy9jtj3yqrhdvk5",
"enc": "-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBscjVkTldTci9aV3pDYUdt\nK0tWcVJKT3lZU1NrVDJZalVnY1JDclFtVmgwCldqVTRDZE5qQzNwNGMvWFNidnBN\naFFnanl2K1k0b2hLbjdDY3h3QU11dkEKLS0tIFdVSE9tY1JUKy9NR014VjBFbWFF\nVHdjTUI0OWt0MzhNUFNjZm9ad1IxWlUKkT3GYyMfEJ9W0ANuuuphyBuH4Oalhcp2\nHy+vF0GqEq9GCZeiTGETnkt4bx9Ce08/A0p7vsuGjH7ZJNwzj2sGLA==\n-----END AGE ENCRYPTED FILE-----\n"
}
],
"lastmodified": "2026-03-13T09:16:20Z",
"mac": "ENC[AES256_GCM,data:XeTIZ7emAk7cbQXkLWvaRo9iehgpYSge+57kBJW3RSK+verlZOevXEUCeEZGp/uZXr2tdfjJ5GxUwlO3nxhfgmbUQUG5D3RuMm+uGSEfxJ1AfuhwRkrIx4czPf/LowbUhNy2R+GjHz27pE9TD9lRWMkjy3CkQqPC849/vt7Z5vE=,iv:vZwBhMf5Kkpv8IqyS3qqoDT7q83atcPvNDp0rN8oqsU=,tag:HL0zc1J+Um4uOF51U7+R4A==,type:str]",
"pgp": null,
"unencrypted_suffix": "_unencrypted",
"version": "3.9.4"
}
}
192 changes: 59 additions & 133 deletions tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1070,123 +1070,86 @@ def test_with_extension_in_slug(self, mock_response):


class TestParseRoofz:
def _make_nuxt_html(self, rent_items, mapping=None):
"""Build minimal Nuxt IIFE HTML for roofz parser testing."""
if mapping is None:
mapping = {}

func_args = list(mapping.keys()) if mapping else ["a"]
params = list(mapping.values()) if mapping else ['"unused"']

func_args_str = ",".join(func_args)
params_str = ",".join(params)

rent_js = json.dumps(rent_items)

script = f"""<script>window.__NUXT__=(function({func_args_str}){{return {{rent:{rent_js}}}}})({params_str}));</script>"""
return f"<html>{script}</html>"
def _make_response(self, mock_response, listings, meta=None):
"""Build a mock JSON API response for roofz parser testing."""
payload = {"data": listings, "meta": meta or {"last_page": 1}}
r = mock_response(payload)
r.json.return_value = payload
r.url = "https://roofz.eu/api/ms/listing/properties"
return r

def _listing(self, street="Kerkstraat", house_number="10",
addition="", city="Amsterdam", price=1500,
slug="kerkstraat-10", status_code="available",
stage="available", living_area=75):
listing = {
"status": {"code": status_code, "label": status_code.title()},
"stage": stage,
"slug": slug,
"address": {
"street": street,
"house_number": house_number,
"house_number_addition": addition,
"location": city,
},
"handover": {"price": price},
}
if living_area is not None:
listing["characteristic"] = {"living_area": living_area}
return listing

def test_basic_parsing(self, mock_response):
rent_items = [
{
"status": "available",
"slug": "kerkstraat-10-amsterdam",
"address": {
"street": "Kerkstraat",
"houseNumber": 10,
"houseNumberExtension": "",
"location": "Amsterdam"
},
"handover": {"price": 1500}
}
]
html = self._make_nuxt_html(rent_items)
r = mock_response(html)
r = self._make_response(mock_response, [self._listing()])
results = HomeResults("roofz", r)
assert len(results.homes) == 1
assert results[0].address == "Kerkstraat 10"
assert results[0].city == "Amsterdam"
assert results[0].price == 1500
assert "roofz.eu/availability/" in results[0].url
assert results[0].sqm == 75
assert results[0].url == "https://roofz.eu/huur/woningen/kerkstraat-10"

def test_with_extension(self, mock_response):
rent_items = [
{
"status": "available",
"slug": "kerkstraat-10a",
"address": {
"street": "Kerkstraat",
"houseNumber": 10,
"houseNumberExtension": "A",
"location": "Amsterdam"
},
"handover": {"price": 1500}
}
]
html = self._make_nuxt_html(rent_items)
r = mock_response(html)
def test_missing_living_area(self, mock_response):
r = self._make_response(mock_response, [self._listing(living_area=None)])
results = HomeResults("roofz", r)
assert results[0].sqm == -1

def test_with_addition(self, mock_response):
r = self._make_response(mock_response, [
self._listing(addition="A", slug="kerkstraat-10a"),
])
results = HomeResults("roofz", r)
assert results[0].address == "Kerkstraat 10 A"

def test_filters_rented(self, mock_response):
rent_items = [
{
"status": "rented",
"slug": "straat-1",
"address": {
"street": "Straat",
"houseNumber": 1,
"houseNumberExtension": "",
"location": "Amsterdam"
},
"handover": {"price": 1000}
}
]
html = self._make_nuxt_html(rent_items)
r = mock_response(html)
def test_filters_occupied(self, mock_response):
r = self._make_response(mock_response, [
self._listing(status_code="occupied", stage="occupied"),
])
results = HomeResults("roofz", r)
assert len(results.homes) == 0

def test_filters_under_option(self, mock_response):
rent_items = [
{
"status": "under option",
"slug": "straat-1",
"address": {
"street": "Straat",
"houseNumber": 1,
"houseNumberExtension": "",
"location": "Amsterdam"
},
"handover": {"price": 1000}
}
]
html = self._make_nuxt_html(rent_items)
r = mock_response(html)
def test_filters_unavailable(self, mock_response):
r = self._make_response(mock_response, [
self._listing(status_code="unavailable", stage="option"),
])
results = HomeResults("roofz", r)
assert len(results.homes) == 0

def test_filters_option_stage(self, mock_response):
r = self._make_response(mock_response, [
self._listing(status_code="available", stage="option"),
])
results = HomeResults("roofz", r)
assert len(results.homes) == 0

def test_filters_missing_street(self, mock_response):
rent_items = [
{
"status": "available",
"slug": "unknown",
"address": {
"street": "",
"houseNumber": 1,
"location": "Amsterdam"
},
"handover": {"price": 1000}
}
]
html = self._make_nuxt_html(rent_items)
r = mock_response(html)
r = self._make_response(mock_response, [
self._listing(street=""),
])
results = HomeResults("roofz", r)
assert len(results.homes) == 0

def test_no_nuxt_script(self, mock_response):
html = "<html><body>No nuxt data</body></html>"
r = mock_response(html)
def test_empty_data(self, mock_response):
r = self._make_response(mock_response, [])
results = HomeResults("roofz", r)
assert len(results.homes) == 0

Expand Down Expand Up @@ -1609,40 +1572,3 @@ def test_filters_address_without_house_number(self, mock_response):
assert len(results.homes) == 0


class TestSubstituteNuxtVars:
    """Unit tests for ``HomeResults._substitute_nuxt_vars``."""

    def test_replaces_variables(self):
        """Mapped bare identifiers show up as quoted values in the output."""
        resolved = HomeResults._substitute_nuxt_vars(
            '{street:a,city:b}',
            {"a": "Kerkstraat", "b": "Amsterdam"},
        )
        assert '"Kerkstraat"' in resolved
        assert '"Amsterdam"' in resolved

    def test_preserves_quoted_strings(self):
        """An existing string literal survives next to a substituted value."""
        resolved = HomeResults._substitute_nuxt_vars(
            '{name:"hello",value:a}',
            {"a": "world"},
        )
        assert '"hello"' in resolved
        assert '"world"' in resolved

    def test_preserves_keywords(self):
        """The literals ``true`` and ``null`` are still present in the output."""
        resolved = HomeResults._substitute_nuxt_vars(
            '{active:true,data:null,value:a}',
            {"a": "test"},
        )
        assert 'true' in resolved
        assert 'null' in resolved

    def test_escapes_special_chars_in_values(self):
        """Double quotes inside a mapped value are backslash-escaped."""
        resolved = HomeResults._substitute_nuxt_vars(
            '{name:a}',
            {"a": 'he said "hi"'},
        )
        assert '\\"' in resolved

    def test_unmapped_identifier_preserved(self):
        """An identifier with no mapping entry is left in the output."""
        resolved = HomeResults._substitute_nuxt_vars('{name:unknownVar}', {})
        assert 'unknownVar' in resolved

    def test_empty_input(self):
        """Empty text with an empty mapping yields an empty string."""
        assert HomeResults._substitute_nuxt_vars('', {}) == ''
Loading