From e28a563fd7c745962362e4bea51ac793b03a44c3 Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Thu, 11 Jun 2026 10:30:33 +0800 Subject: [PATCH 1/7] Scrape place reservation provider links --- src/gmaps_scraper/models.py | 17 ++ src/gmaps_scraper/place_scraper.py | 345 +++++++++++++++++++++++++++++ tests/test_place_scraper.py | 129 +++++++++++ 3 files changed, 491 insertions(+) diff --git a/src/gmaps_scraper/models.py b/src/gmaps_scraper/models.py index ab1d538..5688eb8 100644 --- a/src/gmaps_scraper/models.py +++ b/src/gmaps_scraper/models.py @@ -270,6 +270,21 @@ def to_dict(self) -> dict[str, object]: } +@dataclass(slots=True) +class PlaceReservationLink: + """A visible booking or reservation provider link from a place page.""" + + label: str + url: str + + def to_dict(self) -> dict[str, object]: + """Convert a reservation link into a JSON-serializable dictionary.""" + return { + "label": self.label, + "url": self.url, + } + + @dataclass(slots=True) class PlaceDetails: """A parsed Google Maps place page.""" @@ -293,6 +308,7 @@ class PlaceDetails: located_in: str | None = None status: str | None = None website: str | None = None + reservation_links: list[PlaceReservationLink] = field(default_factory=list) phone: str | None = None plus_code: str | None = None address_parts: AddressParts | None = None @@ -334,6 +350,7 @@ def to_dict(self) -> dict[str, object]: "located_in": self.located_in, "status": self.status, "website": self.website, + "reservation_links": [link.to_dict() for link in self.reservation_links], "phone": self.phone, "plus_code": self.plus_code, "address_parts": self.address_parts, diff --git a/src/gmaps_scraper/place_scraper.py b/src/gmaps_scraper/place_scraper.py index 2e6783f..f6222e3 100644 --- a/src/gmaps_scraper/place_scraper.py +++ b/src/gmaps_scraper/place_scraper.py @@ -23,6 +23,7 @@ PlaceDetails, PlaceExtractionDiagnostics, PlaceLLMRepairRequest, + PlaceReservationLink, PlaceReview, PlaceScrapeResult, ReviewTopic, @@ -990,6 +991,76 @@ } return prices; }; + const providerLabelFromUrl = (href) => { + try { + const host = new URL(href).hostname.replace(/^www\./, ""); + const base = host.split(".")[0] || host; + return base + .replace(/[-_]+/g, " ") + .replace(/\b\w/g, (char) => char.toUpperCase()); + } catch { + return "Find a Table"; + } + }; + const reservationLabel = (element, href) => { + const actionPrefixPattern = new RegExp( + "^(?:find a table|reserve|make a reservation|book(?: a table)?)" + + "(?:\\s+(?:with|on|at|via))?\\s*", + "i", + ); + const raw = cleanLine( + element.innerText + || element.textContent + || element.getAttribute("aria-label") + || element.getAttribute("title") + || "", + ); + const cleaned = raw + .replace(actionPrefixPattern, "") + .replace(/\s+(?:opens in new tab|website)$/i, "") + .trim(); + return cleaned || providerLabelFromUrl(href); + }; + const collectReservationLinks = () => { + const links = []; + const seen = new Set(); + const reservationPattern = new RegExp( + String.raw`\b(find a table|reserve|reservation|book(?: a table)?|booking)\b`, + "i", + ); + const providerHostPattern = new RegExp( + "(opentable|resy|sevenrooms|thefork|tock|quandoo|yelp|inline|" + + "tablecheck|exploretock|omakase|pocket-concierge|pocketconcierge|" + + "tabelog|hotpepper|gnavi|gurunavi|ikyu|jpneazy|byfood|autoreserve)", + "i", + ); + for (const element of panel.querySelectorAll("a[href]")) { + const href = element.href || element.getAttribute("href") || ""; + if (!/^https?:\/\//i.test(href) || seen.has(href)) { + continue; + } + const evidence = [ + element.innerText, + element.textContent, + element.getAttribute("aria-label"), + element.getAttribute("title"), + element.getAttribute("data-item-id"), + href, + ].filter(Boolean).join(" "); + if (!reservationPattern.test(evidence) && !providerHostPattern.test(evidence)) { + continue; + } + seen.add(href); + links.push({ + label: reservationLabel(element, href), + url: href, + }); + if (links.length >= 8) { + break; + } + } + return links; + }; const roomOverlayPrice = () => { const selectors = [ ".rlmNhf button[aria-label]", @@ -1120,6 +1191,7 @@ located_in: itemValue("locatedin"), status: firstText(["div.OqCZI .ZDu9vd", "div.OqCZI .o0Svhf"]), website: firstAttr(["a[data-item-id='authority']"], "href", document) || itemValue("authority"), + reservation_links: collectReservationLinks(), phone: firstText([ "button[data-item-id^='phone:'] .Io6YTe", "button[data-item-id^='phone:']", @@ -1189,6 +1261,138 @@ return false; } """ +_PLACE_RESERVATION_BUTTON_CLICK_JS = r""" +() => { +""" + _PLACE_PANEL_HELPERS_JS + r""" + + const root = placePanelRoot().root; + const reservationPattern = /\b(find a table|reserve|reservation|book(?: a table)?|booking)\b/i; + const candidates = [ + ...root.querySelectorAll("button, div[role='button']"), + ]; + for (const element of candidates) { + const text = cleanLine(element.innerText || element.textContent || ""); + const ariaLabel = cleanLine(element.getAttribute("aria-label") || ""); + const title = cleanLine(element.getAttribute("title") || ""); + const itemId = cleanLine(element.getAttribute("data-item-id") || ""); + const evidence = `${text} ${ariaLabel} ${title} ${itemId}`; + if (!reservationPattern.test(evidence)) { + continue; + } + const {rect, visibleArea} = visibleRect(element); + if (visibleArea <= 0 || rect.width <= 0 || rect.height <= 0) { + continue; + } + element.click(); + return true; + } + return false; +} +""" +_PLACE_RESERVATION_DIALOG_JS = r""" +() => { +""" + _PLACE_PANEL_HELPERS_JS + r""" + + const providerLabelFromUrl = (href) => { + try { + const host = new URL(href).hostname.replace(/^www\./, ""); + const base = host.split(".")[0] || host; + return base + .replace(/[-_]+/g, " ") + .replace(/\b\w/g, (char) => char.toUpperCase()); + } catch { + return "Find a Table"; + } + }; + const cleanReservationLabel = (value, href) => { + const actionPrefixPattern = new RegExp( + "^(?:find a table|reserve|make a reservation|book(?: a table)?)" + + "(?:\\s+(?:with|on|at|via))?\\s*", + "i", + ); + const cleaned = cleanLine(value) + .replace(actionPrefixPattern, "") + .replace(/\s+(?:opens in new tab|website)$/i, "") + .trim(); + return cleaned || providerLabelFromUrl(href); + }; + const providerHostPattern = new RegExp( + "(opentable|resy|sevenrooms|thefork|tock|quandoo|yelp|inline|" + + "tablecheck|exploretock|omakase|pocket-concierge|pocketconcierge|" + + "tabelog|hotpepper|gnavi|gurunavi|ikyu|jpneazy|byfood|autoreserve)", + "i", + ); + const rejectHostPattern = new RegExp( + String.raw`(^|\.)google(?:\.[a-z]{2,}){1,2}$` + + String.raw`|(^|\.)gstatic\.com$` + + String.raw`|(^|\.)googleusercontent\.com$`, + "i", + ); + const dialogs = [ + ...document.querySelectorAll("[role='dialog'], [aria-modal='true']"), + ].filter((element) => { + const {rect, visibleArea} = visibleRect(element); + return visibleArea > 0 && rect.width >= 120 && rect.height >= 80; + }); + const roots = dialogs.length ? dialogs : [document.body]; + const links = []; + const seen = new Set(); + for (const root of roots) { + for (const element of root.querySelectorAll("a[href]")) { + const href = element.href || element.getAttribute("href") || ""; + if (!/^https?:\/\//i.test(href) || seen.has(href)) { + continue; + } + let host = ""; + try { + host = new URL(href).hostname; + } catch { + continue; + } + const rawLabel = [ + element.innerText, + element.textContent, + element.getAttribute("aria-label"), + element.getAttribute("title"), + ].filter(Boolean).join(" "); + const evidence = `${rawLabel} ${href}`; + if (rejectHostPattern.test(host) && !/\/maps\/reserve\b/i.test(href)) { + continue; + } + if (!dialogs.length && !providerHostPattern.test(evidence)) { + continue; + } + seen.add(href); + links.push({ + label: cleanReservationLabel(rawLabel, href), + url: href, + }); + if (links.length >= 8) { + break; + } + } + if (links.length >= 8) { + break; + } + } + for (const dialog of dialogs) { + for (const button of dialog.querySelectorAll("button, div[role='button']")) { + const label = cleanLine( + button.getAttribute("aria-label") + || button.getAttribute("title") + || button.innerText + || button.textContent + || "", + ); + if (/^(close|閉じる|關閉|关闭|닫기)$/i.test(label)) { + button.click(); + return links; + } + } + } + return links; +} +""" _PLACE_REVIEW_TAB_CLICK_JS = r""" () => { """ + _PLACE_PANEL_HELPERS_JS + r""" @@ -2207,6 +2411,16 @@ def _collect_place_snapshot_with_context( dom_snapshot = page.evaluate(_PLACE_JS_EXTRACTOR) if overview_screenshot_path is not None: _write_place_screenshot(page, overview_screenshot_path) + if isinstance(dom_snapshot, Mapping): + reservation_snapshot = _collect_reservation_dialog_snapshot( + page, + timeout_ms=timeout_ms, + ) + if reservation_snapshot: + dom_snapshot = _merge_reservation_links( + dom_snapshot, + reservation_snapshot, + ) if collect_reviews and isinstance(dom_snapshot, Mapping): review_snapshot = _collect_review_panel_snapshot(page, timeout_ms=timeout_ms) if review_snapshot: @@ -2488,6 +2702,23 @@ def _collect_about_panel_snapshot(page: Any, *, timeout_ms: int) -> dict[str, ob return {"about_sections": sections} +def _collect_reservation_dialog_snapshot(page: Any, *, timeout_ms: int) -> dict[str, object]: + try: + clicked = page.evaluate(_PLACE_RESERVATION_BUTTON_CLICK_JS) + except Exception: + return {} + if clicked is not True: + return {} + page.wait_for_timeout(min(max(timeout_ms // 20, 1_000), 1_500)) + try: + reservation_links = page.evaluate(_PLACE_RESERVATION_DIALOG_JS) + except Exception: + return {} + if not isinstance(reservation_links, list): + return {} + return {"reservation_links": reservation_links} + + def _build_place_details( source_url: str, *, @@ -2583,6 +2814,7 @@ def _build_place_details( located_in=_clean_text(snapshot.get("located_in")), status=_clean_text(snapshot.get("status")) or _extract_status_from_lines(combined_lines), website=_normalize_website(snapshot.get("website")), + reservation_links=_normalize_reservation_links(snapshot.get("reservation_links")), phone=_normalize_phone_candidate(snapshot.get("phone")) or _extract_phone_from_lines(combined_lines), plus_code=_clean_plus_code_text(snapshot.get("plus_code")) @@ -2655,6 +2887,33 @@ def _merge_place_sources( return merged +def _merge_reservation_links( + primary: Mapping[str, object], + secondary: Mapping[str, object], +) -> dict[str, object]: + merged = dict(primary) + links: list[object] = [] + seen_urls: set[str] = set() + for source in (primary, secondary): + raw_links = source.get("reservation_links") + if not isinstance(raw_links, list): + continue + for raw_link in raw_links: + if not isinstance(raw_link, Mapping): + continue + raw_url = _clean_text(raw_link.get("url")) + if raw_url is None: + continue + url = _normalize_preview_website(raw_url) + if url is None or url in seen_urls: + continue + seen_urls.add(url) + links.append(raw_link) + if links: + merged["reservation_links"] = links + return merged + + def _merge_ordered_place_sources( *sources: tuple[Mapping[str, object], str], ) -> dict[str, object]: @@ -4526,6 +4785,92 @@ def _normalize_website(value: object) -> str | None: return _normalize_preview_website(text) +def _normalize_reservation_links(value: object) -> list[PlaceReservationLink]: + if not isinstance(value, list): + return [] + links: list[PlaceReservationLink] = [] + seen_urls: set[str] = set() + for item in value: + if not isinstance(item, Mapping): + continue + raw_label = item.get("label") + raw_url = item.get("url") + url = _clean_text(raw_url) + if url is None: + continue + parsed = urlparse(url) + if parsed.scheme not in {"http", "https"} or not parsed.netloc: + continue + normalized_url = _normalize_preview_website(url) if "google.com" in parsed.netloc else url + if normalized_url is None: + normalized_url = url if parsed.netloc.endswith("google.com") else None + if normalized_url is None or normalized_url in seen_urls: + continue + label = _clean_reservation_label(raw_label, normalized_url) + links.append(PlaceReservationLink(label=label[:80], url=normalized_url)) + seen_urls.add(normalized_url) + return links + + +def _clean_reservation_label(value: object, url: str) -> str: + fallback = _reservation_provider_label_from_url(url) + label = _clean_text(value) + if label is None: + return fallback + label = re.sub(r"[\ue000-\uf8ff]", " ", label) + label = re.sub( + r"^(?:find a table|reserve|make a reservation|book(?: a table)?)" + r"(?:\s+(?:with|on|at|via))?\s*", + "", + label, + flags=re.IGNORECASE, + ) + label = re.sub(r"\s+(?:opens in new tab|website)$", "", label, flags=re.IGNORECASE) + label = _clean_text(label) + if label is None: + return fallback + + tokens = label.split() + unique_tokens = list(dict.fromkeys(token.casefold() for token in tokens)) + if len(unique_tokens) == 1 and len(tokens) > 1: + label = tokens[0] + if "." in label or re.fullmatch(r"(?:https?://)?[A-Za-z0-9.-]+/?", label): + return fallback + return label + + +def _reservation_provider_label_from_url(url: str) -> str: + host = (urlparse(url).hostname or "").lower().removeprefix("www.") + known_hosts = ( + ("tablecheck.", "TableCheck"), + ("resy.", "Resy"), + ("opentable.", "OpenTable"), + ("sevenrooms.", "SevenRooms"), + ("thefork.", "TheFork"), + ("exploretock.", "Tock"), + ("tock.", "Tock"), + ("quandoo.", "Quandoo"), + ("omakase.", "Omakase"), + ("pocket-concierge.", "Pocket Concierge"), + ("pocketconcierge.", "Pocket Concierge"), + ("tabelog.", "Tabelog"), + ("hotpepper.", "Hot Pepper"), + ("gnavi.", "Gurunavi"), + ("gurunavi.", "Gurunavi"), + ("ikyu.", "Ikyu"), + ("jpneazy.", "JPNEAZY"), + ("byfood.", "ByFood"), + ("autoreserve.", "AutoReserve"), + ) + for marker, label in known_hosts: + if marker in host: + return label + base = host.split(".")[0] if host else "" + if not base: + return "Find a Table" + return base.replace("-", " ").replace("_", " ").title() + + def _extract_coordinate_from_url(url: str, *, index: int) -> float | None: match = re.search(r"@(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)", url) if match is None: diff --git a/tests/test_place_scraper.py b/tests/test_place_scraper.py index 2a4e450..12d239d 100644 --- a/tests/test_place_scraper.py +++ b/tests/test_place_scraper.py @@ -15,6 +15,8 @@ _PLACE_ABOUT_TAB_CLICK_JS, _PLACE_DETAIL_READY_JS, _PLACE_JS_EXTRACTOR, + _PLACE_RESERVATION_BUTTON_CLICK_JS, + _PLACE_RESERVATION_DIALOG_JS, _PLACE_REVIEW_TAB_CLICK_JS, _PLACE_REVIEW_TOPIC_JS, _PLACE_SEARCH_RESULT_CLICK_JS, @@ -26,6 +28,7 @@ _clean_category_text, _clean_description_text, _clean_name_text, + _collect_reservation_dialog_snapshot, _extract_address_from_lines, _extract_admission_price_from_lines, _extract_preview_address, @@ -40,10 +43,12 @@ _looks_like_google_maps_place_url, _merge_llm_place_fields, _merge_place_sources, + _merge_reservation_links, _normalize_google_place_id, _normalize_phone_candidate, _normalize_photo_url, _normalize_preview_website, + _normalize_reservation_links, _normalize_review_topics, _normalize_reviews, _normalize_website, @@ -167,6 +172,66 @@ def close(self) -> None: review_signal.assert_not_called() self.assertEqual(screenshot_path.read_bytes(), b"screenshot") + def test_collect_reservation_dialog_snapshot_clicks_and_reads_provider_links(self) -> None: + class _FakePage: + def __init__(self) -> None: + self.waited: list[int] = [] + + def evaluate(self, script: object) -> object: + if script == _PLACE_RESERVATION_BUTTON_CLICK_JS: + return True + if script == _PLACE_RESERVATION_DIALOG_JS: + return [{"label": "TableCheck", "url": "https://www.tablecheck.com/example"}] + return None + + def wait_for_timeout(self, value: int) -> None: + self.waited.append(value) + + page = _FakePage() + + self.assertEqual( + _collect_reservation_dialog_snapshot(page, timeout_ms=30_000), + {"reservation_links": [{"label": "TableCheck", "url": "https://www.tablecheck.com/example"}]}, + ) + self.assertEqual(page.waited, [1_500]) + + def test_collect_reservation_dialog_snapshot_skips_when_no_button(self) -> None: + class _FakePage: + def __init__(self) -> None: + self.waited: list[int] = [] + + def evaluate(self, script: object) -> object: + if script == _PLACE_RESERVATION_BUTTON_CLICK_JS: + return False + raise AssertionError("dialog should not be read") + + def wait_for_timeout(self, value: int) -> None: + self.waited.append(value) + + page = _FakePage() + + self.assertEqual(_collect_reservation_dialog_snapshot(page, timeout_ms=30_000), {}) + self.assertEqual(page.waited, []) + + def test_merge_reservation_links_dedupes_overview_and_dialog_links(self) -> None: + merged = _merge_reservation_links( + {"reservation_links": [{"label": "Resy", "url": "https://resy.com/example"}]}, + { + "reservation_links": [ + {"label": "Resy duplicate", "url": "https://resy.com/example"}, + {"label": "TableCheck", "url": "https://www.tablecheck.com/example"}, + ] + }, + ) + + self.assertEqual( + merged["reservation_links"], + [ + {"label": "Resy", "url": "https://resy.com/example"}, + {"label": "TableCheck", "url": "https://www.tablecheck.com/example"}, + ], + ) + def test_scrape_places_reuses_context_and_retries_quality_flags(self) -> None: class _FakeContext: def __init__(self) -> None: @@ -2158,6 +2223,29 @@ def test_build_place_details_preserves_photo_url(self) -> None: "https://lh3.googleusercontent.com/p/example=s680-w680-h510", ) + def test_build_place_details_preserves_reservation_links(self) -> None: + details = _build_place_details( + "https://www.google.com/maps/place/Open+Kitchen", + resolved_url="https://www.google.com/maps/place/Open+Kitchen", + snapshot={ + "name": "Open Kitchen", + "reservation_links": [ + {"label": "TableCheck", "url": "https://www.tablecheck.com/example"}, + {"label": "Bad", "url": "javascript:alert(1)"}, + ], + "body_text": "Open Kitchen", + }, + ) + + self.assertEqual( + [link.to_dict() for link in details.reservation_links], + [{"label": "TableCheck", "url": "https://www.tablecheck.com/example"}], + ) + self.assertEqual( + details.to_dict()["reservation_links"], + [{"label": "TableCheck", "url": "https://www.tablecheck.com/example"}], + ) + def test_build_place_details_preserves_google_place_id(self) -> None: details = _build_place_details( "https://www.google.com/maps/place/Den", @@ -2642,6 +2730,47 @@ def test_normalize_website_rejects_non_http_urls(self) -> None: self.assertIsNone(_normalize_website("mailto:test@example.com")) self.assertIsNone(_normalize_website("example.com")) + def test_normalize_reservation_links_keeps_http_provider_links(self) -> None: + links = _normalize_reservation_links( + [ + {"label": "Resy", "url": "https://resy.com/cities/lisbon/example"}, + {"label": "Duplicate", "url": "https://resy.com/cities/lisbon/example"}, + {"label": "Bad", "url": "javascript:alert(1)"}, + ] + ) + + self.assertEqual( + [link.to_dict() for link in links], + [{"label": "Resy", "url": "https://resy.com/cities/lisbon/example"}], + ) + + def test_normalize_reservation_links_cleans_google_dialog_labels(self) -> None: + links = _normalize_reservation_links( + [ + { + "label": "\ue878 Find a table tablecheck.com", + "url": "https://www.tablecheck.com/markstokyo/reserve", + }, + { + "label": "ikyu.com ikyu.com ikyu.com", + "url": "https://restaurant.ikyu.com/112767/?ikgo=2", + }, + { + "label": "autoreserve.com autoreserve.com autoreserve.com", + "url": "https://autoreserve.com/restaurants/example", + }, + ] + ) + + self.assertEqual( + [link.to_dict() for link in links], + [ + {"label": "TableCheck", "url": "https://www.tablecheck.com/markstokyo/reserve"}, + {"label": "Ikyu", "url": "https://restaurant.ikyu.com/112767/?ikgo=2"}, + {"label": "AutoReserve", "url": "https://autoreserve.com/restaurants/example"}, + ], + ) + def test_merge_place_sources_only_backfills_missing_fields(self) -> None: merged = _merge_place_sources( { From 64ac31ebb356721fac02747d442488d755b7c9ca Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Thu, 11 Jun 2026 11:15:03 +0800 Subject: [PATCH 2/7] Prefer direct reservation provider links --- src/gmaps_scraper/place_scraper.py | 11 +++++++++++ tests/test_place_scraper.py | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/src/gmaps_scraper/place_scraper.py b/src/gmaps_scraper/place_scraper.py index f6222e3..a9cee8a 100644 --- a/src/gmaps_scraper/place_scraper.py +++ b/src/gmaps_scraper/place_scraper.py @@ -4809,9 +4809,20 @@ def _normalize_reservation_links(value: object) -> list[PlaceReservationLink]: label = _clean_reservation_label(raw_label, normalized_url) links.append(PlaceReservationLink(label=label[:80], url=normalized_url)) seen_urls.add(normalized_url) + if any(not _reservation_link_is_google_reserve(link.url) for link in links): + links = [link for link in links if not _reservation_link_is_google_reserve(link.url)] return links +def _reservation_link_is_google_reserve(url: str) -> bool: + parsed = urlparse(url) + host = parsed.netloc.lower() + return ( + host in {"www.google.com", "google.com", "maps.google.com"} + and parsed.path.startswith("/maps/reserve") + ) + + def _clean_reservation_label(value: object, url: str) -> str: fallback = _reservation_provider_label_from_url(url) label = _clean_text(value) diff --git a/tests/test_place_scraper.py b/tests/test_place_scraper.py index 12d239d..7de57be 100644 --- a/tests/test_place_scraper.py +++ b/tests/test_place_scraper.py @@ -2759,6 +2759,10 @@ def test_normalize_reservation_links_cleans_google_dialog_labels(self) -> None: "label": "autoreserve.com autoreserve.com autoreserve.com", "url": "https://autoreserve.com/restaurants/example", }, + { + "label": "Reserve a table", + "url": "https://www.google.com/maps/reserve/v/dine/c/example", + }, ] ) From 7370830c63f9f970afded22fe1b469066023035e Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Thu, 11 Jun 2026 16:03:56 +0800 Subject: [PATCH 3/7] Capture provider popup reservation links --- src/gmaps_scraper/place_scraper.py | 21 +++++++++++++++++++-- tests/test_place_scraper.py | 10 ++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/gmaps_scraper/place_scraper.py b/src/gmaps_scraper/place_scraper.py index a9cee8a..15e5ce5 100644 --- a/src/gmaps_scraper/place_scraper.py +++ b/src/gmaps_scraper/place_scraper.py @@ -1334,7 +1334,23 @@ const {rect, visibleArea} = visibleRect(element); return visibleArea > 0 && rect.width >= 120 && rect.height >= 80; }); - const roots = dialogs.length ? dialogs : [document.body]; + const providerPanels = [ + ...document.querySelectorAll("div, section"), + ].filter((element) => { + const {rect, visibleArea} = visibleRect(element); + if (visibleArea <= 0 || rect.width < 120 || rect.height < 80) { + return false; + } + const text = cleanLine(element.innerText || element.textContent || ""); + return /\bcontinue with\b/i.test(text) && /\babout these providers\b/i.test(text); + }).sort((left, right) => { + const leftRect = left.getBoundingClientRect(); + const rightRect = right.getBoundingClientRect(); + return (leftRect.width * leftRect.height) - (rightRect.width * rightRect.height); + }); + const providerRoots = dialogs.length ? dialogs : providerPanels.slice(0, 1); + const roots = providerRoots.length ? providerRoots : [document.body]; + const hasTrustedProviderRoot = providerRoots.length > 0; const links = []; const seen = new Set(); for (const root of roots) { @@ -1359,7 +1375,7 @@ if (rejectHostPattern.test(host) && !/\/maps\/reserve\b/i.test(href)) { continue; } - if (!dialogs.length && !providerHostPattern.test(evidence)) { + if (!hasTrustedProviderRoot && !providerHostPattern.test(evidence)) { continue; } seen.add(href); @@ -4872,6 +4888,7 @@ def _reservation_provider_label_from_url(url: str) -> str: ("jpneazy.", "JPNEAZY"), ("byfood.", "ByFood"), ("autoreserve.", "AutoReserve"), + ("sg-management.", "SG Management"), ) for marker, label in known_hosts: if marker in host: diff --git a/tests/test_place_scraper.py b/tests/test_place_scraper.py index 7de57be..f1840fb 100644 --- a/tests/test_place_scraper.py +++ b/tests/test_place_scraper.py @@ -195,6 +195,11 @@ def wait_for_timeout(self, value: int) -> None: ) self.assertEqual(page.waited, [1_500]) + def test_reservation_dialog_extractor_trusts_provider_popup_without_dialog_role(self) -> None: + self.assertIn("continue with", _PLACE_RESERVATION_DIALOG_JS) + self.assertIn("about these providers", _PLACE_RESERVATION_DIALOG_JS) + self.assertIn("hasTrustedProviderRoot", _PLACE_RESERVATION_DIALOG_JS) + def test_collect_reservation_dialog_snapshot_skips_when_no_button(self) -> None: class _FakePage: def __init__(self) -> None: @@ -2759,6 +2764,10 @@ def test_normalize_reservation_links_cleans_google_dialog_labels(self) -> None: "label": "autoreserve.com autoreserve.com autoreserve.com", "url": "https://autoreserve.com/restaurants/example", }, + { + "label": "\ue157 sg-management.jp \ue157sg-management.jp sg-management.jp", + "url": "https://sg-management.jp/reserve/", + }, { "label": "Reserve a table", "url": "https://www.google.com/maps/reserve/v/dine/c/example", @@ -2772,6 +2781,7 @@ def test_normalize_reservation_links_cleans_google_dialog_labels(self) -> None: {"label": "TableCheck", "url": "https://www.tablecheck.com/markstokyo/reserve"}, {"label": "Ikyu", "url": "https://restaurant.ikyu.com/112767/?ikgo=2"}, {"label": "AutoReserve", "url": "https://autoreserve.com/restaurants/example"}, + {"label": "SG Management", "url": "https://sg-management.jp/reserve/"}, ], ) From 2580f5025e528c3cf401da508b25db7b090c672e Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Thu, 11 Jun 2026 18:14:54 +0800 Subject: [PATCH 4/7] Close reservation provider popups --- src/gmaps_scraper/place_scraper.py | 36 +++++++++++++++++++----------- tests/test_place_scraper.py | 2 ++ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/gmaps_scraper/place_scraper.py b/src/gmaps_scraper/place_scraper.py index 15e5ce5..7118b0a 100644 --- a/src/gmaps_scraper/place_scraper.py +++ b/src/gmaps_scraper/place_scraper.py @@ -1351,6 +1351,24 @@ const providerRoots = dialogs.length ? dialogs : providerPanels.slice(0, 1); const roots = providerRoots.length ? providerRoots : [document.body]; const hasTrustedProviderRoot = providerRoots.length > 0; + const closeProviderRoot = (root) => { + const closePattern = /(^|\b)(close|dismiss|cancel|modal-close|閉じ|關閉|关闭|닫기|닫|×)(\b|$)/i; + const closeCandidates = "button, div[role='button'], [aria-label], [title]"; + for (const button of root.querySelectorAll(closeCandidates)) { + const label = cleanLine([ + button.getAttribute("aria-label"), + button.getAttribute("title"), + button.innerText, + button.textContent, + button.className, + ].filter(Boolean).join(" ")); + if (closePattern.test(label)) { + button.click(); + return true; + } + } + return false; + }; const links = []; const seen = new Set(); for (const root of roots) { @@ -1391,21 +1409,13 @@ break; } } - for (const dialog of dialogs) { - for (const button of dialog.querySelectorAll("button, div[role='button']")) { - const label = cleanLine( - button.getAttribute("aria-label") - || button.getAttribute("title") - || button.innerText - || button.textContent - || "", - ); - if (/^(close|閉じる|關閉|关闭|닫기)$/i.test(label)) { - button.click(); - return links; - } + for (const root of providerRoots) { + if (closeProviderRoot(root)) { + return links; } } + document.dispatchEvent(new KeyboardEvent("keydown", { key: "Escape", bubbles: true })); + window.dispatchEvent(new KeyboardEvent("keydown", { key: "Escape", bubbles: true })); return links; } """ diff --git a/tests/test_place_scraper.py b/tests/test_place_scraper.py index f1840fb..daa108b 100644 --- a/tests/test_place_scraper.py +++ b/tests/test_place_scraper.py @@ -199,6 +199,8 @@ def test_reservation_dialog_extractor_trusts_provider_popup_without_dialog_role( self.assertIn("continue with", _PLACE_RESERVATION_DIALOG_JS) self.assertIn("about these providers", _PLACE_RESERVATION_DIALOG_JS) self.assertIn("hasTrustedProviderRoot", _PLACE_RESERVATION_DIALOG_JS) + self.assertIn("closeProviderRoot", _PLACE_RESERVATION_DIALOG_JS) + self.assertIn("KeyboardEvent", _PLACE_RESERVATION_DIALOG_JS) def test_collect_reservation_dialog_snapshot_skips_when_no_button(self) -> None: class _FakePage: From 845a07a11ceae97902274cb1d0bb660c9c2d2f63 Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Thu, 11 Jun 2026 21:59:47 +0800 Subject: [PATCH 5/7] Avoid generic booking reservation matches --- src/gmaps_scraper/place_scraper.py | 10 +++++----- tests/test_place_scraper.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/gmaps_scraper/place_scraper.py b/src/gmaps_scraper/place_scraper.py index 7118b0a..5ed6a4a 100644 --- a/src/gmaps_scraper/place_scraper.py +++ b/src/gmaps_scraper/place_scraper.py @@ -1004,7 +1004,7 @@ }; const reservationLabel = (element, href) => { const actionPrefixPattern = new RegExp( - "^(?:find a table|reserve|make a reservation|book(?: a table)?)" + "^(?:find a table|reserve|make a reservation|book a table)" + "(?:\\s+(?:with|on|at|via))?\\s*", "i", ); @@ -1025,7 +1025,7 @@ const links = []; const seen = new Set(); const reservationPattern = new RegExp( - String.raw`\b(find a table|reserve|reservation|book(?: a table)?|booking)\b`, + String.raw`\b(find a table|reserve|reservation|book a table)\b`, "i", ); const providerHostPattern = new RegExp( @@ -1266,7 +1266,7 @@ """ + _PLACE_PANEL_HELPERS_JS + r""" const root = placePanelRoot().root; - const reservationPattern = /\b(find a table|reserve|reservation|book(?: a table)?|booking)\b/i; + const reservationPattern = /\b(find a table|reserve|reservation|book a table)\b/i; const candidates = [ ...root.querySelectorAll("button, div[role='button']"), ]; @@ -1306,7 +1306,7 @@ }; const cleanReservationLabel = (value, href) => { const actionPrefixPattern = new RegExp( - "^(?:find a table|reserve|make a reservation|book(?: a table)?)" + "^(?:find a table|reserve|make a reservation|book a table)" + "(?:\\s+(?:with|on|at|via))?\\s*", "i", ); @@ -4856,7 +4856,7 @@ def _clean_reservation_label(value: object, url: str) -> str: return fallback label = re.sub(r"[\ue000-\uf8ff]", " ", label) label = re.sub( - r"^(?:find a table|reserve|make a reservation|book(?: a table)?)" + r"^(?:find a table|reserve|make a reservation|book a table)" r"(?:\s+(?:with|on|at|via))?\s*", "", label, diff --git a/tests/test_place_scraper.py b/tests/test_place_scraper.py index daa108b..97807f0 100644 --- a/tests/test_place_scraper.py +++ b/tests/test_place_scraper.py @@ -202,6 +202,16 @@ def test_reservation_dialog_extractor_trusts_provider_popup_without_dialog_role( self.assertIn("closeProviderRoot", _PLACE_RESERVATION_DIALOG_JS) self.assertIn("KeyboardEvent", _PLACE_RESERVATION_DIALOG_JS) + def test_reservation_extractors_do_not_match_generic_booking_copy(self) -> None: + for script in ( + _PLACE_JS_EXTRACTOR, + _PLACE_RESERVATION_BUTTON_CLICK_JS, + _PLACE_RESERVATION_DIALOG_JS, + ): + self.assertIn("book a table", script) + self.assertNotIn("|booking", script) + self.assertNotIn("book(?: a table)?", script) + def test_collect_reservation_dialog_snapshot_skips_when_no_button(self) -> None: class _FakePage: def __init__(self) -> None: From eff4af322ad478e047238ce0a7bd220edf362c5c Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Thu, 11 Jun 2026 23:29:39 +0800 Subject: [PATCH 6/7] Tighten reservation link normalization --- src/gmaps_scraper/place_scraper.py | 88 +++++++++++++++++++++--------- tests/test_place_scraper.py | 84 ++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+), 26 deletions(-) diff --git a/src/gmaps_scraper/place_scraper.py b/src/gmaps_scraper/place_scraper.py index 5ed6a4a..44b4b9f 100644 --- a/src/gmaps_scraper/place_scraper.py +++ b/src/gmaps_scraper/place_scraper.py @@ -1029,11 +1029,19 @@ "i", ); const providerHostPattern = new RegExp( - "(opentable|resy|sevenrooms|thefork|tock|quandoo|yelp|inline|" + "(^|[.-])(?:opentable|resy|sevenrooms|thefork|tock|quandoo|yelp|inline|" + "tablecheck|exploretock|omakase|pocket-concierge|pocketconcierge|" - + "tabelog|hotpepper|gnavi|gurunavi|ikyu|jpneazy|byfood|autoreserve)", + + "tabelog|hotpepper|gnavi|gurunavi|ikyu|jpneazy|byfood|autoreserve)" + + "([.-]|$)", "i", ); + const providerHostMatches = (href) => { + try { + return providerHostPattern.test(new URL(href).hostname); + } catch { + return false; + } + }; for (const element of panel.querySelectorAll("a[href]")) { const href = element.href || element.getAttribute("href") || ""; if (!/^https?:\/\//i.test(href) || seen.has(href)) { @@ -1045,9 +1053,8 @@ element.getAttribute("aria-label"), element.getAttribute("title"), element.getAttribute("data-item-id"), - href, ].filter(Boolean).join(" "); - if (!reservationPattern.test(evidence) && !providerHostPattern.test(evidence)) { + if (!reservationPattern.test(evidence) && !providerHostMatches(href)) { continue; } seen.add(href); @@ -1317,11 +1324,19 @@ return cleaned || providerLabelFromUrl(href); }; const providerHostPattern = new RegExp( - "(opentable|resy|sevenrooms|thefork|tock|quandoo|yelp|inline|" + "(^|[.-])(?:opentable|resy|sevenrooms|thefork|tock|quandoo|yelp|inline|" + "tablecheck|exploretock|omakase|pocket-concierge|pocketconcierge|" - + "tabelog|hotpepper|gnavi|gurunavi|ikyu|jpneazy|byfood|autoreserve)", + + "tabelog|hotpepper|gnavi|gurunavi|ikyu|jpneazy|byfood|autoreserve)" + + "([.-]|$)", "i", ); + const providerHostMatches = (href) => { + try { + return providerHostPattern.test(new URL(href).hostname); + } catch { + return false; + } + }; const rejectHostPattern = new RegExp( String.raw`(^|\.)google(?:\.[a-z]{2,}){1,2}$` + String.raw`|(^|\.)gstatic\.com$` @@ -1346,7 +1361,7 @@ }).sort((left, right) => { const leftRect = left.getBoundingClientRect(); const rightRect = right.getBoundingClientRect(); - return (leftRect.width * leftRect.height) - (rightRect.width * rightRect.height); + return (rightRect.width * rightRect.height) - (leftRect.width * leftRect.height); }); const providerRoots = dialogs.length ? dialogs : providerPanels.slice(0, 1); const roots = providerRoots.length ? providerRoots : [document.body]; @@ -1389,11 +1404,10 @@ element.getAttribute("aria-label"), element.getAttribute("title"), ].filter(Boolean).join(" "); - const evidence = `${rawLabel} ${href}`; if (rejectHostPattern.test(host) && !/\/maps\/reserve\b/i.test(href)) { continue; } - if (!hasTrustedProviderRoot && !providerHostPattern.test(evidence)) { + if (!hasTrustedProviderRoot && !providerHostMatches(href)) { continue; } seen.add(href); @@ -2930,11 +2944,11 @@ def _merge_reservation_links( raw_url = _clean_text(raw_link.get("url")) if raw_url is None: continue - url = _normalize_preview_website(raw_url) + url = _normalize_reservation_url(raw_url) if url is None or url in seen_urls: continue seen_urls.add(url) - links.append(raw_link) + links.append({**raw_link, "url": url}) if links: merged["reservation_links"] = links return merged @@ -4144,21 +4158,51 @@ def _normalize_preview_website(value: str) -> str | None: parsed = urlparse(value) if parsed.scheme not in {"http", "https"}: return None - if parsed.netloc.endswith("google.com") or parsed.netloc.endswith("gstatic.com"): + host = (parsed.hostname or "").lower() + if _is_google_host(host) or _host_matches_domain(host, "gstatic.com"): query = parse_qs(parsed.query) target = query.get("q", [None])[0] if target is None: return None return _normalize_preview_website(unquote(target)) - if "googleusercontent.com" in parsed.netloc: + if _host_matches_domain(host, "googleusercontent.com"): return None - if "streetviewpixels-pa.googleapis.com" in parsed.netloc: + if _host_matches_domain(host, "streetviewpixels-pa.googleapis.com"): return None - if parsed.netloc.endswith("inline.app"): + if _host_matches_domain(host, "inline.app"): return None return value +def _host_matches_domain(host: str, domain: str) -> bool: + return host == domain or host.endswith(f".{domain}") + + +def _is_google_host(host: str) -> bool: + return re.search(r"(^|\.)google(?:\.[a-z0-9-]+){1,2}$", host) is not None + + +def _normalize_reservation_url(value: object) -> str | None: + url = _clean_text(value) + if url is None: + return None + parsed = urlparse(url) + if parsed.scheme not in {"http", "https"} or not parsed.netloc: + return None + host = (parsed.hostname or "").lower() + if _is_google_host(host) or _host_matches_domain(host, "gstatic.com"): + query = parse_qs(parsed.query) + target = query.get("q", [None])[0] + if target is not None: + return _normalize_reservation_url(unquote(target)) + return url if _reservation_link_is_google_reserve(url) else None + if _host_matches_domain(host, "googleusercontent.com"): + return None + if _host_matches_domain(host, "streetviewpixels-pa.googleapis.com"): + return None + return url + + def _normalize_photo_url(value: object) -> str | None: normalized = _clean_text(value) if normalized is None: @@ -4824,12 +4868,7 @@ def _normalize_reservation_links(value: object) -> list[PlaceReservationLink]: url = _clean_text(raw_url) if url is None: continue - parsed = urlparse(url) - if parsed.scheme not in {"http", "https"} or not parsed.netloc: - continue - normalized_url = _normalize_preview_website(url) if "google.com" in parsed.netloc else url - if normalized_url is None: - normalized_url = url if parsed.netloc.endswith("google.com") else None + normalized_url = _normalize_reservation_url(url) if normalized_url is None or normalized_url in seen_urls: continue label = _clean_reservation_label(raw_label, normalized_url) @@ -4842,11 +4881,8 @@ def _normalize_reservation_links(value: object) -> list[PlaceReservationLink]: def _reservation_link_is_google_reserve(url: str) -> bool: parsed = urlparse(url) - host = parsed.netloc.lower() - return ( - host in {"www.google.com", "google.com", "maps.google.com"} - and parsed.path.startswith("/maps/reserve") - ) + host = (parsed.hostname or "").lower() + return _is_google_host(host) and parsed.path.startswith("/maps/reserve") def _clean_reservation_label(value: object, url: str) -> str: diff --git a/tests/test_place_scraper.py b/tests/test_place_scraper.py index 97807f0..25a1d12 100644 --- a/tests/test_place_scraper.py +++ b/tests/test_place_scraper.py @@ -200,8 +200,18 @@ def test_reservation_dialog_extractor_trusts_provider_popup_without_dialog_role( self.assertIn("about these providers", _PLACE_RESERVATION_DIALOG_JS) self.assertIn("hasTrustedProviderRoot", _PLACE_RESERVATION_DIALOG_JS) self.assertIn("closeProviderRoot", _PLACE_RESERVATION_DIALOG_JS) + self.assertIn( + "return (rightRect.width * rightRect.height) - (leftRect.width * leftRect.height);", + _PLACE_RESERVATION_DIALOG_JS, + ) self.assertIn("KeyboardEvent", _PLACE_RESERVATION_DIALOG_JS) + def test_reservation_extractors_match_provider_hosts_with_boundaries(self) -> None: + for script in (_PLACE_JS_EXTRACTOR, _PLACE_RESERVATION_DIALOG_JS): + self.assertIn("providerHostMatches", script) + self.assertIn("(^|[.-])(?:opentable|resy|sevenrooms", script) + self.assertNotIn("providerHostPattern.test(evidence)", script) + def test_reservation_extractors_do_not_match_generic_booking_copy(self) -> None: for script in ( _PLACE_JS_EXTRACTOR, @@ -249,6 +259,24 @@ def test_merge_reservation_links_dedupes_overview_and_dialog_links(self) -> None ], ) + def test_merge_reservation_links_stores_normalized_redirect_urls(self) -> None: + merged = _merge_reservation_links( + { + "reservation_links": [ + { + "label": "Inline", + "url": "https://www.google.com:443/url?q=https%3A%2F%2Finline.app%2Fbooking%2Ffoo", + } + ] + }, + {"reservation_links": []}, + ) + + self.assertEqual( + merged["reservation_links"], + [{"label": "Inline", "url": "https://inline.app/booking/foo"}], + ) + def test_scrape_places_reuses_context_and_retries_quality_flags(self) -> None: class _FakeContext: def __init__(self) -> None: @@ -2797,6 +2825,62 @@ def test_normalize_reservation_links_cleans_google_dialog_labels(self) -> None: ], ) + def test_normalize_reservation_links_unwraps_provider_redirects(self) -> None: + links = _normalize_reservation_links( + [ + { + "label": "Find a table Inline", + "url": "https://www.google.com:443/url?q=https%3A%2F%2Finline.app%2Fbooking%2Ffoo", + } + ] + ) + + self.assertEqual( + [link.to_dict() for link in links], + [{"label": "Inline", "url": "https://inline.app/booking/foo"}], + ) + + def test_normalize_reservation_links_drops_google_reserve_cctld(self) -> None: + links = _normalize_reservation_links( + [ + { + "label": "Reserve a table", + "url": "https://www.google.com.sg/maps/reserve/v/dine/c/example", + }, + { + "label": "TableCheck", + "url": "https://www.tablecheck.com/example", + }, + ] + ) + + self.assertEqual( + [link.to_dict() for link in links], + [{"label": "TableCheck", "url": "https://www.tablecheck.com/example"}], + ) + + def test_normalize_reservation_links_ignores_google_substring_hosts(self) -> None: + links = _normalize_reservation_links( + [ + { + "label": "Reserve", + "url": "https://evilgoogle.com/maps/reserve/v/dine/c/example", + }, + { + "label": "TableCheck", + "url": "https://www.tablecheck.com/example", + }, + ] + ) + + self.assertEqual( + [link.url for link in links], + [ + "https://evilgoogle.com/maps/reserve/v/dine/c/example", + "https://www.tablecheck.com/example", + ], + ) + def test_merge_place_sources_only_backfills_missing_fields(self) -> None: merged = _merge_place_sources( { From 2fe4beb11896743194671b4d60ecf602ee466db0 Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Thu, 11 Jun 2026 23:35:48 +0800 Subject: [PATCH 7/7] Require reservation evidence for overview links --- src/gmaps_scraper/place_scraper.py | 16 +--------------- tests/test_place_scraper.py | 9 +++++---- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/src/gmaps_scraper/place_scraper.py b/src/gmaps_scraper/place_scraper.py index 44b4b9f..e1c7382 100644 --- a/src/gmaps_scraper/place_scraper.py +++ b/src/gmaps_scraper/place_scraper.py @@ -1028,20 +1028,6 @@ String.raw`\b(find a table|reserve|reservation|book a table)\b`, "i", ); - const providerHostPattern = new RegExp( - "(^|[.-])(?:opentable|resy|sevenrooms|thefork|tock|quandoo|yelp|inline|" - + "tablecheck|exploretock|omakase|pocket-concierge|pocketconcierge|" - + "tabelog|hotpepper|gnavi|gurunavi|ikyu|jpneazy|byfood|autoreserve)" - + "([.-]|$)", - "i", - ); - const providerHostMatches = (href) => { - try { - return providerHostPattern.test(new URL(href).hostname); - } catch { - return false; - } - }; for (const element of panel.querySelectorAll("a[href]")) { const href = element.href || element.getAttribute("href") || ""; if (!/^https?:\/\//i.test(href) || seen.has(href)) { @@ -1054,7 +1040,7 @@ element.getAttribute("title"), element.getAttribute("data-item-id"), ].filter(Boolean).join(" "); - if (!reservationPattern.test(evidence) && !providerHostMatches(href)) { + if (!reservationPattern.test(evidence)) { continue; } seen.add(href); diff --git a/tests/test_place_scraper.py b/tests/test_place_scraper.py index 25a1d12..3c217b4 100644 --- a/tests/test_place_scraper.py +++ b/tests/test_place_scraper.py @@ -207,10 +207,11 @@ def test_reservation_dialog_extractor_trusts_provider_popup_without_dialog_role( self.assertIn("KeyboardEvent", _PLACE_RESERVATION_DIALOG_JS) def test_reservation_extractors_match_provider_hosts_with_boundaries(self) -> None: - for script in (_PLACE_JS_EXTRACTOR, _PLACE_RESERVATION_DIALOG_JS): - self.assertIn("providerHostMatches", script) - self.assertIn("(^|[.-])(?:opentable|resy|sevenrooms", script) - self.assertNotIn("providerHostPattern.test(evidence)", script) + self.assertNotIn("providerHostMatches", _PLACE_JS_EXTRACTOR) + self.assertIn("if (!reservationPattern.test(evidence))", _PLACE_JS_EXTRACTOR) + self.assertIn("providerHostMatches", _PLACE_RESERVATION_DIALOG_JS) + self.assertIn("(^|[.-])(?:opentable|resy|sevenrooms", _PLACE_RESERVATION_DIALOG_JS) + self.assertNotIn("providerHostPattern.test(evidence)", _PLACE_RESERVATION_DIALOG_JS) def test_reservation_extractors_do_not_match_generic_booking_copy(self) -> None: for script in (