From 0dc28eea75f65c5a738cf8651c9c7f7c1e962f3f Mon Sep 17 00:00:00 2001 From: Builder Date: Thu, 14 May 2026 07:08:59 +0000 Subject: [PATCH] Align search URLs with upstream sites --- docs/search-url-realism.md | 53 ++++++ scripts/check_search_url_realism.py | 152 ++++++++++++++++++ sites/amazon/app.py | 3 +- sites/amazon/templates/base.html | 4 +- sites/amazon/templates/search.html | 8 +- sites/apple/app.py | 5 +- sites/apple/static/js/main.js | 16 ++ sites/apple/templates/search.html | 2 +- sites/booking/app.py | 3 +- sites/booking/templates/airport_taxis.html | 4 +- sites/booking/templates/attractions.html | 4 +- sites/booking/templates/car_rentals.html | 4 +- sites/booking/templates/flights.html | 4 +- sites/booking/templates/index.html | 4 +- sites/booking/templates/search.html | 4 +- sites/cambridge_dictionary/app.py | 3 + .../cambridge_dictionary/templates/base.html | 3 +- .../templates/thesaurus_index.html | 3 +- sites/coursera/app.py | 2 +- sites/coursera/templates/base.html | 28 ++-- sites/coursera/templates/search.html | 16 +- sites/espn/app.py | 5 +- sites/espn/static/js/main.js | 13 ++ sites/espn/templates/search.html | 2 +- sites/google_map/app.py | 6 +- sites/google_map/static/js/main.js | 13 ++ sites/google_map/templates/_search_bar.html | 2 +- sites/google_map/templates/search.html | 2 +- sites/huggingface/app.py | 1 + sites/huggingface/templates/base.html | 2 +- sites/huggingface/templates/search.html | 2 +- 31 files changed, 316 insertions(+), 57 deletions(-) create mode 100644 docs/search-url-realism.md create mode 100644 scripts/check_search_url_realism.py diff --git a/docs/search-url-realism.md b/docs/search-url-realism.md new file mode 100644 index 0000000..fc1a2de --- /dev/null +++ b/docs/search-url-realism.md @@ -0,0 +1,53 @@ +# Search URL Realism + +## Policy + +Search forms should emit the URL shape used by the real upstream site whenever +that shape is known. 
Legacy `/search?q=...` routes remain as compatibility +aliases for existing benchmark tasks, hand-written trajectories, and old links. + +This keeps the user-visible behavior realistic without breaking existing local +WebHarbor consumers. + +## Canonical Search URLs + +| Site | Canonical URL | Legacy alias | +| --- | --- | --- | +| Amazon | `/s?k=` | `/search?q=` | +| Booking | `/searchresults.html?ss=` | `/search?q=` | +| Google Maps | `/maps/search/` | `/search?q=` | +| ESPN | `/search/_/q/` | `/search?q=` | +| Apple | `/search/` | `/search?q=` | +| Coursera | `/search?query=` | `/search?q=` | +| Hugging Face | `/search/full-text?q=` | `/search?q=` | +| Cambridge Dictionary | `/search/direct/?datasetsearch=english&q=` | `/search?q=` | +| Cambridge Thesaurus | `/search/english-thesaurus/direct/?datasetsearch=english-thesaurus&q=` | `/thesaurus?q=` | + +Some sites already matched their upstream search shape closely and are +documented rather than changed here: + +- Google Search: `/search?q=` plus vertical parameters such as `tbm=...`. +- GitHub: `/search?q=&type=...`. +- BBC: `/search?q=`. +- arXiv: `/search/?query=&searchtype=...`. +- WolframAlpha: `/input?i=` for computation and `/search?q=...` for + topic search. +- Google Flights: primary flight searches already use `/flights?...`; the + generic `/search?q=...` page is a local airport/city/airline helper. + +HTML forms can only submit query-string values, so path-based canonical search +URLs use a small submit handler that rewrites the destination before navigation. +If JavaScript is unavailable, the route still accepts the form-submitted query +string at the same canonical prefix where possible, and the old alias remains +available. + +## Regression Check + +Run: + +```bash +python3 scripts/check_search_url_realism.py +``` + +The check scans each site's templates, scripts, and app source for the canonical +search actions, parameters, and routes; it does not verify the legacy aliases. 
diff --git a/scripts/check_search_url_realism.py b/scripts/check_search_url_realism.py new file mode 100644 index 0000000..e36802f --- /dev/null +++ b/scripts/check_search_url_realism.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +"""Regression checks for realistic search URL shapes.""" + +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] + +REQUIRED = [ + ( + "Amazon supports canonical /s?k= search", + "sites/amazon/app.py", + ["@app.route('/s')", "request.args.get('k')"], + ), + ( + "Amazon search UI emits k= on /s", + "sites/amazon/templates/base.html", + ['action="/s"', 'name="k"'], + ), + ( + "Booking supports canonical /searchresults.html?ss=", + "sites/booking/app.py", + ["@app.route('/searchresults.html')", "request.args.get('ss')"], + ), + ( + "Booking search UI emits ss=", + "sites/booking/templates/index.html", + ['action="/searchresults.html"', 'name="ss"'], + ), + ( + "Google Maps supports canonical /maps/search/", + "sites/google_map/app.py", + ['@app.route("/maps/search/")', '@app.route("/maps/search/")'], + ), + ( + "Google Maps search UI emits path search", + "sites/google_map/templates/_search_bar.html", + ['action="/maps/search/"', 'data-path-search="maps"'], + ), + ( + "Google Maps path-search submit handler exists", + "sites/google_map/static/js/main.js", + ['form[data-path-search="maps"]', "'/maps/search/' + encodeURIComponent(query)"], + ), + ( + "ESPN supports canonical /search/_/q/", + "sites/espn/app.py", + ["@app.route('/search/_/q/')"], + ), + ( + "ESPN search UI emits path search", + "sites/espn/templates/search.html", + ['action="/search/_/q/"', 'data-path-search="espn"'], + ), + ( + "ESPN path-search submit handler exists", + "sites/espn/static/js/main.js", + ['form[data-path-search="espn"]', "'/search/_/q/' + encodeURIComponent(query)"], + ), + ( + "Apple supports canonical /search/", + "sites/apple/app.py", + ["@app.route('/search/')"], + ), + ( + "Apple search UI emits path search", + 
"sites/apple/templates/search.html", + ['action="/search/"', 'data-path-search="apple"'], + ), + ( + "Apple path-search submit handler exists", + "sites/apple/static/js/main.js", + ['form[data-path-search="apple"]', "'/search/' + encodeURIComponent(query)"], + ), + ( + "Coursera supports canonical query= search", + "sites/coursera/app.py", + ["request.args.get('query')"], + ), + ( + "Coursera search UI emits query=", + "sites/coursera/templates/base.html", + ['action="/search"', 'name="query"'], + ), + ( + "Hugging Face supports canonical full-text search", + "sites/huggingface/app.py", + ['@app.route("/search/full-text")'], + ), + ( + "Hugging Face search UI emits full-text path", + "sites/huggingface/templates/base.html", + ['action="/search/full-text"', 'name="q"'], + ), + ( + "Cambridge Dictionary supports direct search path", + "sites/cambridge_dictionary/app.py", + ["@app.route('/search/direct/')", "@app.route('/search/english/direct/')"], + ), + ( + "Cambridge Dictionary search UI emits direct path", + "sites/cambridge_dictionary/templates/base.html", + [ + 'action="{{ \'/search/english-thesaurus/direct/\' if _is_thes else \'/search/direct/\' }}"', + 'name="datasetsearch"', + "'english-thesaurus' if _is_thes else 'english'", + ], + ), +] + +FORBIDDEN = [ + ( + "Amazon nav should not emit legacy q search", + "sites/amazon/templates/base.html", + ['action="{{ url_for(\'search\') }}"', 'name="q" placeholder="Search Amazon"'], + ), + ( + "Booking homepage should not emit legacy q search", + "sites/booking/templates/index.html", + ['action="{{ url_for(\'search\') }}"', 'name="q" placeholder="Where are you going?"'], + ), + ( + "Coursera nav should not emit legacy q search", + "sites/coursera/templates/base.html", + ['name="q" placeholder="What do you want to learn?"'], + ), +] + + +def main(): + failed = False + for label, rel, needles in REQUIRED: + text = (ROOT / rel).read_text() + for needle in needles: + if needle not in text: + print(f"{label}: missing 
{needle!r} in {rel}") + failed = True + + for label, rel, needles in FORBIDDEN: + text = (ROOT / rel).read_text() + for needle in needles: + if needle in text: + print(f"{label}: forbidden {needle!r} in {rel}") + failed = True + + if failed: + raise SystemExit(1) + print("Search URL realism checks passed") + + +if __name__ == "__main__": + main() diff --git a/sites/amazon/app.py b/sites/amazon/app.py index 7a7b52a..66b36bc 100644 --- a/sites/amazon/app.py +++ b/sites/amazon/app.py @@ -646,8 +646,9 @@ def _match(t): @app.route('/search') +@app.route('/s') def search(): - q = request.args.get('q', '').strip() + q = (request.args.get('q') or request.args.get('k') or '').strip() query_obj = Product.query query_obj = _apply_filters(query_obj) # apply structural filters first candidates = query_obj.all() diff --git a/sites/amazon/templates/base.html b/sites/amazon/templates/base.html index 2568eca..d24520d 100644 --- a/sites/amazon/templates/base.html +++ b/sites/amazon/templates/base.html @@ -24,7 +24,7 @@ - diff --git a/sites/amazon/templates/search.html b/sites/amazon/templates/search.html index 7083273..7a6f82e 100644 --- a/sites/amazon/templates/search.html +++ b/sites/amazon/templates/search.html @@ -5,8 +5,8 @@ {% set qs = request.args %}
diff --git a/sites/espn/app.py b/sites/espn/app.py index b1915b1..44ee0e9 100644 --- a/sites/espn/app.py +++ b/sites/espn/app.py @@ -965,8 +965,9 @@ def article(slug): # ─── Routes: Search ─────────────────────────────────────────────────────────── @app.route('/search') -def search(): - q = request.args.get('q', '').strip() +@app.route('/search/_/q/') +def search(espn_query=''): + q = (espn_query or request.args.get('q', '')).strip() sport_filter = request.args.get('sport', '') type_filter = request.args.get('type', '') # teams, players, articles diff --git a/sites/espn/static/js/main.js b/sites/espn/static/js/main.js index ba8104e..76ff81a 100644 --- a/sites/espn/static/js/main.js +++ b/sites/espn/static/js/main.js @@ -1,6 +1,19 @@ /* ESPN Mirror - Main JavaScript */ document.addEventListener('DOMContentLoaded', function () { + document.querySelectorAll('form[data-path-search="espn"]').forEach(function (form) { + form.addEventListener('submit', function (event) { + const input = form.querySelector('input[name="q"]'); + const query = input ? input.value.trim() : ''; + if (!query) return; + event.preventDefault(); + const params = new URLSearchParams(new FormData(form)); + params.delete('q'); + const suffix = params.toString(); + window.location.href = '/search/_/q/' + encodeURIComponent(query) + (suffix ? '?' + suffix : ''); + }); + }); + // Mobile navigation toggle const menuToggle = document.querySelector('.mobile-menu-toggle'); const navLinks = document.querySelector('.nav-links'); diff --git a/sites/espn/templates/search.html b/sites/espn/templates/search.html index 25967f2..1e86eb7 100644 --- a/sites/espn/templates/search.html +++ b/sites/espn/templates/search.html @@ -7,7 +7,7 @@

Search ESPN

- +