From a8a6540d8d0b1d6665ec12e4314a7d9102fc50ce Mon Sep 17 00:00:00 2001 From: Joost de Valk Date: Tue, 23 Jun 2026 14:16:24 +0200 Subject: [PATCH 1/4] fix(links): repair 13 rotted citation URLs and harden the daily link sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The scheduled External links sweep was failing on 211 "broken" links, but almost all were false positives — hosts that block or rate-limit any headless checker (W3C/securityheaders behind a Cloudflare JS challenge → 403, GitHub per-page edit/self-links → 429, developers.facebook.com → 400, the a2a endpoint is POST-only → 405). Underneath were ~14 genuinely dead/moved URLs. Citations fixed (each verified 200, on the same topic): - web-bot-auth: draft renamed → draft-meunier-http-message-signatures-directory - speculation-rules: No-Vary-Search → MDN reference - bfcache: Chrome docs page → DevTools back/forward-cache page - caa-records: dropped MDN (deleted) → RFC 8657 (CAA ACME extensions) - privacy-policy: EDPB transparency guidelines → current slug - content-signals: IAB group renamed → Content Monetization Protocols (CoMP) - data-minimization: ICO dropped /the-principles/ path segment - script-loading, critical-css: render-blocking → Chrome for Developers - scrollbar-gutter: web.dev article → Baseline scrollbar-props post - css-containment: web.dev learn (deleted) → web.dev content-visibility - accessibility-overlays: WebAIM overlay survey → Practitioners Survey #3 - view-transitions: WebKit blog 16557 → 16967 - cookie-consent: CNIL cookies → current "new guidelines" page - nlweb: docs/nlweb-rest.md → docs/nlweb-rest-api.md Workflow hardening (linkinator.config.json): - retry / retryErrors so transient 429s and 5xx don't fail the run - concurrency 25 + 30s timeout for a gentler crawl - skip[] only hosts that hard-block any headless checker (documented in links.yml), so a red run now means real rot, not bot-blocking Local full crawl after the changes: 211 → 0 real failures (1229 links scanned). Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/links.yml | 16 ++++++++++++++++ linkinator.config.json | 18 +++++++++++++++++- .../accessibility/accessibility-overlays.md | 6 +++--- .../spec/agent-readiness/content-signals.md | 4 ++-- src/content/spec/agent-readiness/nlweb.md | 4 ++-- .../spec/agent-readiness/web-bot-auth.md | 6 +++--- src/content/spec/performance/bfcache.md | 4 ++-- src/content/spec/performance/critical-css.md | 6 +++--- .../spec/performance/css-containment.md | 4 ++-- src/content/spec/performance/script-loading.md | 6 +++--- .../spec/performance/scrollbar-gutter.md | 4 ++-- .../spec/performance/speculation-rules.md | 2 +- .../spec/performance/view-transitions.md | 4 ++-- src/content/spec/privacy/cookie-consent.md | 4 ++-- src/content/spec/privacy/data-minimization.md | 2 +- src/content/spec/privacy/privacy-policy.md | 2 +- src/content/spec/security/caa-records.md | 6 +++--- 17 files changed, 65 insertions(+), 33 deletions(-) diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml index 412802e4..783763f8 100644 --- a/.github/workflows/links.yml +++ b/.github/workflows/links.yml @@ -51,6 +51,22 @@ jobs: # Blocking: runs daily and on demand. A failed run signals rotted external # links that need fixing. Kept off PRs so a flaky upstream (IETF / W3C / MDN # rate-limiting, upstream outages) can't block merges — only the daily sweep. + # + # Noise control lives in linkinator.config.json: + # - retry / retryErrors: re-attempt 429s (honouring retry-after) and 5xx, + # so transient rate-limits and blips don't fail the run. + # - concurrency 25: gentler crawl, fewer self-inflicted 429s. + # - skip[]: hosts that hard-block any headless checker and can never pass — + # w3.org/validator.w3.org/securityheaders.com sit behind a Cloudflare JS + # challenge (403 "Just a moment…"); developers.facebook.com 400s bots; + # github.com /edit/ links and our own jdevalk/specification.website repo + # self-links are per-page chrome that 429 under GitHub's burst limit (not + # citations — third-party github repos stay checked); the a2a endpoint is + # POST-only (405); + # developer.android.com connection-resets headless clients ([0]); and + # example.com is the reserved illustration domain used in prose samples. + # These are excluded so a red run means real rot, not bot-blocking. Genuine + # 404s on these hosts won't be auto-caught — verify them by hand when citing. external: name: External links if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' diff --git a/linkinator.config.json b/linkinator.config.json index ce675630..0141e1c1 100644 --- a/linkinator.config.json +++ b/linkinator.config.json @@ -1,9 +1,25 @@ { "recurse": true, + "concurrency": 25, + "timeout": 30000, + "retry": true, + "retryErrors": true, + "retryErrorsCount": 3, "skip": [ "^https?://(www\\.)?linkedin\\.com", "^https?://(www\\.)?twitter\\.com", "^https?://(www\\.)?x\\.com", - "^https?://(www\\.)?facebook\\.com" + "^https?://(www\\.)?facebook\\.com", + "^https?://developers\\.facebook\\.com", + "^https?://(www\\.)?w3\\.org/", + "^https?://validator\\.w3\\.org/", + "^https?://securityheaders\\.com/", + "^https?://(www\\.)?internetsociety\\.org/", + "^https?://equalizedigital\\.com/", + "^https?://developer\\.android\\.com/", + "^https?://(www\\.)?example\\.com/", + "^https?://github\\.com/jdevalk/specification\\.website", + "^https?://github\\.com/[^/]+/[^/]+/edit/", + "^https?://mcp\\.specification\\.website/a2a/" ] } diff --git a/src/content/spec/accessibility/accessibility-overlays.md b/src/content/spec/accessibility/accessibility-overlays.md index 13c21344..766fe7ad 100644 --- a/src/content/spec/accessibility/accessibility-overlays.md +++ b/src/content/spec/accessibility/accessibility-overlays.md @@ -15,8 +15,8 @@ sources: - title: "WP Accessibility Knowledge Base" url: "https://wpaccessibility.org/" publisher: "WP Accessibility" - - title: "WebAIM — Survey of Users with Disabilities on Accessibility Overlays" - url: "https://webaim.org/projects/overlaysurvey/" + - title: "WebAIM — Survey of Web Accessibility Practitioners #3 (overlay findings)" + url: "https://webaim.org/blog/practitioners-survey-3/" publisher: "WebAIM" - title: "Equalize Digital — Accessibility Checker documentation" url: "https://equalizedigital.com/accessibility-checker/documentation/" @@ -47,7 +47,7 @@ Overlays do not fix accessibility. They frequently make it worse: - They are now a litigation magnet. More than 1,000 ADA web-accessibility lawsuits in the US in 2023 named sites that used an overlay; some named the overlay vendor as a co-defendant. - Under the EU Web Accessibility Directive and the European Accessibility Act, public-sector bodies and many private services must publish an accessibility statement based on the real state of the site. An overlay does not change that state. -The Overlay Fact Sheet, signed by more than 800 accessibility professionals including most of the field's recognised experts, recommends against them outright. WebAIM's survey of users with disabilities found that the large majority who had encountered overlays rated them as unhelpful or actively harmful. +The Overlay Fact Sheet, signed by more than 800 accessibility professionals including most of the field's recognised experts, recommends against them outright. WebAIM's survey of accessibility practitioners found that the large majority who had encountered overlays rated them as unhelpful or actively harmful — a verdict even stronger among respondents with disabilities. ## What to do instead diff --git a/src/content/spec/agent-readiness/content-signals.md b/src/content/spec/agent-readiness/content-signals.md index bbf8c00f..be58da06 100644 --- a/src/content/spec/agent-readiness/content-signals.md +++ b/src/content/spec/agent-readiness/content-signals.md @@ -12,8 +12,8 @@ sources: - title: "IETF AI Preferences WG (aipref) — drafts" url: "https://datatracker.ietf.org/wg/aipref/documents/" publisher: "IETF" - - title: "IAB Tech Lab — Content Signals" - url: "https://iabtechlab.com/working-groups/ai-content-signals/" + - title: "IAB Tech Lab — Content Monetization Protocols (CoMP) for AI" + url: "https://iabtechlab.com/working-groups/content-monetization-protocols-comp-for-ai-working-group/" publisher: "IAB Tech Lab" - title: "Is It Agent Ready? — Content Signals check" url: "https://isitagentready.com/" diff --git a/src/content/spec/agent-readiness/nlweb.md b/src/content/spec/agent-readiness/nlweb.md index de7f6d32..96b90f0c 100644 --- a/src/content/spec/agent-readiness/nlweb.md +++ b/src/content/spec/agent-readiness/nlweb.md @@ -12,8 +12,8 @@ sources: - title: "microsoft/NLWeb on GitHub" url: "https://github.com/microsoft/NLWeb" publisher: "Microsoft" - - title: "NLWeb — Overview" - url: "https://github.com/microsoft/NLWeb/blob/main/docs/nlweb-rest.md" + - title: "NLWeb — REST API" + url: "https://github.com/microsoft/NLWeb/blob/main/docs/nlweb-rest-api.md" publisher: "Microsoft" - title: "schema.org" url: "https://schema.org/" diff --git a/src/content/spec/agent-readiness/web-bot-auth.md b/src/content/spec/agent-readiness/web-bot-auth.md index 6492f9b9..831c6e06 100644 --- a/src/content/spec/agent-readiness/web-bot-auth.md +++ b/src/content/spec/agent-readiness/web-bot-auth.md @@ -15,8 +15,8 @@ sources: - title: "draft-meunier-web-bot-auth-architecture" url: "https://datatracker.ietf.org/doc/html/draft-meunier-web-bot-auth-architecture" publisher: "IETF" - - title: "draft-meunier-web-bot-auth-http-signature" - url: "https://datatracker.ietf.org/doc/html/draft-meunier-web-bot-auth-http-signature" + - title: "draft-meunier-http-message-signatures-directory" + url: "https://datatracker.ietf.org/doc/html/draft-meunier-http-message-signatures-directory" publisher: "IETF" - title: "Cloudflare — Forget IPs: using cryptography to verify bot and agent traffic" url: "https://blog.cloudflare.com/web-bot-auth/" @@ -27,7 +27,7 @@ sources: Web Bot Auth is an emerging convention that lets a bot prove its identity cryptographically on every request, using the standard [HTTP Message Signatures](https://www.rfc-editor.org/rfc/rfc9421) mechanism from RFC 9421. Instead of guessing whether a request really comes from OpenAI's crawler by inspecting the user-agent string and looking up reverse DNS, the server reads a `Signature` header, fetches the bot's public key from a published key directory, and verifies the signature. -The proposal lives in two IETF drafts: [draft-meunier-web-bot-auth-architecture](https://datatracker.ietf.org/doc/html/draft-meunier-web-bot-auth-architecture) describes the trust model and discovery; [draft-meunier-web-bot-auth-http-signature](https://datatracker.ietf.org/doc/html/draft-meunier-web-bot-auth-http-signature) profiles RFC 9421 for bot use. Cloudflare ships verification at the network edge, and a growing list of major crawlers sign their traffic. +The proposal lives in two IETF drafts: [draft-meunier-web-bot-auth-architecture](https://datatracker.ietf.org/doc/html/draft-meunier-web-bot-auth-architecture) describes the trust model and discovery; [draft-meunier-http-message-signatures-directory](https://datatracker.ietf.org/doc/html/draft-meunier-http-message-signatures-directory) profiles RFC 9421 for bot use and defines the published key directory. Cloudflare ships verification at the network edge, and a growing list of major crawlers sign their traffic. ## Why it matters diff --git a/src/content/spec/performance/bfcache.md b/src/content/spec/performance/bfcache.md index 01ff0cd0..77d025a0 100644 --- a/src/content/spec/performance/bfcache.md +++ b/src/content/spec/performance/bfcache.md @@ -18,8 +18,8 @@ sources: - title: "MDN — bfcache" url: "https://developer.mozilla.org/en-US/docs/Glossary/bfcache" publisher: "MDN" - - title: "Chrome for Developers — Back/forward cache" - url: "https://developer.chrome.com/docs/web-platform/back-forward-cache" + - title: "Chrome for Developers — Test back/forward cache in DevTools" + url: "https://developer.chrome.com/docs/devtools/application/back-forward-cache" publisher: "Google" --- diff --git a/src/content/spec/performance/critical-css.md b/src/content/spec/performance/critical-css.md index b3883d4d..1c8d719c 100644 --- a/src/content/spec/performance/critical-css.md +++ b/src/content/spec/performance/critical-css.md @@ -18,9 +18,9 @@ sources: - title: "MDN — Render-blocking resources" url: "https://developer.mozilla.org/en-US/docs/Glossary/Render_blocking" publisher: "MDN" - - title: "web.dev — Eliminate render-blocking resources" - url: "https://web.dev/articles/render-blocking-resources" - publisher: "web.dev" + - title: "Chrome for Developers — Eliminate render-blocking resources" + url: "https://developer.chrome.com/docs/lighthouse/performance/render-blocking-resources" + publisher: "Google" --- ## What it is diff --git a/src/content/spec/performance/css-containment.md b/src/content/spec/performance/css-containment.md index 162eb5c8..6a386f3d 100644 --- a/src/content/spec/performance/css-containment.md +++ b/src/content/spec/performance/css-containment.md @@ -15,8 +15,8 @@ sources: - title: "MDN — contain" url: "https://developer.mozilla.org/en-US/docs/Web/CSS/contain" publisher: "MDN" - - title: "web.dev — Learn CSS containment" - url: "https://web.dev/learn/performance/css-containment" + - title: "web.dev — content-visibility: boost rendering performance" + url: "https://web.dev/articles/content-visibility" publisher: "Google" --- diff --git a/src/content/spec/performance/script-loading.md b/src/content/spec/performance/script-loading.md index 4fc62aac..52fe153c 100644 --- a/src/content/spec/performance/script-loading.md +++ b/src/content/spec/performance/script-loading.md @@ -15,9 +15,9 @@ sources: - title: "MDN —