diff --git a/links/check.go b/links/check.go
index 8e7ff5a..df8d0ba 100644
--- a/links/check.go
+++ b/links/check.go
@@ -75,6 +75,7 @@ func checkHTTPLink(rctx types.ResultContext, client *http.Client, url string) ty
 		return rctx.Errorf("%s (invalid URL: %v)", url, err)
 	}
 	req.Header.Set("User-Agent", "skill-validator/1.0")
+	req.Header.Set("Accept", "text/html, */*;q=0.1")
 
 	resp, err := client.Do(req)
 	if err != nil {
@@ -82,14 +83,48 @@ func checkHTTPLink(rctx types.ResultContext, client *http.Client, url string) ty
 	}
 	defer func() { _ = resp.Body.Close() }()
 
-	if resp.StatusCode >= 200 && resp.StatusCode < 300 {
-		return rctx.Passf("%s (HTTP %d)", url, resp.StatusCode)
+	// Some sites don't handle HEAD correctly (e.g. SPAs like crates.io return
+	// 404 for HEAD even though the page exists). Fall back to GET when HEAD
+	// returns 404 or 405, which is the standard approach used by lychee,
+	// markdown-link-check, and other link validators.
+	if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusMethodNotAllowed {
+		return checkHTTPLinkGET(rctx, client, url)
 	}
-	if resp.StatusCode >= 300 && resp.StatusCode < 400 {
-		return rctx.Passf("%s (HTTP %d redirect)", url, resp.StatusCode)
+
+	return classifyResponse(rctx, url, resp.StatusCode)
+}
+
+// checkHTTPLinkGET issues a GET request for url with the same User-Agent and
+// Accept headers as the HEAD probe and classifies the response status code.
+// It is the fallback used when the HEAD probe returns 404 or 405.
+func checkHTTPLinkGET(rctx types.ResultContext, client *http.Client, url string) types.Result {
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		return rctx.Errorf("%s (invalid URL: %v)", url, err)
+	}
+	req.Header.Set("User-Agent", "skill-validator/1.0")
+	req.Header.Set("Accept", "text/html, */*;q=0.1")
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return rctx.Errorf("%s (request failed: %v)", url, err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	return classifyResponse(rctx, url, resp.StatusCode)
+}
+
+// classifyResponse maps an HTTP status code to a result: 2xx and 3xx pass,
+// 403 is reported as informational, and any other status is an error.
+func classifyResponse(rctx types.ResultContext, url string, statusCode int) types.Result {
+	if statusCode >= 200 && statusCode < 300 {
+		return rctx.Passf("%s (HTTP %d)", url, statusCode)
+	}
+	if statusCode >= 300 && statusCode < 400 {
+		return rctx.Passf("%s (HTTP %d redirect)", url, statusCode)
 	}
-	if resp.StatusCode == http.StatusForbidden {
+	if statusCode == http.StatusForbidden {
 		return rctx.Infof("%s (HTTP 403 — may block automated requests)", url)
 	}
-	return rctx.Errorf("%s (HTTP %d)", url, resp.StatusCode)
+	return rctx.Errorf("%s (HTTP %d)", url, statusCode)
 }
diff --git a/links/check_test.go b/links/check_test.go
index ed9f3a1..3d05a72 100644
--- a/links/check_test.go
+++ b/links/check_test.go
@@ -98,6 +98,20 @@ func TestCheckLinks_HTTP(t *testing.T) {
 	mux.HandleFunc("/not-found", func(w http.ResponseWriter, r *http.Request) {
 		w.WriteHeader(http.StatusNotFound)
 	})
+	mux.HandleFunc("/head-404-get-200", func(w http.ResponseWriter, r *http.Request) {
+		if r.Method == http.MethodHead {
+			w.WriteHeader(http.StatusNotFound)
+			return
+		}
+		w.WriteHeader(http.StatusOK)
+	})
+	mux.HandleFunc("/head-405-get-200", func(w http.ResponseWriter, r *http.Request) {
+		if r.Method == http.MethodHead {
+			w.WriteHeader(http.StatusMethodNotAllowed)
+			return
+		}
+		w.WriteHeader(http.StatusOK)
+	})
 	mux.HandleFunc("/forbidden", func(w http.ResponseWriter, r *http.Request) {
 		w.WriteHeader(http.StatusForbidden)
 	})
@@ -135,6 +149,20 @@ func TestCheckLinks_HTTP(t *testing.T) {
 		requireResultContaining(t, results, types.Error, "HTTP 500")
 	})
 
+	t.Run("HEAD 404 falls back to GET 200", func(t *testing.T) {
+		dir := t.TempDir()
+		body := "[spa](" + server.URL + "/head-404-get-200)"
+		results := CheckLinks(t.Context(), dir, body)
+		requireResultContaining(t, results, types.Pass, "HTTP 200")
+	})
+
+	t.Run("HEAD 405 falls back to GET 200", func(t *testing.T) {
+		dir := t.TempDir()
+		body := "[nohead](" + server.URL + "/head-405-get-200)"
+		results := CheckLinks(t.Context(), dir, body)
+		requireResultContaining(t, results, types.Pass, "HTTP 200")
+	})
+
 	t.Run("mixed relative and HTTP only checks HTTP", func(t *testing.T) {
 		dir := t.TempDir()
 		writeFile(t, dir, "references/guide.md", "content")
@@ -225,6 +253,70 @@ func TestCheckHTTPLink(t *testing.T) {
 		requireContains(t, result.Message, "HTTP 403")
 	})
 
+	t.Run("HEAD 404 retries with GET", func(t *testing.T) {
+		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.Method == http.MethodHead {
+				w.WriteHeader(http.StatusNotFound)
+				return
+			}
+			w.WriteHeader(http.StatusOK)
+		}))
+		defer server.Close()
+
+		result := checkHTTPLink(types.ResultContext{Category: "Links", File: "SKILL.md"}, client, server.URL)
+		if result.Level != types.Pass {
+			t.Errorf("expected Pass after GET fallback, got level=%d message=%q", result.Level, result.Message)
+		}
+	})
+
+	t.Run("HEAD 405 retries with GET", func(t *testing.T) {
+		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.Method == http.MethodHead {
+				w.WriteHeader(http.StatusMethodNotAllowed)
+				return
+			}
+			w.WriteHeader(http.StatusOK)
+		}))
+		defer server.Close()
+
+		result := checkHTTPLink(types.ResultContext{Category: "Links", File: "SKILL.md"}, client, server.URL)
+		if result.Level != types.Pass {
+			t.Errorf("expected Pass after GET fallback, got level=%d message=%q", result.Level, result.Message)
+		}
+	})
+
+	t.Run("SPA requiring Accept text/html resolves via GET fallback", func(t *testing.T) {
+		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.Header.Get("Accept") == "" || r.Method == http.MethodHead {
+				w.WriteHeader(http.StatusNotFound)
+				return
+			}
+			if strings.Contains(r.Header.Get("Accept"), "text/html") {
+				w.WriteHeader(http.StatusOK)
+				return
+			}
+			w.WriteHeader(http.StatusNotFound)
+		}))
+		defer server.Close()
+
+		result := checkHTTPLink(types.ResultContext{Category: "Links", File: "SKILL.md"}, client, server.URL)
+		if result.Level != types.Pass {
+			t.Errorf("expected Pass for SPA with Accept header, got level=%d message=%q", result.Level, result.Message)
+		}
+	})
+
+	t.Run("genuine 404 still errors after GET fallback", func(t *testing.T) {
+		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.WriteHeader(http.StatusNotFound)
+		}))
+		defer server.Close()
+
+		result := checkHTTPLink(types.ResultContext{Category: "Links", File: "SKILL.md"}, client, server.URL)
+		if result.Level != types.Error {
+			t.Errorf("expected Error for genuine 404, got level=%d message=%q", result.Level, result.Message)
+		}
+	})
+
 	t.Run("invalid URL", func(t *testing.T) {
 		result := checkHTTPLink(types.ResultContext{Category: "Links", File: "SKILL.md"}, client, "http://invalid host with spaces/")
 		if result.Level != types.Error {