From 7f5e8c9c5fe4521149fe8c0313363226fcdded70 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
<161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Fri, 13 Feb 2026 07:22:27 +0000
Subject: [PATCH] Refactor: modularize API handler into internal packages
Decomposed the monolithic `api/index.go` into domain-specific internal packages to improve maintainability, testability, and separation of concerns.
- `internal/transport`: HTTP client configuration and SSRF protection (`NewSafeClient`).
- `internal/article`: Article fetching and parsing logic (`Fetch`).
- `internal/request`: Request parsing, validation, and URL reconstruction (`NormalizeURL`, `ReconstructURL`, `GetFormat`).
- `internal/formatter`: Output formatting and rendering (`Render`, `Template`).
- Updated `api/index.go` to serve as a clean orchestration layer using these packages.
- Migrated and split tests into their respective package test files.
- Added integration test in `api/index_test.go` to verify wiring.
This refactoring adheres to the Single Responsibility Principle and makes the codebase easier to navigate and extend.
Co-authored-by: lucasew <15693688+lucasew@users.noreply.github.com>
---
api/index.go | 444 +-----------------------------
api/index_test.go | 123 +--------
api/llm_test.go | 59 ----
api/reconstruct_test.go | 103 -------
internal/article/fetch.go | 100 +++++++
internal/article/fetch_test.go | 47 ++++
internal/formatter/render.go | 109 ++++++++
internal/request/utils.go | 155 +++++++++++
internal/request/utils_test.go | 170 ++++++++++++
internal/transport/client.go | 64 +++++
internal/transport/client_test.go | 54 ++++
11 files changed, 719 insertions(+), 709 deletions(-)
delete mode 100644 api/llm_test.go
delete mode 100644 api/reconstruct_test.go
create mode 100644 internal/article/fetch.go
create mode 100644 internal/article/fetch_test.go
create mode 100644 internal/formatter/render.go
create mode 100644 internal/request/utils.go
create mode 100644 internal/request/utils_test.go
create mode 100644 internal/transport/client.go
create mode 100644 internal/transport/client_test.go
diff --git a/api/index.go b/api/index.go
index 378c310..7f2ab57 100644
--- a/api/index.go
+++ b/api/index.go
@@ -11,261 +11,20 @@ import (
"bytes"
"context"
"encoding/json"
- "errors"
- "fmt"
- "html/template"
- "io"
"log"
- "math/rand"
- "net"
"net/http"
- "net/url"
- "strings"
- "syscall"
"time"
- "codeberg.org/readeck/go-readability/v2"
- "github.com/mattn/godown"
- "golang.org/x/net/html"
+ "github.com/lucasew/readability-web/internal/article"
+ "github.com/lucasew/readability-web/internal/formatter"
+ "github.com/lucasew/readability-web/internal/request"
+ "github.com/lucasew/readability-web/internal/transport"
)
const (
- maxRedirects = 5
- httpClientTimeout = 10 * time.Second
- maxBodySize = int64(2 * 1024 * 1024) // 2 MiB
- dialerTimeout = 30 * time.Second
- dialerKeepAlive = 30 * time.Second
- handlerTimeout = 5 * time.Second
+ handlerTimeout = 5 * time.Second
)
-/**
- * Template is the raw HTML template string used for rendering the article.
- *
- * It provides a minimal HTML5 structure and includes the Sakura CSS library
- * for a clean, typography-focused reading experience without distractions.
- * The template expects a struct with Title and Content fields.
- */
-const Template = `
-
-
-
-
-
-
-
-
-
- {{.Title}}
- {{.Content}}
-
-
-`
-
-var (
- /**
- * DefaultTemplate is the parsed Go template instance.
- *
- * It is initialized at startup to avoid the overhead of parsing the template
- * on every request, ensuring faster response times.
- */
- DefaultTemplate = template.Must(template.New("article").Parse(Template))
-
- /**
- * ReadabilityParser is the shared instance of the readability parser.
- *
- * It is reusable and thread-safe, allowing concurrent processing of multiple
- * requests without the need to create new parser instances.
- */
- ReadabilityParser = readability.NewParser()
-
- // httpClient used for fetching remote articles with timeouts and redirect policy
- httpClient = &http.Client{
- Transport: &http.Transport{
- DialContext: newSafeDialer().DialContext,
- },
- Timeout: httpClientTimeout,
- CheckRedirect: func(_ *http.Request, via []*http.Request) error {
- if len(via) >= maxRedirects {
- return fmt.Errorf("stopped after %d redirects", maxRedirects)
- }
- return nil
- },
- }
-)
-
-/**
- * newSafeDialer creates a custom net.Dialer that prevents Server-Side Request Forgery (SSRF).
- *
- * It validates the resolved IP address before connecting, ensuring that it is not:
- * - A private network address (e.g., 192.168.x.x, 10.x.x.x)
- * - A loopback address (e.g., 127.0.0.1)
- * - An unspecified address (e.g., 0.0.0.0)
- *
- * This validation happens *after* DNS resolution but *before* the connection is established.
- * This prevents Time-of-Check Time-of-Use (TOCTOU) attacks where a domain could
- * resolve to a safe IP during check but switch to a private IP during connection.
- *
- * This is critical for preventing the application from accessing internal services or metadata services
- * (like AWS EC2 metadata) running on the same network.
- */
-func newSafeDialer() *net.Dialer {
- dialer := &net.Dialer{
- Timeout: dialerTimeout,
- KeepAlive: dialerKeepAlive,
- Control: func(_, address string, _ syscall.RawConn) error {
- host, _, err := net.SplitHostPort(address)
- if err != nil {
- return err
- }
- ips, err := net.LookupIP(host)
- if err != nil {
- return err
- }
- for _, ip := range ips {
- if ip.IsPrivate() || ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsLinkLocalMulticast() || ip.IsUnspecified() {
- return errors.New("refusing to connect to private network address")
- }
- }
- return nil
- },
- }
- return dialer
-}
-
-/**
- * userAgentPool contains a list of real browser User-Agent strings.
- *
- * We rotate through these to mimic legitimate traffic, as many websites block requests
- * from default HTTP clients (like Go-http-client) or known bot User-Agents.
- * This list requires periodic maintenance to stay current with browser versions.
- */
-var userAgentPool = []string{
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
- "Mozilla/5.0 (iPhone; CPU iPhone OS 18_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 Mobile/15E148 Safari/604.1",
-}
-
-/**
- * llmUserAgents contains a list of substring identifiers for known LLM bots and crawlers.
- *
- * This list is used to detect requests from AI agents (like GPTBot, Claude, etc.)
- * so the application can automatically serve a token-efficient format (Markdown)
- * instead of full HTML.
- */
-var llmUserAgents = []string{
- "gptbot",
- "chatgpt",
- "claude",
- "googlebot",
- "bingbot",
- "anthropic",
- "perplexity",
- "claudebot",
- "github-copilot",
-}
-
-/**
- * getRandomUserAgent returns a random User-Agent string from the pool.
- *
- * Rotating User-Agents helps to evade simple anti-bot measures that block requests
- * based on static or default Go HTTP client User-Agents.
- */
-func getRandomUserAgent() string {
- return userAgentPool[rand.Intn(len(userAgentPool))]
-}
-
-/**
- * fetchAndParse retrieves the content from the target URL and parses it using the readability library.
- *
- * Key behaviors:
- * - Spoofs User-Agent and other browser headers to avoid blocking.
- * - Forwards Accept-Language from the client to respect language preferences.
- * - Sets security headers (Sec-Fetch-*) to look like a navigation request.
- * - Limits the response body size to maxBodySize to prevent Out-Of-Memory (OOM) crashes on large pages.
- * - Uses a custom httpClient with SSRF protection.
- */
-func fetchAndParse(ctx context.Context, link *url.URL, r *http.Request) (readability.Article, error) {
- req, err := http.NewRequestWithContext(ctx, "GET", link.String(), nil)
- if err != nil {
- return readability.Article{}, err
- }
-
- // Always spoof everything to look like a real browser
- ua := getRandomUserAgent()
- req.Header.Set("User-Agent", ua)
- req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8")
-
- // Fallback headers from client request
- if lang := r.Header.Get("Accept-Language"); lang != "" {
- req.Header.Set("Accept-Language", lang)
- } else {
- req.Header.Set("Accept-Language", "en-US,en;q=0.9")
- }
-
- req.Header.Set("Cache-Control", "no-cache")
- req.Header.Set("Pragma", "no-cache")
- req.Header.Set("Sec-Ch-Ua-Mobile", "?0")
- req.Header.Set("Sec-Fetch-Dest", "document")
- req.Header.Set("Sec-Fetch-Mode", "navigate")
- req.Header.Set("Sec-Fetch-Site", "none")
- req.Header.Set("Sec-Fetch-User", "?1")
- req.Header.Set("Upgrade-Insecure-Requests", "1")
-
- res, err := httpClient.Do(req)
- if err != nil {
- return readability.Article{}, err
- }
- defer res.Body.Close()
-
- // limit body size to prevent OOM
- reader := io.LimitReader(res.Body, maxBodySize)
- node, err := html.Parse(reader)
- if err != nil {
- return readability.Article{}, err
- }
-
- return ReadabilityParser.ParseDocument(node, link)
-}
-
-/**
- * normalizeAndValidateURL cleans and validates the user-provided URL.
- *
- * It handles common normalization issues, such as:
- * - Missing scheme (defaults to https://).
- * - Malformed schemes caused by some proxies (e.g., http:/example.com -> http://example.com).
- *
- * It also restricts the scheme to 'http' or 'https' to prevent usage of other protocols like 'file://' or 'gopher://'.
- */
-func normalizeAndValidateURL(rawLink string) (*url.URL, error) {
- if rawLink == "" {
- return nil, errors.New("url parameter is empty")
- }
-
- // Fix browser/proxy normalization of :// to :/
- if strings.HasPrefix(rawLink, "http:/") && !strings.HasPrefix(rawLink, "http://") {
- rawLink = "http://" + rawLink[6:]
- } else if strings.HasPrefix(rawLink, "https:/") && !strings.HasPrefix(rawLink, "https://") {
- rawLink = "https://" + rawLink[7:]
- }
-
- // add scheme if missing
- if !strings.Contains(rawLink, "://") {
- // default to https if no scheme provided
- rawLink = fmt.Sprintf("https://%s", rawLink)
- }
- link, err := url.Parse(rawLink)
- if err != nil {
- return nil, fmt.Errorf("invalid URL: %w", err)
- }
- // only allow http(s)
- if link.Scheme != "http" && link.Scheme != "https" {
- return nil, errors.New("unsupported URL scheme")
- }
- return link, nil
-}
-
/**
* securityHeadersMiddleware applies a baseline of security headers to every response.
*
@@ -300,183 +59,6 @@ func Handler(w http.ResponseWriter, r *http.Request) {
securityHeadersMiddleware(http.HandlerFunc(handler)).ServeHTTP(w, r)
}
-/**
- * formatHandler defines the function signature for handling different output formats.
- *
- * Implementations are responsible for:
- * 1. Setting the appropriate Content-Type header.
- * 2. Encoding the article content (HTML, JSON, Markdown, etc.) into the response writer.
- * 3. Handling any encoding errors (logging them, as headers are already written).
- */
-type formatHandler func(w http.ResponseWriter, article readability.Article, buf *bytes.Buffer)
-
-/**
- * formatHTML renders the article using the standard HTML template.
- * This is the default view for human consumption.
- */
-func formatHTML(w http.ResponseWriter, article readability.Article, contentBuf *bytes.Buffer) {
- w.Header().Set("Content-Type", "text/html; charset=utf-8")
- // inject safe HTML content
- data := struct {
- Title string
- Content template.HTML
- }{
- Title: article.Title(),
- Content: template.HTML(contentBuf.String()),
- }
- if err := DefaultTemplate.Execute(w, data); err != nil {
- // at this point, we can't write a JSON error, so we log it
- log.Printf("error executing HTML template: %v", err)
- }
-}
-
-/**
- * formatMarkdown converts the article content to Markdown.
- * Useful for LLMs or note-taking applications.
- */
-func formatMarkdown(w http.ResponseWriter, _ readability.Article, buf *bytes.Buffer) {
- w.Header().Set("Content-Type", "text/markdown")
- if err := godown.Convert(w, buf, nil); err != nil {
- log.Printf("error converting to markdown: %v", err)
- }
-}
-
-/**
- * formatJSON returns the raw title and HTML content in a JSON object.
- * Useful for programmatic consumption where the client wants to handle rendering.
- */
-func formatJSON(w http.ResponseWriter, article readability.Article, buf *bytes.Buffer) {
- w.Header().Set("Content-Type", "application/json")
- if err := json.NewEncoder(w).Encode(map[string]string{
- "title": article.Title(),
- "content": buf.String(),
- }); err != nil {
- log.Printf("error encoding json: %v", err)
- }
-}
-
-/**
- * formatText returns the plain text content, stripped of HTML tags.
- */
-func formatText(w http.ResponseWriter, _ readability.Article, buf *bytes.Buffer) {
- w.Header().Set("Content-Type", "text/plain; charset=utf-8")
- if _, err := w.Write(buf.Bytes()); err != nil {
- log.Printf("error writing text response: %v", err)
- }
-}
-
-/**
- * formatters maps format names (including aliases) to their respective handler functions.
- *
- * This design allows for easy extensibility of output formats. New formats can be
- * added by implementing a formatHandler and registering it here.
- */
-var formatters = map[string]formatHandler{
- "html": formatHTML,
- "md": formatMarkdown,
- "markdown": formatMarkdown,
- "json": formatJSON,
- "text": formatText,
- "txt": formatText,
-}
-
-/**
- * isLLM attempts to detect if the request is originated from a known LLM crawler or tool.
- *
- * It checks the User-Agent string against a list of known identifiers (e.g., GPTBot, Claude).
- * This allows the application to default to a machine-friendly format (Markdown) automatically.
- */
-func isLLM(r *http.Request) bool {
- ua := strings.ToLower(r.UserAgent())
- for _, s := range llmUserAgents {
- if strings.Contains(ua, s) {
- return true
- }
- }
- return false
-}
-
-/**
- * getFormat determines the desired output format based on request signals.
- *
- * Priority order:
- * 1. Query parameter 'format' (explicit override).
- * 2. Accept Header (content negotiation).
- * 3. LLM Detection (auto-switch to Markdown for bots).
- * 4. Default to 'html'.
- */
-func getFormat(r *http.Request) string {
- // 1. Priority: Query parameter
- format := r.URL.Query().Get("format")
- if format != "" {
- return format
- }
-
- // 2. Priority: Accept Header
- accept := strings.ToLower(r.Header.Get("Accept"))
- if strings.Contains(accept, "application/json") {
- return "json"
- }
- if strings.Contains(accept, "text/markdown") || strings.Contains(accept, "text/x-markdown") {
- return "md"
- }
- if strings.Contains(accept, "text/plain") {
- return "text"
- }
- if strings.Contains(accept, "text/html") {
- return "html"
- }
-
- // 3. Priority: LLM Detection (defaults to markdown)
- if isLLM(r) {
- return "md"
- }
-
- return "html"
-}
-
-/**
- * reconstructTargetURL handles query parameter extraction quirks caused by Vercel rewrites.
- *
- * When Vercel rewrites a path like `/api/extract?url=http://example.com?foo=bar`,
- * the `url` query parameter might be cleanly separated from `foo=bar`.
- * This function merges stray query parameters back into the target URL to ensure
- * the full original URL is processed.
- */
-func reconstructTargetURL(r *http.Request) string {
- rawLink := r.URL.Query().Get("url")
- if rawLink == "" {
- return ""
- }
-
- // Reconstruct URL if it was split by query parameters during rewrite
- u, err := url.Parse(rawLink)
- if err != nil {
- return rawLink
- }
-
- targetQuery := u.Query()
- originalQuery := r.URL.Query()
- hasChanges := false
- for k, vs := range originalQuery {
- // Skip 'url' and 'format' as they are control parameters for this API,
- // not part of the target website's query string.
- // Including them would cause recursion or invalid target URLs.
- if k == "url" || k == "format" {
- continue
- }
- hasChanges = true
- for _, v := range vs {
- targetQuery.Add(k, v)
- }
- }
- if hasChanges {
- u.RawQuery = targetQuery.Encode()
- return u.String()
- }
- return rawLink
-}
-
/**
* handler implements the core request processing pipeline.
*
@@ -489,12 +71,12 @@ func reconstructTargetURL(r *http.Request) string {
* 6. Format: Outputs the result in the requested format (HTML, Markdown, JSON, etc.).
*/
func handler(w http.ResponseWriter, r *http.Request) {
- rawLink := reconstructTargetURL(r)
+ rawLink := request.ReconstructURL(r)
- format := getFormat(r)
+ format := request.GetFormat(r)
log.Printf("request: %s %s", format, rawLink)
- link, err := normalizeAndValidateURL(rawLink)
+ link, err := request.NormalizeURL(rawLink)
if err != nil {
log.Printf("error normalizing URL %q: %v", rawLink, err)
writeError(w, http.StatusBadRequest, "Invalid URL provided")
@@ -504,7 +86,8 @@ func handler(w http.ResponseWriter, r *http.Request) {
ctx, cancel := context.WithTimeout(r.Context(), handlerTimeout)
defer cancel()
- article, err := fetchAndParse(ctx, link, r)
+ safeClient := transport.NewSafeClient()
+ art, err := article.Fetch(ctx, link, r, safeClient)
if err != nil {
log.Printf("error fetching or parsing URL %q: %v", rawLink, err)
writeError(w, http.StatusUnprocessableEntity, "Failed to process URL")
@@ -512,17 +95,16 @@ func handler(w http.ResponseWriter, r *http.Request) {
}
contentBuf := &bytes.Buffer{}
- if err := article.RenderHTML(contentBuf); err != nil {
+ if err := art.RenderHTML(contentBuf); err != nil {
writeError(w, http.StatusInternalServerError, "failed to render article content")
return
}
- formatter, found := formatters[format]
- if !found {
+ if err := formatter.Render(w, art, contentBuf, format); err != nil {
+ log.Printf("error rendering response: %v", err)
writeError(w, http.StatusBadRequest, "invalid format")
return
}
- formatter(w, article, contentBuf)
}
/**
diff --git a/api/index_test.go b/api/index_test.go
index e606bf1..7f817eb 100644
--- a/api/index_test.go
+++ b/api/index_test.go
@@ -1,128 +1,19 @@
package handler
import (
- "context"
"net/http"
"net/http/httptest"
- "net/url"
- "strings"
"testing"
)
-func TestNormalizeAndValidateURL(t *testing.T) {
- tests := []struct {
- raw string
- want string // expected host (with scheme)
- shouldErr bool
- }{
- {"", "", true},
- {"example.com", "https://example.com", false},
- {"http://foo.bar", "http://foo.bar", false},
- {"https:/go.dev/play", "https://go.dev", false},
- {"http:/example.com", "http://example.com", false},
- {"ftp://foo.bar", "", true},
- }
- for _, tt := range tests {
- u, err := normalizeAndValidateURL(tt.raw)
- if tt.shouldErr {
- if err == nil {
- t.Errorf("normalizeAndValidateURL(%q) expected error, got none", tt.raw)
- }
- continue
- }
- if err != nil {
- t.Errorf("normalizeAndValidateURL(%q) unexpected error: %v", tt.raw, err)
- continue
- }
- got := u.Scheme + "://" + u.Host
- if got != tt.want {
- t.Errorf("normalizeAndValidateURL(%q) = %q; want %q", tt.raw, got, tt.want)
- }
- }
-}
-
-func TestFetchAndParse(t *testing.T) {
- // Serve a minimal HTML page
- htmlBody := `Test TitleHello World
`
- srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
- if _, err := w.Write([]byte(htmlBody)); err != nil {
- t.Errorf("failed to write response: %v", err)
- }
- }))
- defer srv.Close()
-
- // Override httpClient to use server's client
- oldClient := httpClient
- httpClient = srv.Client()
- defer func() { httpClient = oldClient }()
-
- u, err := url.Parse(srv.URL)
- if err != nil {
- t.Fatalf("failed to parse server URL: %v", err)
- }
- ctx := context.Background()
- req := httptest.NewRequest("GET", "/", nil)
- art, err := fetchAndParse(ctx, u, req)
- if err != nil {
- t.Fatalf("fetchAndParse returned error: %v", err)
- }
- if art.Title() != "Test Title" {
- t.Errorf("Article.Title() = %q; want %q", art.Title(), "Test Title")
- }
-
- var content strings.Builder
- err = art.RenderHTML(&content)
- if err != nil {
- t.Fatalf("failed to render article content: %v", err)
- }
-
- if !strings.Contains(content.String(), "Hello World") {
- t.Errorf("Article.Content missing expected paragraph, got: %q", content.String())
- }
-}
-
-/**
- * TestSSRFProtection confirms that the custom dialer correctly blocks connections
- * to private and loopback IP addresses.
- *
- * This is a critical security control to prevent the application from being used
- * as a proxy to attack internal infrastructure (SSRF).
- */
-func TestSSRFProtection(t *testing.T) {
- // a dummy server that should never be reached
- srv := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {
- t.Fatal("dialer did not block private IP, connection was made")
- }))
- defer srv.Close()
+func TestHandler_InvalidURL(t *testing.T) {
+ req := httptest.NewRequest("GET", "/api?url=", nil)
+ w := httptest.NewRecorder()
- // get loopback address of the server
- // srv.URL will be something like http://127.0.0.1:54321
- // we want to test if the dialer blocks the connection to 127.0.0.1
- // so, we don't use the server's client, we use our own httpClient
- req, err := http.NewRequest("GET", srv.URL, nil)
- if err != nil {
- t.Fatalf("failed to create request: %v", err)
- }
+ Handler(w, req)
- _, err = httpClient.Do(req)
- if err == nil {
- t.Fatal("expected an error when dialing a private IP, but got none")
- }
- // check if the error is the one we expect from our dialer
- // the error is wrapped, so we need to check for the substring
- if !strings.Contains(err.Error(), "refusing to connect to private network address") {
- t.Errorf("expected error to contain 'refusing to connect to private network address', but got: %v", err)
- }
-
- // Test Unspecified IP (0.0.0.0) bypass attempt
- // We manually construct a URL with 0.0.0.0 and a port (it doesn't need to be open for the check to fire)
- unspecifiedURL := "http://0.0.0.0:8080"
- reqUnspecified, _ := http.NewRequest("GET", unspecifiedURL, nil)
- _, err = httpClient.Do(reqUnspecified)
- if err == nil {
- t.Fatal("expected an error when dialing 0.0.0.0, but got none")
- }
- if !strings.Contains(err.Error(), "refusing to connect to private network address") {
- t.Errorf("expected error for 0.0.0.0 to contain 'refusing to connect to private network address', but got: %v", err)
+ resp := w.Result()
+ if resp.StatusCode != http.StatusBadRequest {
+ t.Errorf("Handler() status = %v; want %v", resp.StatusCode, http.StatusBadRequest)
}
}
diff --git a/api/llm_test.go b/api/llm_test.go
deleted file mode 100644
index 1079558..0000000
--- a/api/llm_test.go
+++ /dev/null
@@ -1,59 +0,0 @@
-package handler
-
-import (
- "net/http/httptest"
- "testing"
-)
-
-/**
- * TestIsLLM verifies the detection of Large Language Model (LLM) bots.
- *
- * This ensures that when an LLM (like GPTBot) accesses the service, it
- * automatically receives Markdown content, which is more token-efficient
- * and easier for the model to process than full HTML.
- */
-func TestIsLLM(t *testing.T) {
- tests := []struct {
- ua string
- want bool
- }{
- {"Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)", true},
- {"ChatGPT-User/1.0", true},
- {"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", false},
- }
-
- for _, tt := range tests {
- req := httptest.NewRequest("GET", "/", nil)
- req.Header.Set("User-Agent", tt.ua)
- if got := isLLM(req); got != tt.want {
- t.Errorf("isLLM(UA=%q) = %v; want %v", tt.ua, tt.want, got)
- }
- }
-}
-
-func TestGetFormat(t *testing.T) {
- tests := []struct {
- urlStr string
- ua string
- accept string
- want string
- }{
- {"/api?url=...&format=json", "", "", "json"},
- {"/api?url=...", "ChatGPT-User/1.0", "", "md"},
- {"/api?url=...", "Mozilla/5.0", "", "html"},
- {"/api?url=...", "Mozilla/5.0", "application/json", "json"},
- {"/api?url=...", "Mozilla/5.0", "text/markdown", "md"},
- {"/api?url=...", "Mozilla/5.0", "text/plain", "text"},
- // Query param should override Accept
- {"/api?url=...&format=txt", "Mozilla/5.0", "application/json", "txt"},
- }
-
- for _, tt := range tests {
- req := httptest.NewRequest("GET", tt.urlStr, nil)
- req.Header.Set("User-Agent", tt.ua)
- req.Header.Set("Accept", tt.accept)
- if got := getFormat(req); got != tt.want {
- t.Errorf("getFormat(%q, UA=%q, Accept=%q) = %q; want %q", tt.urlStr, tt.ua, tt.accept, got, tt.want)
- }
- }
-}
diff --git a/api/reconstruct_test.go b/api/reconstruct_test.go
deleted file mode 100644
index c510265..0000000
--- a/api/reconstruct_test.go
+++ /dev/null
@@ -1,103 +0,0 @@
-package handler
-
-import (
- "net/http"
- "net/url"
- "reflect"
- "testing"
-)
-
-/**
- * TestReconstructTargetURL verifies the logic for reassembling URLs that have been
- * split by Vercel's rewrite rules.
- *
- * When Vercel rewrites a request like `/api?url=http://example.com?foo=bar`,
- * it parses the query string *before* passing it to the Go handler. This often
- * results in `url=http://example.com` and `foo=bar` being treated as separate
- * parameters, rather than `foo=bar` being part of the `url` value.
- *
- * The reconstruction logic detects these "stray" parameters and merges them
- * back into the target URL to ensure the fetcher requests the correct resource.
- */
-func TestReconstructTargetURL(t *testing.T) {
- tests := []struct {
- name string
- query string
- expected string
- }{
- {
- name: "simple url",
- query: "?url=http://example.com",
- expected: "http://example.com",
- },
- {
- name: "url with encoded params",
- query: "?url=http%3A%2F%2Fexample.com%3Ffoo%3Dbar",
- expected: "http://example.com?foo=bar",
- },
- {
- name: "split params",
- query: "?url=http://example.com&foo=bar&baz=qux",
- expected: "http://example.com?foo=bar&baz=qux",
- },
- {
- name: "split params with existing params",
- query: "?url=http://example.com?a=b&c=d",
- expected: "http://example.com?a=b&c=d",
- },
- {
- name: "mixed params",
- query: "?url=http%3A%2F%2Fexample.com%3Fa%3Db&c=d",
- expected: "http://example.com?a=b&c=d",
- },
- {
- name: "ignore format param",
- query: "?url=http://example.com&format=json&foo=bar",
- expected: "http://example.com?foo=bar",
- },
- {
- name: "empty url",
- query: "?format=json",
- expected: "",
- },
- }
-
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- u, _ := url.Parse("http://localhost/api" + tt.query)
- r := &http.Request{URL: u}
- got := reconstructTargetURL(r)
-
- if got == "" && tt.expected == "" {
- return
- }
-
- gotU, _ := url.Parse(got)
- expU, _ := url.Parse(tt.expected)
-
- if gotU == nil || expU == nil {
- if got != tt.expected {
- t.Errorf("reconstructTargetURL() = %v, want %v", got, tt.expected)
- }
- return
- }
-
- if gotU.Scheme != expU.Scheme || gotU.Host != expU.Host || gotU.Path != expU.Path {
- t.Errorf("reconstructTargetURL() base mismatch = %v, want %v", got, tt.expected)
- }
-
- gotQ := gotU.Query()
- expQ := expU.Query()
-
- if len(gotQ) != len(expQ) {
- t.Errorf("reconstructTargetURL() query length mismatch = %v, want %v", gotQ, expQ)
- }
-
- for k, v := range expQ {
- if !reflect.DeepEqual(gotQ[k], v) {
- t.Errorf("reconstructTargetURL() param %s mismatch = %v, want %v", k, gotQ[k], v)
- }
- }
- })
- }
-}
diff --git a/internal/article/fetch.go b/internal/article/fetch.go
new file mode 100644
index 0000000..d1699e3
--- /dev/null
+++ b/internal/article/fetch.go
@@ -0,0 +1,100 @@
+package article
+
+import (
+ "context"
+ "io"
+ "math/rand"
+ "net/http"
+ "net/url"
+
+ "codeberg.org/readeck/go-readability/v2"
+ "golang.org/x/net/html"
+)
+
+const maxBodySize = int64(2 * 1024 * 1024) // 2 MiB
+
+/**
+ * ReadabilityParser is the shared instance of the readability parser.
+ *
+ * It is reusable and thread-safe, allowing concurrent processing of multiple
+ * requests without the need to create new parser instances.
+ */
+var ReadabilityParser = readability.NewParser()
+
+/**
+ * userAgentPool contains a list of real browser User-Agent strings.
+ *
+ * We rotate through these to mimic legitimate traffic, as many websites block requests
+ * from default HTTP clients (like Go-http-client) or known bot User-Agents.
+ * This list requires periodic maintenance to stay current with browser versions.
+ */
+var userAgentPool = []string{
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 18_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 Mobile/15E148 Safari/604.1",
+}
+
+/**
+ * getRandomUserAgent returns a random User-Agent string from the pool.
+ *
+ * Rotating User-Agents helps to evade simple anti-bot measures that block requests
+ * based on static or default Go HTTP client User-Agents.
+ */
+func getRandomUserAgent() string {
+ return userAgentPool[rand.Intn(len(userAgentPool))]
+}
+
+/**
+ * Fetch retrieves the content from the target URL and parses it using the readability library.
+ *
+ * Key behaviors:
+ * - Spoofs User-Agent and other browser headers to avoid blocking.
+ * - Forwards Accept-Language from the client to respect language preferences.
+ * - Sets security headers (Sec-Fetch-*) to look like a navigation request.
+ * - Limits the response body size to maxBodySize to prevent Out-Of-Memory (OOM) crashes on large pages.
+ * - Uses the provided httpClient which should have SSRF protection configured.
+ */
+func Fetch(ctx context.Context, link *url.URL, r *http.Request, client *http.Client) (readability.Article, error) {
+ req, err := http.NewRequestWithContext(ctx, "GET", link.String(), nil)
+ if err != nil {
+ return readability.Article{}, err
+ }
+
+ // Always spoof everything to look like a real browser
+ ua := getRandomUserAgent()
+ req.Header.Set("User-Agent", ua)
+ req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8")
+
+ // Fallback headers from client request
+ if lang := r.Header.Get("Accept-Language"); lang != "" {
+ req.Header.Set("Accept-Language", lang)
+ } else {
+ req.Header.Set("Accept-Language", "en-US,en;q=0.9")
+ }
+
+ req.Header.Set("Cache-Control", "no-cache")
+ req.Header.Set("Pragma", "no-cache")
+ req.Header.Set("Sec-Ch-Ua-Mobile", "?0")
+ req.Header.Set("Sec-Fetch-Dest", "document")
+ req.Header.Set("Sec-Fetch-Mode", "navigate")
+ req.Header.Set("Sec-Fetch-Site", "none")
+ req.Header.Set("Sec-Fetch-User", "?1")
+ req.Header.Set("Upgrade-Insecure-Requests", "1")
+
+ res, err := client.Do(req)
+ if err != nil {
+ return readability.Article{}, err
+ }
+ defer res.Body.Close()
+
+ // limit body size to prevent OOM
+ reader := io.LimitReader(res.Body, maxBodySize)
+ node, err := html.Parse(reader)
+ if err != nil {
+ return readability.Article{}, err
+ }
+
+ return ReadabilityParser.ParseDocument(node, link)
+}
diff --git a/internal/article/fetch_test.go b/internal/article/fetch_test.go
new file mode 100644
index 0000000..a2c043b
--- /dev/null
+++ b/internal/article/fetch_test.go
@@ -0,0 +1,47 @@
+package article
+
+import (
+ "context"
+ "net/http"
+ "net/http/httptest"
+ "net/url"
+ "strings"
+ "testing"
+)
+
+// TestFetch serves a small fixed HTML page and checks that Fetch extracts both
+// the document title and the body paragraph.
+func TestFetch(t *testing.T) {
+	// Serve a minimal HTML page.
+	// NOTE(review): this fixture was reconstructed after the original markup was
+	// mangled in transit; the assertions only require the <title> text and the
+	// "Hello World" paragraph to survive extraction.
+	htmlBody := `<!DOCTYPE html>
+<html><head><title>Test Title</title></head>
+<body><article><p>Hello World</p></article></body></html>`
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		if _, err := w.Write([]byte(htmlBody)); err != nil {
+			t.Errorf("failed to write response: %v", err)
+		}
+	}))
+	defer srv.Close()
+
+	u, err := url.Parse(srv.URL)
+	if err != nil {
+		t.Fatalf("failed to parse server URL: %v", err)
+	}
+	ctx := context.Background()
+	req := httptest.NewRequest("GET", "/", nil)
+
+	// Use server's client which is configured to talk to the test server
+	art, err := Fetch(ctx, u, req, srv.Client())
+	if err != nil {
+		t.Fatalf("Fetch returned error: %v", err)
+	}
+	if art.Title() != "Test Title" {
+		t.Errorf("Article.Title() = %q; want %q", art.Title(), "Test Title")
+	}
+
+	var content strings.Builder
+	err = art.RenderHTML(&content)
+	if err != nil {
+		t.Fatalf("failed to render article content: %v", err)
+	}
+
+	if !strings.Contains(content.String(), "Hello World") {
+		t.Errorf("Article.Content missing expected paragraph, got: %q", content.String())
+	}
+}
diff --git a/internal/formatter/render.go b/internal/formatter/render.go
new file mode 100644
index 0000000..862862c
--- /dev/null
+++ b/internal/formatter/render.go
@@ -0,0 +1,109 @@
+package formatter
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "html/template"
+ "log"
+ "net/http"
+
+ "codeberg.org/readeck/go-readability/v2"
+ "github.com/mattn/godown"
+)
+
+/**
+ * Template is the raw HTML template string used for rendering the article.
+ *
+ * It provides a minimal HTML5 structure and includes the Sakura CSS library
+ * for a clean, typography-focused reading experience without distractions.
+ * The template expects a struct with Title and Content fields.
+ */
+const Template = `
+
+
+
+
+
+
+
+
+
+ {{.Title}}
+ {{.Content}}
+
+
+`
+
+/**
+ * DefaultTemplate is the parsed Go template instance.
+ *
+ * It is initialized at startup to avoid the overhead of parsing the template
+ * on every request, ensuring faster response times.
+ * template.Must panics during package initialization if Template is invalid,
+ * surfacing template errors at startup rather than per-request.
+ */
+var DefaultTemplate = template.Must(template.New("article").Parse(Template))
+
+/**
+ * formatHandler defines the function signature for handling different output formats.
+ *
+ * Implementations set the Content-Type header and write the rendered body to w.
+ */
+type formatHandler func(w http.ResponseWriter, article readability.Article, buf *bytes.Buffer)
+
+// formatters maps each supported format name (and its aliases) to its renderer.
+var formatters = map[string]formatHandler{
+	"html":     formatHTML,
+	"md":       formatMarkdown,
+	"markdown": formatMarkdown,
+	"json":     formatJSON,
+	"text":     formatText,
+	"txt":      formatText,
+}
+
+/**
+ * Render writes the article to w in the requested output format.
+ *
+ * Supported formats are the keys of the formatters map (html, md, markdown,
+ * json, text, txt). An unknown format returns an error and writes nothing.
+ */
+func Render(w http.ResponseWriter, article readability.Article, contentBuf *bytes.Buffer, format string) error {
+	if handler, ok := formatters[format]; ok {
+		handler(w, article, contentBuf)
+		return nil
+	}
+	return fmt.Errorf("invalid format: %s", format)
+}
+
+// formatHTML renders the article as a complete HTML page via DefaultTemplate.
+// The extracted content is wrapped in template.HTML so html/template emits it
+// verbatim instead of escaping it.
+func formatHTML(w http.ResponseWriter, article readability.Article, contentBuf *bytes.Buffer) {
+	w.Header().Set("Content-Type", "text/html; charset=utf-8")
+	page := struct {
+		Title   string
+		Content template.HTML
+	}{
+		article.Title(),
+		template.HTML(contentBuf.String()),
+	}
+	if err := DefaultTemplate.Execute(w, page); err != nil {
+		log.Printf("error executing HTML template: %v", err)
+	}
+}
+
+// formatMarkdown converts the extracted HTML in buf to Markdown and streams it
+// to the client. The article parameter is unused because godown operates
+// directly on the HTML buffer.
+func formatMarkdown(w http.ResponseWriter, _ readability.Article, buf *bytes.Buffer) {
+	w.Header().Set("Content-Type", "text/markdown")
+	if err := godown.Convert(w, buf, nil); err != nil {
+		log.Printf("error converting to markdown: %v", err)
+	}
+}
+
+// formatJSON writes the article as a flat JSON object with "title" and
+// "content" string fields, where content is the extracted HTML.
+func formatJSON(w http.ResponseWriter, article readability.Article, buf *bytes.Buffer) {
+	w.Header().Set("Content-Type", "application/json")
+	payload := map[string]string{
+		"title":   article.Title(),
+		"content": buf.String(),
+	}
+	if err := json.NewEncoder(w).Encode(payload); err != nil {
+		log.Printf("error encoding json: %v", err)
+	}
+}
+
+// formatText writes the buffered content verbatim as plain text.
+// NOTE(review): buf appears to hold the extracted HTML, so "text" output may
+// still contain markup — confirm whether tag stripping is expected here.
+func formatText(w http.ResponseWriter, _ readability.Article, buf *bytes.Buffer) {
+	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+	if _, err := w.Write(buf.Bytes()); err != nil {
+		log.Printf("error writing text response: %v", err)
+	}
+}
diff --git a/internal/request/utils.go b/internal/request/utils.go
new file mode 100644
index 0000000..abbb759
--- /dev/null
+++ b/internal/request/utils.go
@@ -0,0 +1,155 @@
+package request
+
+import (
+ "errors"
+ "fmt"
+ "net/http"
+ "net/url"
+ "strings"
+)
+
+// llmUserAgents lists lowercase substrings identifying known LLM crawlers and
+// AI tools; IsLLM matches them case-insensitively against the User-Agent.
+// NOTE(review): "claudebot" is redundant — the "claude" entry already matches it.
+var llmUserAgents = []string{
+	"gptbot",
+	"chatgpt",
+	"claude",
+	"googlebot",
+	"bingbot",
+	"anthropic",
+	"perplexity",
+	"claudebot",
+	"github-copilot",
+}
+
+/**
+ * IsLLM reports whether the request appears to originate from a known LLM
+ * crawler or tool.
+ *
+ * Detection is a case-insensitive substring match of the User-Agent header
+ * against llmUserAgents, which lets the API default such clients to a
+ * machine-friendly format (Markdown).
+ */
+func IsLLM(r *http.Request) bool {
+	agent := strings.ToLower(r.UserAgent())
+	for _, marker := range llmUserAgents {
+		if strings.Contains(agent, marker) {
+			return true
+		}
+	}
+	return false
+}
+
+/**
+ * GetFormat determines the desired output format based on request signals.
+ *
+ * Priority order:
+ * 1. Query parameter 'format' (explicit override).
+ * 2. Accept header (content negotiation).
+ * 3. LLM detection (auto-switch to Markdown for bots).
+ * 4. Default to 'html'.
+ */
+func GetFormat(r *http.Request) string {
+	// 1. An explicit query parameter always wins.
+	if f := r.URL.Query().Get("format"); f != "" {
+		return f
+	}
+
+	// 2. Content negotiation; cases are checked in priority order because an
+	// Accept header may list several media types.
+	accept := strings.ToLower(r.Header.Get("Accept"))
+	switch {
+	case strings.Contains(accept, "application/json"):
+		return "json"
+	case strings.Contains(accept, "text/markdown"), strings.Contains(accept, "text/x-markdown"):
+		return "md"
+	case strings.Contains(accept, "text/plain"):
+		return "text"
+	case strings.Contains(accept, "text/html"):
+		return "html"
+	}
+
+	// 3. Known LLM crawlers get Markdown by default.
+	if IsLLM(r) {
+		return "md"
+	}
+
+	// 4. Fallback for everything else.
+	return "html"
+}
+
+/**
+ * ReconstructURL handles query parameter extraction quirks caused by Vercel rewrites.
+ *
+ * When Vercel rewrites a path like `/api/extract?url=http://example.com?foo=bar`,
+ * the `url` query parameter can end up separated from `foo=bar`. This function
+ * merges such stray parameters back into the target URL so the full original
+ * URL is processed.
+ */
+func ReconstructURL(r *http.Request) string {
+	rawLink := r.URL.Query().Get("url")
+	if rawLink == "" {
+		return ""
+	}
+
+	target, err := url.Parse(rawLink)
+	if err != nil {
+		// Unparseable: hand the raw value back and let the caller validate it.
+		return rawLink
+	}
+
+	merged := target.Query()
+	stray := false
+	for key, values := range r.URL.Query() {
+		// 'url' and 'format' are control parameters of this API, not part of
+		// the target website's query string. Merging them would cause
+		// recursion or invalid target URLs.
+		if key == "url" || key == "format" {
+			continue
+		}
+		stray = true
+		for _, value := range values {
+			merged.Add(key, value)
+		}
+	}
+	if !stray {
+		return rawLink
+	}
+	target.RawQuery = merged.Encode()
+	return target.String()
+}
+
+/**
+ * NormalizeURL cleans and validates the user-provided URL.
+ *
+ * Handled normalization issues:
+ * - Missing scheme (defaults to https://).
+ * - Malformed schemes produced by some proxies (http:/example.com -> http://example.com).
+ *
+ * The scheme is restricted to 'http' or 'https' so protocols like 'file://'
+ * or 'gopher://' cannot be used.
+ */
+func NormalizeURL(rawLink string) (*url.URL, error) {
+	if rawLink == "" {
+		return nil, errors.New("url parameter is empty")
+	}
+
+	// Some browsers/proxies collapse "://" into ":/"; undo that first.
+	switch {
+	case strings.HasPrefix(rawLink, "http:/") && !strings.HasPrefix(rawLink, "http://"):
+		rawLink = "http://" + strings.TrimPrefix(rawLink, "http:/")
+	case strings.HasPrefix(rawLink, "https:/") && !strings.HasPrefix(rawLink, "https://"):
+		rawLink = "https://" + strings.TrimPrefix(rawLink, "https:/")
+	}
+
+	// Scheme-less input defaults to https.
+	if !strings.Contains(rawLink, "://") {
+		rawLink = "https://" + rawLink
+	}
+
+	link, err := url.Parse(rawLink)
+	if err != nil {
+		return nil, fmt.Errorf("invalid URL: %w", err)
+	}
+	// Only web schemes are allowed.
+	if link.Scheme != "http" && link.Scheme != "https" {
+		return nil, errors.New("unsupported URL scheme")
+	}
+	return link, nil
+}
diff --git a/internal/request/utils_test.go b/internal/request/utils_test.go
new file mode 100644
index 0000000..55d4713
--- /dev/null
+++ b/internal/request/utils_test.go
@@ -0,0 +1,170 @@
+package request
+
+import (
+ "net/http"
+ "net/http/httptest"
+ "net/url"
+ "reflect"
+ "testing"
+)
+
+// TestIsLLM checks User-Agent strings from known AI crawlers against a normal
+// browser UA.
+func TestIsLLM(t *testing.T) {
+	tests := []struct {
+		ua   string
+		want bool
+	}{
+		{"Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)", true},
+		{"ChatGPT-User/1.0", true},
+		{"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", false},
+	}
+
+	for _, tt := range tests {
+		req := httptest.NewRequest("GET", "/", nil)
+		req.Header.Set("User-Agent", tt.ua)
+		if got := IsLLM(req); got != tt.want {
+			// Fixed: got/want were passed in the wrong order, so the failure
+			// message reported the expected value as the actual one.
+			t.Errorf("IsLLM(UA=%q) = %v; want %v", tt.ua, got, tt.want)
+		}
+	}
+}
+
+// TestGetFormat covers the format-selection priority: explicit query param,
+// Accept header, LLM User-Agent detection, then the html default.
+func TestGetFormat(t *testing.T) {
+	cases := []struct {
+		urlStr string
+		ua     string
+		accept string
+		want   string
+	}{
+		{"/api?url=...&format=json", "", "", "json"},
+		{"/api?url=...", "ChatGPT-User/1.0", "", "md"},
+		{"/api?url=...", "Mozilla/5.0", "", "html"},
+		{"/api?url=...", "Mozilla/5.0", "application/json", "json"},
+		{"/api?url=...", "Mozilla/5.0", "text/markdown", "md"},
+		{"/api?url=...", "Mozilla/5.0", "text/plain", "text"},
+		// Query param should override Accept
+		{"/api?url=...&format=txt", "Mozilla/5.0", "application/json", "txt"},
+	}
+
+	for _, tc := range cases {
+		req := httptest.NewRequest("GET", tc.urlStr, nil)
+		req.Header.Set("User-Agent", tc.ua)
+		req.Header.Set("Accept", tc.accept)
+		got := GetFormat(req)
+		if got != tc.want {
+			t.Errorf("GetFormat(%q, UA=%q, Accept=%q) = %q; want %q", tc.urlStr, tc.ua, tc.accept, got, tc.want)
+		}
+	}
+}
+
+// TestReconstructTargetURL verifies that stray query parameters (split off the
+// target URL by a proxy rewrite) are merged back into it, while the API's own
+// control parameters ('url' and 'format') are dropped.
+func TestReconstructTargetURL(t *testing.T) {
+	tests := []struct {
+		name     string
+		query    string
+		expected string
+	}{
+		{
+			name:     "simple url",
+			query:    "?url=http://example.com",
+			expected: "http://example.com",
+		},
+		{
+			name:     "url with encoded params",
+			query:    "?url=http%3A%2F%2Fexample.com%3Ffoo%3Dbar",
+			expected: "http://example.com?foo=bar",
+		},
+		{
+			name:     "split params",
+			query:    "?url=http://example.com&foo=bar&baz=qux",
+			expected: "http://example.com?foo=bar&baz=qux",
+		},
+		{
+			name:     "split params with existing params",
+			query:    "?url=http://example.com?a=b&c=d",
+			expected: "http://example.com?a=b&c=d",
+		},
+		{
+			name:     "mixed params",
+			query:    "?url=http%3A%2F%2Fexample.com%3Fa%3Db&c=d",
+			expected: "http://example.com?a=b&c=d",
+		},
+		{
+			name:     "ignore format param",
+			query:    "?url=http://example.com&format=json&foo=bar",
+			expected: "http://example.com?foo=bar",
+		},
+		{
+			name:     "empty url",
+			query:    "?format=json",
+			expected: "",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			u, _ := url.Parse("http://localhost/api" + tt.query)
+			r := &http.Request{URL: u}
+			got := ReconstructURL(r)
+
+			// Both empty: nothing further to compare.
+			if got == "" && tt.expected == "" {
+				return
+			}
+
+			// Compare as parsed URLs rather than raw strings because
+			// Encode() sorts query keys, making string order unstable.
+			gotU, _ := url.Parse(got)
+			expU, _ := url.Parse(tt.expected)
+
+			if gotU == nil || expU == nil {
+				if got != tt.expected {
+					t.Errorf("ReconstructURL() = %v, want %v", got, tt.expected)
+				}
+				return
+			}
+
+			if gotU.Scheme != expU.Scheme || gotU.Host != expU.Host || gotU.Path != expU.Path {
+				t.Errorf("ReconstructURL() base mismatch = %v, want %v", got, tt.expected)
+			}
+
+			gotQ := gotU.Query()
+			expQ := expU.Query()
+
+			if len(gotQ) != len(expQ) {
+				t.Errorf("ReconstructURL() query length mismatch = %v, want %v", gotQ, expQ)
+			}
+
+			// Every expected parameter must match exactly, including
+			// multi-valued keys.
+			for k, v := range expQ {
+				if !reflect.DeepEqual(gotQ[k], v) {
+					t.Errorf("ReconstructURL() param %s mismatch = %v, want %v", k, gotQ[k], v)
+				}
+			}
+		})
+	}
+}
+
+// TestNormalizeURL checks scheme defaulting (https), repair of collapsed
+// "scheme:/" prefixes, and rejection of empty input and non-HTTP schemes.
+func TestNormalizeURL(t *testing.T) {
+	tests := []struct {
+		raw       string
+		want      string // expected host (with scheme)
+		shouldErr bool
+	}{
+		{"", "", true},
+		{"example.com", "https://example.com", false},
+		{"http://foo.bar", "http://foo.bar", false},
+		{"https:/go.dev/play", "https://go.dev", false},
+		{"http:/example.com", "http://example.com", false},
+		{"ftp://foo.bar", "", true},
+	}
+	for _, tt := range tests {
+		u, err := NormalizeURL(tt.raw)
+		if tt.shouldErr {
+			if err == nil {
+				t.Errorf("NormalizeURL(%q) expected error, got none", tt.raw)
+			}
+			continue
+		}
+		if err != nil {
+			t.Errorf("NormalizeURL(%q) unexpected error: %v", tt.raw, err)
+			continue
+		}
+		// Only scheme+host are compared; paths are irrelevant to normalization.
+		got := u.Scheme + "://" + u.Host
+		if got != tt.want {
+			t.Errorf("NormalizeURL(%q) = %q; want %q", tt.raw, got, tt.want)
+		}
+	}
+}
diff --git a/internal/transport/client.go b/internal/transport/client.go
new file mode 100644
index 0000000..db5f909
--- /dev/null
+++ b/internal/transport/client.go
@@ -0,0 +1,64 @@
+package transport
+
+import (
+ "errors"
+ "fmt"
+ "net"
+ "net/http"
+ "syscall"
+ "time"
+)
+
+const (
+	maxRedirects      = 5                // redirect hops allowed before the client aborts
+	httpClientTimeout = 10 * time.Second // overall per-request deadline
+	dialerTimeout     = 30 * time.Second // TCP connect timeout
+	dialerKeepAlive   = 30 * time.Second // TCP keep-alive probe interval
+)
+
+/**
+ * NewSafeClient creates a custom http.Client that prevents Server-Side Request Forgery (SSRF).
+ *
+ * It uses a custom dialer that validates the resolved IP address before connecting, ensuring that it is not:
+ * - A private network address (e.g., 192.168.x.x, 10.x.x.x)
+ * - A loopback address (e.g., 127.0.0.1)
+ * - An unspecified address (e.g., 0.0.0.0)
+ *
+ * The client also enforces an overall request timeout and caps redirect chains.
+ */
+func NewSafeClient() *http.Client {
+	return &http.Client{
+		Transport: &http.Transport{
+			DialContext: newSafeDialer().DialContext,
+		},
+		Timeout: httpClientTimeout,
+		// Cap redirect chains; each redirect hop still goes through the safe
+		// dialer, so redirects cannot bypass the private-address check.
+		CheckRedirect: func(_ *http.Request, via []*http.Request) error {
+			if len(via) >= maxRedirects {
+				return fmt.Errorf("stopped after %d redirects", maxRedirects)
+			}
+			return nil
+		},
+	}
+}
+
+// newSafeDialer builds a net.Dialer whose Control hook rejects connections to
+// private, loopback, link-local, and unspecified addresses.
+func newSafeDialer() *net.Dialer {
+	dialer := &net.Dialer{
+		Timeout:   dialerTimeout,
+		KeepAlive: dialerKeepAlive,
+		// Control runs after name resolution, immediately before the socket
+		// connects, so the check applies to the actual endpoint being dialed.
+		Control: func(_, address string, _ syscall.RawConn) error {
+			host, _, err := net.SplitHostPort(address)
+			if err != nil {
+				return err
+			}
+			// NOTE(review): per net.Dialer.Control docs the address is already
+			// a resolved IP literal, so net.ParseIP would suffice; LookupIP
+			// works because resolving an IP literal performs no DNS query —
+			// confirm before simplifying.
+			ips, err := net.LookupIP(host)
+			if err != nil {
+				return err
+			}
+			for _, ip := range ips {
+				if ip.IsPrivate() || ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsLinkLocalMulticast() || ip.IsUnspecified() {
+					return errors.New("refusing to connect to private network address")
+				}
+			}
+			return nil
+		},
+	}
+	return dialer
+}
diff --git a/internal/transport/client_test.go b/internal/transport/client_test.go
new file mode 100644
index 0000000..4906ee5
--- /dev/null
+++ b/internal/transport/client_test.go
@@ -0,0 +1,54 @@
+package transport
+
+import (
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+)
+
+/**
+ * TestSSRFProtection confirms that the custom dialer correctly blocks connections
+ * to private and loopback IP addresses.
+ *
+ * This is a critical security control to prevent the application from being used
+ * as a proxy to attack internal infrastructure (SSRF).
+ */
+func TestSSRFProtection(t *testing.T) {
+	client := NewSafeClient()
+	// a dummy server that should never be reached.
+	// Fixed: the handler executes on a server goroutine, and t.Fatal (FailNow)
+	// must only be called from the goroutine running the test function, so use
+	// t.Errorf here instead.
+	srv := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {
+		t.Errorf("dialer did not block private IP, connection was made")
+	}))
+	defer srv.Close()
+
+	// get loopback address of the server
+	// srv.URL will be something like http://127.0.0.1:54321
+	// we want to test if the dialer blocks the connection to 127.0.0.1
+	req, err := http.NewRequest("GET", srv.URL, nil)
+	if err != nil {
+		t.Fatalf("failed to create request: %v", err)
+	}
+
+	_, err = client.Do(req)
+	if err == nil {
+		t.Fatal("expected an error when dialing a private IP, but got none")
+	}
+	// check if the error is the one we expect from our dialer
+	// the error is wrapped, so we need to check for the substring
+	if !strings.Contains(err.Error(), "refusing to connect to private network address") {
+		t.Errorf("expected error to contain 'refusing to connect to private network address', but got: %v", err)
+	}
+
+	// Test Unspecified IP (0.0.0.0) bypass attempt
+	// We manually construct a URL with 0.0.0.0 and a port (it doesn't need to be open for the check to fire)
+	unspecifiedURL := "http://0.0.0.0:8080"
+	reqUnspecified, _ := http.NewRequest("GET", unspecifiedURL, nil)
+	_, err = client.Do(reqUnspecified)
+	if err == nil {
+		t.Fatal("expected an error when dialing 0.0.0.0, but got none")
+	}
+	if !strings.Contains(err.Error(), "refusing to connect to private network address") {
+		t.Errorf("expected error for 0.0.0.0 to contain 'refusing to connect to private network address', but got: %v", err)
+	}
+}