-
Notifications
You must be signed in to change notification settings - Fork 113
chore: edge cache headers + agent-aware robots + bot-block middleware #119
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,138 @@ | ||
| # Humans + agents welcome. Training crawlers + SEO scrapers blocked. | ||
|
|
||
| User-agent: Googlebot | ||
| Allow: / | ||
|
|
||
| User-agent: Bingbot | ||
| Allow: / | ||
|
|
||
| User-agent: DuckDuckBot | ||
| Allow: / | ||
|
|
||
| User-agent: Applebot | ||
| Allow: / | ||
|
|
||
| User-agent: ChatGPT-User | ||
| Allow: / | ||
|
|
||
| User-agent: OAI-SearchBot | ||
| Allow: / | ||
|
|
||
| User-agent: PerplexityBot | ||
| Allow: / | ||
|
|
||
| User-agent: Perplexity-User | ||
| Allow: / | ||
|
|
||
| User-agent: Claude-User | ||
| Allow: / | ||
|
|
||
| User-agent: Claude-SearchBot | ||
| Allow: / | ||
|
|
||
| User-agent: FirecrawlAgent | ||
| Allow: / | ||
|
|
||
| User-agent: firecrawl | ||
| Allow: / | ||
|
|
||
| User-agent: Context7Bot | ||
| Allow: / | ||
|
|
||
| User-agent: Crawl4AI | ||
| Allow: / | ||
|
|
||
| User-agent: Clawdbot | ||
| Allow: / | ||
|
|
||
| User-agent: OpenClaw | ||
| Allow: / | ||
|
|
||
| User-agent: Hermes | ||
| Allow: / | ||
|
|
||
| User-agent: GPTBot | ||
| Disallow: / | ||
|
|
||
| User-agent: ClaudeBot | ||
| Disallow: / | ||
|
|
||
| User-agent: anthropic-ai | ||
| Disallow: / | ||
|
|
||
| User-agent: CCBot | ||
| Disallow: / | ||
|
|
||
| User-agent: Google-Extended | ||
| Disallow: / | ||
|
|
||
| User-agent: Applebot-Extended | ||
| Disallow: / | ||
|
|
||
| User-agent: Bytespider | ||
| Disallow: / | ||
|
|
||
| User-agent: Amazonbot | ||
| Disallow: / | ||
|
|
||
| User-agent: FacebookBot | ||
| Disallow: / | ||
|
|
||
| User-agent: Meta-ExternalAgent | ||
| Disallow: / | ||
|
|
||
| User-agent: cohere-ai | ||
| Disallow: / | ||
|
|
||
| User-agent: Diffbot | ||
| Disallow: / | ||
|
|
||
| User-agent: ImagesiftBot | ||
| Disallow: / | ||
|
|
||
| User-agent: Omgilibot | ||
| Disallow: / | ||
|
|
||
| User-agent: peer39_crawler | ||
| Disallow: / | ||
|
|
||
| User-agent: YouBot | ||
| Disallow: / | ||
|
|
||
| User-agent: Timpibot | ||
| Disallow: / | ||
|
|
||
| User-agent: ICC-Crawler | ||
| Disallow: / | ||
|
|
||
| User-agent: AhrefsBot | ||
| Disallow: / | ||
|
|
||
| User-agent: SemrushBot | ||
| Disallow: / | ||
|
|
||
| User-agent: MJ12bot | ||
| Disallow: / | ||
|
|
||
| User-agent: DotBot | ||
| Disallow: / | ||
|
|
||
| User-agent: PetalBot | ||
| Disallow: / | ||
|
|
||
| User-agent: BLEXBot | ||
| Disallow: / | ||
|
|
||
| User-agent: MegaIndex | ||
| Disallow: / | ||
|
|
||
| User-agent: SeznamBot | ||
| Disallow: / | ||
|
|
||
| User-agent: DataForSeoBot | ||
| Disallow: / | ||
|
|
||
| User-agent: * | ||
| Allow: / | ||
|
|
||
| Sitemap: https://skillkit-docs.vercel.app/sitemap.xml |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| import { NextResponse } from 'next/server'; | ||
| import type { NextRequest } from 'next/server'; | ||
|
|
||
| const BLOCK = /GPTBot|ClaudeBot|anthropic-ai|CCBot|Google-Extended|Applebot-Extended|Bytespider|Amazonbot|Meta-ExternalAgent|cohere-ai|Diffbot|ImagesiftBot|Omgilibot|peer39_crawler|YouBot|Timpibot|ICC-Crawler|AhrefsBot|SemrushBot|MJ12bot|DotBot|PetalBot|BLEXBot|MegaIndex|SeznamBot|DataForSeoBot/i; | ||
|
|
||
| const ALLOW = /Googlebot|Bingbot|DuckDuckBot|Applebot(?!-Extended)|ChatGPT-User|OAI-SearchBot|PerplexityBot|Perplexity-User|Claude-User|Claude-SearchBot|FirecrawlAgent|firecrawl|Context7Bot|Crawl4AI|Clawdbot|OpenClaw|Hermes/i; | ||
|
|
||
| export function middleware(req: NextRequest) { | ||
| const ua = req.headers.get('user-agent') || ''; | ||
| if (ALLOW.test(ua)) return NextResponse.next(); | ||
| if (BLOCK.test(ua)) { | ||
|
Comment on lines
+4
to
+11
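There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Blocklist/enforcement mismatch and precedence bug in UA checks: `FacebookBot` is disallowed in robots.txt but is absent from the `BLOCK` regex, and `ALLOW` is tested before `BLOCK`, so a user agent that matches both patterns is let through instead of being blocked.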
Suggested fix:
```diff
-const BLOCK = /GPTBot|ClaudeBot|anthropic-ai|CCBot|Google-Extended|Applebot-Extended|Bytespider|Amazonbot|Meta-ExternalAgent|cohere-ai|Diffbot|ImagesiftBot|Omgilibot|peer39_crawler|YouBot|Timpibot|ICC-Crawler|AhrefsBot|SemrushBot|MJ12bot|DotBot|PetalBot|BLEXBot|MegaIndex|SeznamBot|DataForSeoBot/i;
+const BLOCK = /GPTBot|ClaudeBot|anthropic-ai|CCBot|Google-Extended|Applebot-Extended|Bytespider|Amazonbot|FacebookBot|Meta-ExternalAgent|cohere-ai|Diffbot|ImagesiftBot|Omgilibot|peer39_crawler|YouBot|Timpibot|ICC-Crawler|AhrefsBot|SemrushBot|MJ12bot|DotBot|PetalBot|BLEXBot|MegaIndex|SeznamBot|DataForSeoBot/i;
 export function middleware(req: NextRequest) {
   const ua = req.headers.get('user-agent') || '';
-  if (ALLOW.test(ua)) return NextResponse.next();
   if (BLOCK.test(ua)) {
     return new NextResponse('disallowed by robots.txt', {
       status: 403,
       headers: { 'Cache-Control': 'public, max-age=86400' },
     });
   }
+  if (ALLOW.test(ua)) return NextResponse.next();
   return NextResponse.next();
 }
```
🤖 Prompt for AI Agents |
||
| return new NextResponse('disallowed by robots.txt', { | ||
| status: 403, | ||
| headers: { 'Cache-Control': 'public, max-age=86400' }, | ||
| }); | ||
| } | ||
| return NextResponse.next(); | ||
| } | ||
|
|
||
| export const config = { | ||
| matcher: '/((?!_next/static|_next/image|favicon|robots\\.txt|sitemap\\.xml).*)', | ||
| }; | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,138 @@ | ||
| # Humans + agents welcome. Training crawlers + SEO scrapers blocked. | ||
|
|
||
| User-agent: Googlebot | ||
| Allow: / | ||
|
|
||
| User-agent: Bingbot | ||
| Allow: / | ||
|
|
||
| User-agent: DuckDuckBot | ||
| Allow: / | ||
|
|
||
| User-agent: Applebot | ||
| Allow: / | ||
|
|
||
| User-agent: ChatGPT-User | ||
| Allow: / | ||
|
|
||
| User-agent: OAI-SearchBot | ||
| Allow: / | ||
|
|
||
| User-agent: PerplexityBot | ||
| Allow: / | ||
|
|
||
| User-agent: Perplexity-User | ||
| Allow: / | ||
|
|
||
| User-agent: Claude-User | ||
| Allow: / | ||
|
|
||
| User-agent: Claude-SearchBot | ||
| Allow: / | ||
|
|
||
| User-agent: FirecrawlAgent | ||
| Allow: / | ||
|
|
||
| User-agent: firecrawl | ||
| Allow: / | ||
|
|
||
| User-agent: Context7Bot | ||
| Allow: / | ||
|
|
||
| User-agent: Crawl4AI | ||
| Allow: / | ||
|
|
||
| User-agent: Clawdbot | ||
| Allow: / | ||
|
|
||
| User-agent: OpenClaw | ||
| Allow: / | ||
|
|
||
| User-agent: Hermes | ||
| Allow: / | ||
|
|
||
| User-agent: GPTBot | ||
| Disallow: / | ||
|
|
||
| User-agent: ClaudeBot | ||
| Disallow: / | ||
|
|
||
| User-agent: anthropic-ai | ||
| Disallow: / | ||
|
|
||
| User-agent: CCBot | ||
| Disallow: / | ||
|
|
||
| User-agent: Google-Extended | ||
| Disallow: / | ||
|
|
||
| User-agent: Applebot-Extended | ||
| Disallow: / | ||
|
|
||
| User-agent: Bytespider | ||
| Disallow: / | ||
|
|
||
| User-agent: Amazonbot | ||
| Disallow: / | ||
|
|
||
| User-agent: FacebookBot | ||
| Disallow: / | ||
|
|
||
| User-agent: Meta-ExternalAgent | ||
| Disallow: / | ||
|
|
||
| User-agent: cohere-ai | ||
| Disallow: / | ||
|
|
||
| User-agent: Diffbot | ||
| Disallow: / | ||
|
|
||
| User-agent: ImagesiftBot | ||
| Disallow: / | ||
|
|
||
| User-agent: Omgilibot | ||
| Disallow: / | ||
|
|
||
| User-agent: peer39_crawler | ||
| Disallow: / | ||
|
|
||
| User-agent: YouBot | ||
| Disallow: / | ||
|
|
||
| User-agent: Timpibot | ||
| Disallow: / | ||
|
|
||
| User-agent: ICC-Crawler | ||
| Disallow: / | ||
|
|
||
| User-agent: AhrefsBot | ||
| Disallow: / | ||
|
|
||
| User-agent: SemrushBot | ||
| Disallow: / | ||
|
|
||
| User-agent: MJ12bot | ||
| Disallow: / | ||
|
|
||
| User-agent: DotBot | ||
| Disallow: / | ||
|
|
||
| User-agent: PetalBot | ||
| Disallow: / | ||
|
|
||
| User-agent: BLEXBot | ||
| Disallow: / | ||
|
|
||
| User-agent: MegaIndex | ||
| Disallow: / | ||
|
|
||
| User-agent: SeznamBot | ||
| Disallow: / | ||
|
|
||
| User-agent: DataForSeoBot | ||
| Disallow: / | ||
|
|
||
| User-agent: * | ||
| Allow: / | ||
|
|
||
| Sitemap: https://skillkit.dev/sitemap.xml |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -21,5 +21,47 @@ | |||||||||||||||||||||||||||||||||||||
| "source": "/docs", | ||||||||||||||||||||||||||||||||||||||
| "destination": "https://skillkit-docs.vercel.app/docs" | ||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||
| ], | ||||||||||||||||||||||||||||||||||||||
| "headers": [ | ||||||||||||||||||||||||||||||||||||||
| { | ||||||||||||||||||||||||||||||||||||||
| "source": "/(.*)\\.(css|js|png|jpg|jpeg|svg|webp|woff2|woff|ttf|ico|mp4)", | ||||||||||||||||||||||||||||||||||||||
| "headers": [ | ||||||||||||||||||||||||||||||||||||||
| { "key": "Cache-Control", "value": "public, max-age=86400, s-maxage=604800, stale-while-revalidate=2592000" } | ||||||||||||||||||||||||||||||||||||||
| ] | ||||||||||||||||||||||||||||||||||||||
| }, | ||||||||||||||||||||||||||||||||||||||
| { | ||||||||||||||||||||||||||||||||||||||
| "source": "/assets/(.*)", | ||||||||||||||||||||||||||||||||||||||
| "headers": [ | ||||||||||||||||||||||||||||||||||||||
| { "key": "Cache-Control", "value": "public, max-age=31536000, immutable" } | ||||||||||||||||||||||||||||||||||||||
| ] | ||||||||||||||||||||||||||||||||||||||
| }, | ||||||||||||||||||||||||||||||||||||||
| { | ||||||||||||||||||||||||||||||||||||||
| "source": "/(.*)\\.html", | ||||||||||||||||||||||||||||||||||||||
| "headers": [ | ||||||||||||||||||||||||||||||||||||||
| { "key": "Cache-Control", "value": "public, max-age=300, s-maxage=86400, stale-while-revalidate=604800" } | ||||||||||||||||||||||||||||||||||||||
| ] | ||||||||||||||||||||||||||||||||||||||
| }, | ||||||||||||||||||||||||||||||||||||||
| { | ||||||||||||||||||||||||||||||||||||||
| "source": "/", | ||||||||||||||||||||||||||||||||||||||
| "headers": [ | ||||||||||||||||||||||||||||||||||||||
| { "key": "Cache-Control", "value": "public, max-age=300, s-maxage=86400, stale-while-revalidate=604800" } | ||||||||||||||||||||||||||||||||||||||
| ] | ||||||||||||||||||||||||||||||||||||||
| }, | ||||||||||||||||||||||||||||||||||||||
| { | ||||||||||||||||||||||||||||||||||||||
| "source": "/api", | ||||||||||||||||||||||||||||||||||||||
| "headers": [ | ||||||||||||||||||||||||||||||||||||||
| { "key": "Cache-Control", "value": "public, max-age=300, s-maxage=86400, stale-while-revalidate=604800" } | ||||||||||||||||||||||||||||||||||||||
| ] | ||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||
| ], | ||||||||||||||||||||||||||||||||||||||
| "redirects": [ | ||||||||||||||||||||||||||||||||||||||
| { | ||||||||||||||||||||||||||||||||||||||
| "source": "/((?!robots\\.txt$).*)", | ||||||||||||||||||||||||||||||||||||||
| "has": [ | ||||||||||||||||||||||||||||||||||||||
| { "type": "header", "key": "user-agent", "value": "(?i).*(GPTBot|ClaudeBot|anthropic-ai|CCBot|Google-Extended|Applebot-Extended|Bytespider|Amazonbot|Meta-ExternalAgent|cohere-ai|Diffbot|ImagesiftBot|Omgilibot|peer39_crawler|YouBot|Timpibot|ICC-Crawler|AhrefsBot|SemrushBot|MJ12bot|DotBot|PetalBot|BLEXBot|MegaIndex|SeznamBot|DataForSeoBot).*" } | ||||||||||||||||||||||||||||||||||||||
|
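There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🟡 FacebookBot missing from vercel.json redirect user-agent pattern despite being in robots.txt Disallow list. The user-agent pattern in the redirect rule's `has` condition omits `FacebookBot`, so requests from that crawler are not redirected to `/robots.txt` even though robots.txt disallows it.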
Suggested change
Was this helpful? React with 👍 or 👎 to provide feedback. |
||||||||||||||||||||||||||||||||||||||
| ], | ||||||||||||||||||||||||||||||||||||||
| "destination": "/robots.txt", | ||||||||||||||||||||||||||||||||||||||
| "permanent": false | ||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||
|
Comment on lines
+57
to
+65
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Line 61 omits `FacebookBot` from the redirect rule's user-agent pattern, although robots.txt disallows it.
Suggested fix:
```diff
- { "type": "header", "key": "user-agent", "value": "(?i).*(GPTBot|ClaudeBot|anthropic-ai|CCBot|Google-Extended|Applebot-Extended|Bytespider|Amazonbot|Meta-ExternalAgent|cohere-ai|Diffbot|ImagesiftBot|Omgilibot|peer39_crawler|YouBot|Timpibot|ICC-Crawler|AhrefsBot|SemrushBot|MJ12bot|DotBot|PetalBot|BLEXBot|MegaIndex|SeznamBot|DataForSeoBot).*" }
+ { "type": "header", "key": "user-agent", "value": "(?i).*(GPTBot|ClaudeBot|anthropic-ai|CCBot|Google-Extended|Applebot-Extended|Bytespider|Amazonbot|FacebookBot|Meta-ExternalAgent|cohere-ai|Diffbot|ImagesiftBot|Omgilibot|peer39_crawler|YouBot|Timpibot|ICC-Crawler|AhrefsBot|SemrushBot|MJ12bot|DotBot|PetalBot|BLEXBot|MegaIndex|SeznamBot|DataForSeoBot).*" }
```
📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||||||||||||||||||||
| ] | ||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟡 FacebookBot missing from middleware BLOCK regex despite being in robots.txt Disallow list
The `robots.txt` at `docs/fumadocs/public/robots.txt:78-79` explicitly disallows `FacebookBot`, but the `BLOCK` regex in the middleware omits it. This means `FacebookBot` will pass through the middleware (falling through to the default `NextResponse.next()` at line 17) and serve content normally, undermining the intended bot-blocking enforcement.
Was this helpful? React with 👍 or 👎 to provide feedback.