
Commit c060fca

Crawl-delay compliance (#1)
* Add support for crawl delay
* Add crawl delay mode
* Add new compliance mode; separate errors
* Split RobotsService into smaller services
* Move services
* Add crawl delay strategies
* Move files around
* Remove redundant comments
* Rename service to repository
* Bump version
* Refactor tests
* Add unit tests
1 parent 9d75276 commit c060fca

36 files changed

Lines changed: 1001 additions & 103 deletions

README.md

Lines changed: 10 additions & 2 deletions
@@ -12,6 +12,7 @@ Ensures your bot plays by the rules defined by website owners, preventing unauth
 ## Features
 
 - **🚀 Automated Compliance**: Validates every request against `robots.txt` rules (cached per origin).
+- **⏱️ Crawl-Delay**: Option to automatically wait before requests if `Crawl-delay` is specified.
 - **🛡️ Strict Mode**: invalid URLs, non-HTTP/S protocols, or unreachable `robots.txt` files (non-4xx error) block requests by default.
 - **✨ Clean Architecture**: built with maintainability and separation of concerns in mind.
 - **🔌 Plug-and-Play**: easily attaches to any Axios instance.
@@ -43,7 +44,7 @@ const client = axios.create();
 
 // Apply the interceptor
 applyRobotsInterceptor(client, {
-  userAgent: 'MyCoolBot/1.0'
+  userAgent: 'MyCoolBot/1.0',
 });
 
 async function crawl() {
@@ -81,6 +82,13 @@ Attaches the interceptor to the provided Axios instance.
 ```typescript
 interface RobotsPluginOptions {
   userAgent: string;
+  crawlDelayCompliance?: CrawlDelayComplianceMode; // default: CrawlDelayComplianceMode.Await
+}
+
+enum CrawlDelayComplianceMode {
+  Await = 'await',     // Respects delay by waiting
+  Ignore = 'ignore',   // Ignores delay
+  Failure = 'failure'  // Throws Error if delay is not met
 }
 ```
 
@@ -111,9 +119,9 @@ The interceptor throws a `RobotsError` in the following cases:
 - [x] **[RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html) Compliance**: Full support for the standard Robots Exclusion Protocol.
 - [x] **Standard Directives**: Supports `User-agent`, `Allow`, and `Disallow`.
 - [x] **Wildcards**: Supports standard path matching including `*` and `$`.
+- [x] **Crawl-delay**: The interceptor enforces `Crawl-delay` directives (automatic throttling) if configured.
 
 ### 🚧 Missing / TODO
-- [ ] **Crawl-delay**: The interceptor currently does **not** enforce `Crawl-delay` directives (automatic throttling).
 - [ ] **Sitemap**: Does not currently expose or parse `Sitemap` directives for the consumer.
 - [ ] **Cache TTL**: Caching is currently indefinite for the lifecycle of the Axios instance.
 
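Putting the two README snippets above together, opting into the new crawl-delay handling from consumer code might look like the following sketch. Whether `CrawlDelayComplianceMode` is re-exported from the package root is an assumption not confirmed by the hunks shown here.

```typescript
import axios from 'axios';
// Assumption: applyRobotsInterceptor and CrawlDelayComplianceMode are exported from the package entry point.
import { applyRobotsInterceptor, CrawlDelayComplianceMode } from 'axios-robots';

const client = axios.create();

applyRobotsInterceptor(client, {
  userAgent: 'MyCoolBot/1.0',
  // Failure mode throws instead of waiting when a request would arrive before Crawl-delay has elapsed.
  crawlDelayCompliance: CrawlDelayComplianceMode.Failure,
});
```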

package.json

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 {
   "name": "axios-robots",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "description": "A lightweight Axios interceptor that enforces robots.txt compliance for web scrapers and bots",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
import robotsParser, { Robot } from 'robots-parser';
import axios from 'axios';
import { HEADER_USER_AGENT, ROBOTS_TXT_FILENAME, ALLOW_ALL_ROBOTS_TXT_CONTENT } from '../../constants';
import { RobotsUnreachableError } from '../../errors/RobotsUnreachableError';
import { IRobotsDataRepository } from '../../domain/interfaces/IRobotsDataRepository';
import { CachedRobot } from '../../domain/models/CachedRobot';

export class RobotsDataRepository implements IRobotsDataRepository {
  private cache: Map<string, CachedRobot> = new Map();

  async getRobot(url: string, userAgent: string = '*'): Promise<CachedRobot> {
    const origin = new URL(url).origin;
    let cached = this.cache.get(origin);

    if (cached)
      return cached;

    const robot = await this.fetchRobotsTxt(origin, userAgent);
    cached = { robot };
    this.cache.set(origin, cached);

    return cached;
  }

  setLastCrawled(url: string, timestamp: number): void {
    const origin = new URL(url).origin;
    const cached = this.cache.get(origin);
    if (cached) {
      cached.lastCrawled = timestamp;
    }
  }

  private async fetchRobotsTxt(origin: string, userAgent: string): Promise<Robot> {
    const robotsUrl = `${origin}/${ROBOTS_TXT_FILENAME}`;

    const internalClient = axios.create({
      headers: {
        [HEADER_USER_AGENT]: userAgent,
      }
    });

    try {
      const response = await internalClient.get(robotsUrl);
      return robotsParser(robotsUrl, response.data);
    } catch (error: any) {
      if (this.isUnavailable(error)) {
        return robotsParser(robotsUrl, ALLOW_ALL_ROBOTS_TXT_CONTENT);
      }

      throw new RobotsUnreachableError(error.message);
    }
  }

  private isUnavailable(error: any): boolean {
    return error.response && error.response.status >= 400 && error.response.status < 500;
  }
}
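A brief, hypothetical usage sketch of the repository above (not part of the commit): entries are keyed by origin, so repeated lookups for the same host reuse the first robots.txt fetch, and `setLastCrawled` records the timestamp that crawl-delay checks compare against. The URL and user agent are placeholders.

```typescript
// Hypothetical usage; pageUrl and the user agent are placeholders.
async function example(): Promise<void> {
  const repo = new RobotsDataRepository();
  const pageUrl = 'https://example.com/some/page';

  // First call fetches the origin's robots.txt; later calls for the same origin hit the cache.
  const { robot } = await repo.getRobot(pageUrl, 'MyCoolBot/1.0');

  if (robot.isAllowed(pageUrl, 'MyCoolBot/1.0')) {
    // Record when we actually crawled, so Crawl-delay checks have a reference point.
    repo.setLastCrawled(pageUrl, Date.now());
  }
}
```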

src/domain/RobotsService.ts

Lines changed: 0 additions & 52 deletions
This file was deleted.
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
/**
 * Service for checking if a URL is allowed to be crawled according to robots.txt rules.
 */
export interface IAllowService {
  /**
   * Checks if the given URL is allowed for the specified user agent.
   * @param url The URL to check.
   * @param userAgent The user agent to check against.
   * @returns A promise resolving to true if allowed, false otherwise.
   */
  isAllowed(url: string, userAgent?: string): Promise<boolean>;
}
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode';

/**
 * Service for handling Crawl-delay directives from robots.txt.
 */
export interface ICrawlDelayService {
  /**
   * Enforces the crawl delay for a given URL based on the compliance mode.
   * @param url The URL about to be requested.
   * @param userAgent The user agent to check rules for.
   * @param complianceMode The mode determining how to handle the delay (Await, Ignore, Failure).
   * @returns A promise that resolves when it is safe to proceed (or throws if in Failure mode).
   */
  handleCrawlDelay(url: string, userAgent: string, complianceMode: CrawlDelayComplianceMode): Promise<void>;
}
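For orientation, here is a minimal sketch of how one strategy implementing this interface could behave, assuming it is backed by the `IRobotsDataRepository` shown below and robots-parser's `getCrawlDelay()`. The class name, file locations, and wiring are illustrative assumptions, not the commit's actual strategy classes.

```typescript
import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode';
import { IRobotsDataRepository } from './IRobotsDataRepository';
import { ICrawlDelayService } from './ICrawlDelayService';

// Illustrative sketch only; names and import paths are assumptions.
export class SimpleCrawlDelayService implements ICrawlDelayService {
  constructor(private readonly repository: IRobotsDataRepository) {}

  async handleCrawlDelay(url: string, userAgent: string, complianceMode: CrawlDelayComplianceMode): Promise<void> {
    if (complianceMode === CrawlDelayComplianceMode.Ignore) return;

    const cached = await this.repository.getRobot(url, userAgent);
    const delaySeconds = cached.robot.getCrawlDelay(userAgent) ?? 0;
    if (delaySeconds <= 0 || cached.lastCrawled === undefined) return;

    const waitMs = cached.lastCrawled + delaySeconds * 1000 - Date.now();
    if (waitMs <= 0) return;

    if (complianceMode === CrawlDelayComplianceMode.Failure) {
      // Failure mode: the delay window has not elapsed, so refuse the request.
      throw new Error(`Crawl-delay of ${delaySeconds}s not yet elapsed for ${new URL(url).origin}`);
    }

    // Await mode: sleep until the delay window has passed.
    await new Promise<void>((resolve) => setTimeout(resolve, waitMs));
  }
}
```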
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
import { CachedRobot } from '../models/CachedRobot';

/**
 * Repository for managing robots.txt data and crawl timestamps independently of the protocol logic.
 */
export interface IRobotsDataRepository {
  /**
   * Retrieves the cached robot instance for a given URL.
   * @param url The URL to get the robot for (used to extract the domain/origin).
   * @param userAgent Optional user agent to use for fetching robots.txt if not cached.
   * @returns A promise resolving to the CachedRobot containing the parsed rules.
   */
  getRobot(url: string, userAgent?: string): Promise<CachedRobot>;

  /**
   * Updates the last crawled timestamp for the domain associated with the URL.
   * @param url The URL identifying the domain.
   * @param timestamp The timestamp to set.
   */
  setLastCrawled(url: string, timestamp: number): void;
}

src/domain/models/CachedRobot.ts

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
import { Robot } from 'robots-parser';

export interface CachedRobot {
  /**
   * The parsed robots.txt object.
   */
  robot: Robot;
  /**
   * Timestamp of the last crawl for this domain.
   */
  lastCrawled?: number;
}
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
/**
 * Modes for handling the Crawl-delay directive from robots.txt.
 */
export enum CrawlDelayComplianceMode {
  /**
   * Respects the Crawl-delay directive by waiting before making the request.
   */
  Await = 'await',
  /**
   * Ignores the Crawl-delay directive.
   */
  Ignore = 'ignore',
  /**
   * Throws an error if the request violates the Crawl-delay.
   */
  Failure = 'failure'
}
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
import { CrawlDelayComplianceMode } from './CrawlDelayComplianceMode';

export interface RobotsPluginOptions {
  /**
   * The User-Agent string to use when checking robots.txt rules.
   */
  userAgent: string;
  /**
   * How to handle Crawl-delay directives.
   * Defaults to CrawlDelayComplianceMode.Await.
   */
  crawlDelayCompliance?: CrawlDelayComplianceMode;
}
