Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Ensures your bot plays by the rules defined by website owners, preventing unauth
## Features

- **🚀 Automated Compliance**: Validates every request against `robots.txt` rules (cached per origin).
- **⏱️ Crawl-Delay**: Option to automatically wait before requests if `Crawl-delay` is specified.
- **🛡️ Strict Mode**: Blocks requests by default for invalid URLs, non-HTTP/S protocols, or unreachable `robots.txt` files (non-4xx errors).
- **✨ Clean Architecture**: built with maintainability and separation of concerns in mind.
- **🔌 Plug-and-Play**: easily attaches to any Axios instance.
Expand Down Expand Up @@ -43,7 +44,7 @@ const client = axios.create();

// Apply the interceptor
applyRobotsInterceptor(client, {
userAgent: 'MyCoolBot/1.0'
userAgent: 'MyCoolBot/1.0',
});

async function crawl() {
Expand Down Expand Up @@ -81,6 +82,13 @@ Attaches the interceptor to the provided Axios instance.
```typescript
interface RobotsPluginOptions {
userAgent: string;
crawlDelayCompliance?: CrawlDelayComplianceMode; // default: CrawlDelayComplianceMode.Await
}

enum CrawlDelayComplianceMode {
Await = 'await', // Respects delay by waiting
Ignore = 'ignore', // Ignores delay
Failure = 'failure' // Throws CrawlDelayError if delay is not met
}
```

Expand Down Expand Up @@ -111,9 +119,9 @@ The interceptor throws a `RobotsError` in the following cases:
- [x] **[RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html) Compliance**: Full support for the standard Robots Exclusion Protocol.
- [x] **Standard Directives**: Supports `User-agent`, `Allow`, and `Disallow`.
- [x] **Wildcards**: Supports standard path matching including `*` and `$`.
- [x] **Crawl-delay**: The interceptor enforces `Crawl-delay` directives (automatic throttling) by default; behavior is configurable via `crawlDelayCompliance`.

### 🚧 Missing / TODO
- [ ] **Crawl-delay**: The interceptor currently does **not** enforce `Crawl-delay` directives (automatic throttling).
- [ ] **Sitemap**: Does not currently expose or parse `Sitemap` directives for the consumer.
- [ ] **Cache TTL**: Caching is currently indefinite for the lifecycle of the Axios instance.

Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "axios-robots",
"version": "0.1.0",
"version": "0.2.0",
"description": "A lightweight Axios interceptor that enforces robots.txt compliance for web scrapers and bots",
"main": "dist/index.js",
"types": "dist/index.d.ts",
Expand Down
57 changes: 57 additions & 0 deletions src/data/repositories/RobotsDataRepository.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import robotsParser, { Robot } from 'robots-parser';
import axios from 'axios';
import { HEADER_USER_AGENT, ROBOTS_TXT_FILENAME, ALLOW_ALL_ROBOTS_TXT_CONTENT } from '../../constants';
import { RobotsUnreachableError } from '../../errors/RobotsUnreachableError';
import { IRobotsDataRepository } from '../../domain/interfaces/IRobotsDataRepository';
import { CachedRobot } from '../../domain/models/CachedRobot';

export class RobotsDataRepository implements IRobotsDataRepository {
private cache: Map<string, CachedRobot> = new Map();

async getRobot(url: string, userAgent: string = '*'): Promise<CachedRobot> {
const origin = new URL(url).origin;
let cached = this.cache.get(origin);

if (cached)
return cached;

const robot = await this.fetchRobotsTxt(origin, userAgent);
cached = { robot };
this.cache.set(origin, cached);

return cached;
}

setLastCrawled(url: string, timestamp: number): void {
const origin = new URL(url).origin;
const cached = this.cache.get(origin);
if (cached) {
cached.lastCrawled = timestamp;
}
}

private async fetchRobotsTxt(origin: string, userAgent: string): Promise<Robot> {
const robotsUrl = `${origin}/${ROBOTS_TXT_FILENAME}`;

const internalClient = axios.create({
headers: {
[HEADER_USER_AGENT]: userAgent,
}
});

try {
const response = await internalClient.get(robotsUrl);
return robotsParser(robotsUrl, response.data);
} catch (error: any) {
if (this.isUnavailable(error)) {
return robotsParser(robotsUrl, ALLOW_ALL_ROBOTS_TXT_CONTENT);
}

throw new RobotsUnreachableError(error.message);
}
}

private isUnavailable(error: any): boolean {
return error.response && error.response.status >= 400 && error.response.status < 500;
}
}
52 changes: 0 additions & 52 deletions src/domain/RobotsService.ts

This file was deleted.

12 changes: 12 additions & 0 deletions src/domain/interfaces/IAllowService.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/**
 * Service for checking if a URL is allowed to be crawled according to
 * robots.txt rules.
 */
export interface IAllowService {
  /**
   * Checks if the given URL is allowed for the specified user agent.
   * @param url The URL to check.
   * @param userAgent The user agent to check against; implementations default
   *        to the wildcard agent '*' when omitted.
   * @returns A promise resolving to true if allowed, false otherwise.
   */
  isAllowed(url: string, userAgent?: string): Promise<boolean>;
}
15 changes: 15 additions & 0 deletions src/domain/interfaces/ICrawlDelayService.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode';

/**
 * Service for enforcing Crawl-delay directives from robots.txt before a
 * request is dispatched.
 */
export interface ICrawlDelayService {
  /**
   * Enforces the crawl delay for a given URL based on the compliance mode.
   * @param url The URL about to be requested.
   * @param userAgent The user agent whose Crawl-delay rule applies.
   * @param complianceMode How to comply: Await (sleep until the delay has
   *        elapsed), Ignore (proceed immediately), or Failure (reject early
   *        requests).
   * @returns A promise that resolves when it is safe to proceed.
   * @throws CrawlDelayError in Failure mode when the delay has not elapsed.
   */
  handleCrawlDelay(url: string, userAgent: string, complianceMode: CrawlDelayComplianceMode): Promise<void>;
}
21 changes: 21 additions & 0 deletions src/domain/interfaces/IRobotsDataRepository.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import { CachedRobot } from '../models/CachedRobot';

/**
 * Repository for managing robots.txt data and crawl timestamps independently
 * of the protocol logic. Implementations cache per origin.
 */
export interface IRobotsDataRepository {
  /**
   * Retrieves the (possibly cached) robot instance for a given URL.
   * @param url The URL whose origin identifies the robots.txt to use.
   * @param userAgent Optional user agent sent when fetching robots.txt on a
   *        cache miss; has no effect when the origin is already cached.
   * @returns A promise resolving to the CachedRobot containing the parsed rules.
   */
  getRobot(url: string, userAgent?: string): Promise<CachedRobot>;

  /**
   * Updates the last crawled timestamp for the domain associated with the URL.
   * @param url The URL identifying the domain (origin).
   * @param timestamp The timestamp to record. NOTE(review): assumed to be
   *        epoch milliseconds — confirm against the caller that supplies it.
   */
  setLastCrawled(url: string, timestamp: number): void;
}
12 changes: 12 additions & 0 deletions src/domain/models/CachedRobot.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import { Robot } from 'robots-parser';

export interface CachedRobot {
  /**
   * The parsed robots.txt rule set for one origin (robots-parser instance).
   */
  robot: Robot;
  /**
   * Timestamp of the last crawl for this domain, recorded via
   * IRobotsDataRepository.setLastCrawled; undefined until the first crawl.
   * NOTE(review): assumed to be epoch milliseconds — confirm with callers.
   */
  lastCrawled?: number;
}
17 changes: 17 additions & 0 deletions src/domain/models/CrawlDelayComplianceMode.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/**
 * Determines how the interceptor reacts to a robots.txt Crawl-delay
 * directive whose window has not yet elapsed for the target domain.
 */
export enum CrawlDelayComplianceMode {
  /**
   * Respects the Crawl-delay directive by waiting before making the request.
   */
  Await = 'await',
  /**
   * Ignores the Crawl-delay directive entirely; requests proceed immediately.
   */
  Ignore = 'ignore',
  /**
   * Rejects with an error if the request would violate the Crawl-delay.
   */
  Failure = 'failure'
}
13 changes: 13 additions & 0 deletions src/domain/models/RobotsPluginOptions.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import { CrawlDelayComplianceMode } from './CrawlDelayComplianceMode';

export interface RobotsPluginOptions {
  /**
   * The User-Agent string used when checking robots.txt rules; it is also
   * sent as the User-Agent header when fetching robots.txt itself.
   */
  userAgent: string;
  /**
   * How to handle Crawl-delay directives before each request.
   * Defaults to CrawlDelayComplianceMode.Await (wait out the delay).
   */
  crawlDelayCompliance?: CrawlDelayComplianceMode;
}
16 changes: 16 additions & 0 deletions src/domain/services/AllowService.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import { IAllowService } from '../interfaces/IAllowService';
import { IRobotsDataRepository } from '../interfaces/IRobotsDataRepository';

export class AllowService implements IAllowService {
  constructor(private dataService: IRobotsDataRepository) { }

  /**
   * Resolves whether `url` may be crawled by `userAgent` according to the
   * cached robots.txt rules. Indeterminate answers default to permissive.
   */
  async isAllowed(url: string, userAgent: string = '*'): Promise<boolean> {
    const cached = await this.dataService.getRobot(url, userAgent);
    const parser = cached?.robot;

    // No parsed rules available -> nothing forbids the request.
    if (!parser) {
      return true;
    }

    // robots-parser yields undefined for URLs it cannot evaluate; treat an
    // indeterminate verdict as allowed.
    const verdict = parser.isAllowed(url, userAgent);
    return verdict ?? true;
  }
}
24 changes: 24 additions & 0 deletions src/domain/services/CrawlDelayService.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode';
import { ICrawlDelayService } from '../interfaces/ICrawlDelayService';
import { IRobotsDataRepository } from '../interfaces/IRobotsDataRepository';
import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase';
import { CrawlDelayStrategyFactory } from '../strategies/CrawlDelayStrategyFactory';

export class CrawlDelayService implements ICrawlDelayService {
  private readonly strategyFactory: CrawlDelayStrategyFactory;
  private readonly calculateWaitTimeUseCase: CalculateWaitTimeUseCase;

  /**
   * Wires the wait-time calculation and strategy selection on top of the
   * shared robots data repository.
   */
  constructor(private dataService: IRobotsDataRepository) {
    this.calculateWaitTimeUseCase = new CalculateWaitTimeUseCase(dataService);
    this.strategyFactory = new CrawlDelayStrategyFactory(this.calculateWaitTimeUseCase);
  }

  /**
   * Applies the Crawl-delay policy selected by `complianceMode` to the
   * upcoming request for `url`, delegating to the matching strategy.
   */
  async handleCrawlDelay(
    url: string,
    userAgent: string,
    complianceMode: CrawlDelayComplianceMode
  ): Promise<void> {
    await this.strategyFactory.getStrategy(complianceMode).execute(url, userAgent);
  }
}
17 changes: 17 additions & 0 deletions src/domain/strategies/AwaitCrawlDelayStrategy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@

import { ICrawlDelayStrategy } from './ICrawlDelayStrategy';
import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase';

export class AwaitCrawlDelayStrategy implements ICrawlDelayStrategy {
  constructor(private calculateWaitTimeUseCase: CalculateWaitTimeUseCase) { }

  /**
   * Sleeps for the remaining crawl-delay window before resolving; resolves
   * immediately when no wait is required.
   */
  async execute(url: string, userAgent: string): Promise<void> {
    const { waitTime } = await this.calculateWaitTimeUseCase.execute(url, userAgent);

    if (waitTime > 0) {
      await this.sleep(waitTime);
    }
  }

  /** Promise-based wrapper around setTimeout. */
  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
22 changes: 22 additions & 0 deletions src/domain/strategies/CrawlDelayStrategyFactory.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@

import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode';
import { ICrawlDelayStrategy } from './ICrawlDelayStrategy';
import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase';
import { AwaitCrawlDelayStrategy } from './AwaitCrawlDelayStrategy';
import { FailureCrawlDelayStrategy } from './FailureCrawlDelayStrategy';
import { IgnoreCrawlDelayStrategy } from './IgnoreCrawlDelayStrategy';

export class CrawlDelayStrategyFactory {
constructor(private calculateWaitTimeUseCase: CalculateWaitTimeUseCase) { }

getStrategy(mode: CrawlDelayComplianceMode): ICrawlDelayStrategy {
switch (mode) {
case CrawlDelayComplianceMode.Failure:
return new FailureCrawlDelayStrategy(this.calculateWaitTimeUseCase);
case CrawlDelayComplianceMode.Ignore:
return new IgnoreCrawlDelayStrategy();
case CrawlDelayComplianceMode.Await:
return new AwaitCrawlDelayStrategy(this.calculateWaitTimeUseCase);
}
}
}
16 changes: 16 additions & 0 deletions src/domain/strategies/FailureCrawlDelayStrategy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@

import { ICrawlDelayStrategy } from './ICrawlDelayStrategy';
import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase';
import { CrawlDelayError } from '../../errors/CrawlDelayError';

export class FailureCrawlDelayStrategy implements ICrawlDelayStrategy {
  constructor(private calculateWaitTimeUseCase: CalculateWaitTimeUseCase) { }

  /**
   * Rejects with a CrawlDelayError when the request arrives before the
   * crawl-delay window has elapsed; resolves silently otherwise.
   */
  async execute(url: string, userAgent: string): Promise<void> {
    const { waitTime, delay } = await this.calculateWaitTimeUseCase.execute(url, userAgent);

    const windowElapsed = waitTime <= 0;
    if (!windowElapsed) {
      throw new CrawlDelayError(delay);
    }
  }
}
12 changes: 12 additions & 0 deletions src/domain/strategies/ICrawlDelayStrategy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

/**
* Strategy for ensuring compliance with Crawl-delay rules.
*/
export interface ICrawlDelayStrategy {
/**
* Executes the strategy for a given URL and user agent.
* @param url The URL about to be crawled.
* @param userAgent The user agent for which to check the rules.
*/
execute(url: string, userAgent: string): Promise<void>;
}
8 changes: 8 additions & 0 deletions src/domain/strategies/IgnoreCrawlDelayStrategy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

import { ICrawlDelayStrategy } from './ICrawlDelayStrategy';

export class IgnoreCrawlDelayStrategy implements ICrawlDelayStrategy {
  /**
   * Deliberate no-op: Ignore mode skips Crawl-delay enforcement entirely,
   * so the request may proceed immediately.
   */
  async execute(url: string, userAgent: string): Promise<void> {
    // Nothing to enforce in Ignore mode.
  }
}
Loading