Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ Attaches the interceptor to the provided Axios instance.
interface RobotsPluginOptions {
userAgent: string;
crawlDelayCompliance?: CrawlDelayComplianceMode; // default: CrawlDelayComplianceMode.Await
cachingPolicy?: CachingPolicy; // default: Indefinite (caches forever)
}

enum CrawlDelayComplianceMode {
Expand All @@ -92,6 +93,25 @@ enum CrawlDelayComplianceMode {
}
```

### `CachingPolicy`

You can configure how long `robots.txt` is cached.

```typescript
import { CachingPolicyType } from 'axios-robots';

// Option 1: Indefinite Caching (Default)
const indefinite = {
type: CachingPolicyType.Indefinite
};

// Option 2: Time-based Expiration
const timeBased = {
type: CachingPolicyType.ExpireAfter,
duration: '1h' // Supports strings ('5m', '1d', '200ms') or numbers (milliseconds)
};
```

### Error Handling

The interceptor throws a `RobotsError` in the following cases:
Expand Down Expand Up @@ -120,10 +140,10 @@ The interceptor throws a `RobotsError` in the following cases:
- [x] **Standard Directives**: Supports `User-agent`, `Allow`, and `Disallow`.
- [x] **Wildcards**: Supports standard path matching including `*` and `$`.
- [x] **Crawl-delay**: The interceptor enforces `Crawl-delay` directives (automatic throttling) if configured.
- [x] **Cache TTL**: Flexible caching policies (indefinite or expiration-based).

### 🚧 Roadmap
- [ ] **Sitemap**: Does not currently expose or parse `Sitemap` directives for the consumer.
- [ ] **Cache TTL**: Caching is currently indefinite for the lifecycle of the Axios instance.

## Contributing

Expand Down
11 changes: 9 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "axios-robots",
"version": "0.2.0",
"version": "0.3.0",
"description": "A lightweight Axios interceptor that enforces robots.txt compliance for web scrapers and bots",
"main": "dist/index.js",
"types": "dist/index.d.ts",
Expand Down Expand Up @@ -33,14 +33,21 @@
"url": "https://github.com/hyperfluid-tech/axios-robots/issues"
},
"homepage": "https://github.com/hyperfluid-tech/axios-robots#readme",
"peerDependencies": {
"axios": "^1.0.0"
},
"devDependencies": {
"@types/jest": "^30.0.0",
"@types/ms": "^2.1.0",
"@types/node": "^25.0.6",
"axios": "^1.13.2",
"jest": "^30.2.0",
"nock": "^14.0.10",
"robots-parser": "^3.0.1",
"ts-jest": "^29.4.6",
"typescript": "^5.9.3"
},
"dependencies": {
"ms": "^2.1.3",
"robots-parser": "^3.0.1"
}
}
16 changes: 14 additions & 2 deletions src/data/repositories/RobotsDataRepository.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,31 @@ import { HEADER_USER_AGENT, ROBOTS_TXT_FILENAME, ALLOW_ALL_ROBOTS_TXT_CONTENT }
import { RobotsUnreachableError } from '../../errors/RobotsUnreachableError';
import { IRobotsDataRepository } from '../../domain/interfaces/IRobotsDataRepository';
import { CachedRobot } from '../../domain/models/CachedRobot';
import { RobotsPluginOptions } from '../../domain/models/RobotsPluginOptions';
import { CachingPolicy } from '../../domain/models/CachingPolicy';
import { CachingStrategyFactory } from '../../domain/strategies/caching/CachingStrategyFactory';
import { CachingPolicyType } from '../../domain/models/CachingPolicyType';

export class RobotsDataRepository implements IRobotsDataRepository {
private cache: Map<string, CachedRobot> = new Map();
private cachingPolicy: CachingPolicy;
private strategyFactory: CachingStrategyFactory;

constructor(options?: RobotsPluginOptions) {
this.cachingPolicy = options?.cachingPolicy ?? { type: CachingPolicyType.Indefinite };
this.strategyFactory = new CachingStrategyFactory();
}

async getRobot(url: string, userAgent: string = '*'): Promise<CachedRobot> {
const origin = new URL(url).origin;
let cached = this.cache.get(origin);

if (cached)
if (cached && this.strategyFactory.getStrategy(this.cachingPolicy).isValid(cached)) {
return cached;
}

const robot = await this.fetchRobotsTxt(origin, userAgent);
cached = { robot };
cached = { robot, fetchedAt: Date.now() };
this.cache.set(origin, cached);

return cached;
Expand Down
4 changes: 4 additions & 0 deletions src/domain/models/CachedRobot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,8 @@ export interface CachedRobot {
* Timestamp of the last crawl for this domain.
*/
lastCrawled?: number;
/**
* Timestamp of when the robots.txt was fetched.
*/
fetchedAt: number;
}
15 changes: 15 additions & 0 deletions src/domain/models/CachingPolicy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import { CachingPolicyType } from './CachingPolicyType';

/**
 * Discriminated union of supported caching policies, keyed by `type`.
 * Narrow on `type` to access policy-specific fields (e.g. `duration`).
 */
export type CachingPolicy = IndefiniteCachingPolicy | ExpireAfterCachingPolicy;

/**
 * Cache robots.txt data forever (the plugin default): once fetched for an
 * origin, the entry is never considered stale.
 */
export interface IndefiniteCachingPolicy {
  type: CachingPolicyType.Indefinite;
}

/**
 * Cache robots.txt data for a fixed time-to-live; entries older than
 * `duration` are refetched.
 */
export interface ExpireAfterCachingPolicy {
  type: CachingPolicyType.ExpireAfter;
  /**
   * Duration in milliseconds or a string format supported by the 'ms' library (e.g., '1h', '5m').
   */
  duration: string | number;
}
13 changes: 13 additions & 0 deletions src/domain/models/CachingPolicyType.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/**
 * Types of caching policies for robots.txt data.
 * Used as the discriminant of the `CachingPolicy` union.
 */
export enum CachingPolicyType {
  /**
   * Cache robots.txt data indefinitely.
   * NOTE(review): the serialized value is 'indefinitely' (adverb) while the
   * member name is 'Indefinite' — consider aligning these before the string
   * value becomes a public contract.
   */
  Indefinite = 'indefinitely',
  /**
   * Cache robots.txt data for a specific duration
   * (see `ExpireAfterCachingPolicy.duration`).
   */
  ExpireAfter = 'expireAfter'
}
6 changes: 6 additions & 0 deletions src/domain/models/RobotsPluginOptions.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { CrawlDelayComplianceMode } from './CrawlDelayComplianceMode';
import { CachingPolicy } from './CachingPolicy';

export interface RobotsPluginOptions {
/**
Expand All @@ -10,4 +11,9 @@ export interface RobotsPluginOptions {
* Defaults to CrawlDelayComplianceMode.Await
*/
crawlDelayCompliance?: CrawlDelayComplianceMode;
/**
* How to handle caching of robots.txt data.
* Defaults to indefinite caching (entries never expire).
*/
cachingPolicy?: CachingPolicy;
}
2 changes: 1 addition & 1 deletion src/domain/services/CrawlDelayService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode';
import { ICrawlDelayService } from '../interfaces/ICrawlDelayService';
import { IRobotsDataRepository } from '../interfaces/IRobotsDataRepository';
import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase';
import { CrawlDelayStrategyFactory } from '../strategies/CrawlDelayStrategyFactory';
import { CrawlDelayStrategyFactory } from '../strategies/crawl-delay/CrawlDelayStrategyFactory';

export class CrawlDelayService implements ICrawlDelayService {
private calculateWaitTimeUseCase: CalculateWaitTimeUseCase;
Expand Down
18 changes: 18 additions & 0 deletions src/domain/strategies/caching/CachingStrategyFactory.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import { CachingPolicy } from '../../models/CachingPolicy';
import { CachingPolicyType } from '../../models/CachingPolicyType';
import { ICachingStrategy } from './ICachingStrategy';
import { IndefiniteCachingStrategy } from './IndefiniteCachingStrategy';
import { ExpireAfterCachingStrategy } from './ExpireAfterCachingStrategy';

/**
 * Builds the ICachingStrategy implementation that corresponds to a
 * configured CachingPolicy.
 */
export class CachingStrategyFactory {
  /**
   * @param policy The configured caching policy.
   * @returns A strategy instance for the policy. Indefinite — and any
   *   unrecognized policy type — falls back to indefinite caching, which
   *   matches the plugin default.
   */
  getStrategy(policy: CachingPolicy): ICachingStrategy {
    if (policy.type === CachingPolicyType.ExpireAfter) {
      return new ExpireAfterCachingStrategy(policy.duration);
    }
    // CachingPolicyType.Indefinite and any unknown runtime value both
    // cache forever, exactly as the original switch's default did.
    return new IndefiniteCachingStrategy();
  }
}
21 changes: 21 additions & 0 deletions src/domain/strategies/caching/ExpireAfterCachingStrategy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import ms from 'ms';
import { ICachingStrategy } from './ICachingStrategy';
import { CachedRobot } from '../../models/CachedRobot';

/**
 * Caching strategy that treats a robots.txt entry as valid only for a fixed
 * duration after it was fetched.
 */
export class ExpireAfterCachingStrategy implements ICachingStrategy {
  /** Time-to-live for a cache entry, in milliseconds. */
  private readonly durationMs: number;

  /**
   * @param duration TTL as a number of milliseconds, or a string accepted by
   *   the 'ms' library (e.g. '5m', '1h', '2d').
   * @throws Error if a string duration cannot be parsed.
   */
  constructor(duration: string | number) {
    if (typeof duration === 'number') {
      this.durationMs = duration;
      return;
    }
    // 'ms' returns undefined for unparseable input. The previous
    // `ms(duration as any) as unknown as number` cast hid that, leaving
    // durationMs undefined so `fetchedAt + undefined` was NaN and isValid()
    // silently returned false forever (refetch on every request).
    // Validate at construction and fail fast instead.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- ms's StringValue union is stricter than our public `string` option type
    const parsed = ms(duration as any) as number | undefined;
    if (typeof parsed !== 'number' || Number.isNaN(parsed)) {
      throw new Error(`Invalid caching duration: "${duration}"`);
    }
    this.durationMs = parsed;
  }

  /**
   * @param cached The cache entry to check.
   * @returns True while `Date.now()` is before `fetchedAt + duration`;
   *   false once the entry has expired.
   */
  isValid(cached: CachedRobot): boolean {
    return Date.now() < cached.fetchedAt + this.durationMs;
  }
}
13 changes: 13 additions & 0 deletions src/domain/strategies/caching/ICachingStrategy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import { CachedRobot } from '../../models/CachedRobot';

/**
 * Strategy interface for validating cached robots.txt data.
 */
export interface ICachingStrategy {
  /**
   * Determines whether the cached robot data is still valid.
   * @param cached The cached robot data to validate.
   * @returns True if the cache entry may be used as-is; false if it is
   *   stale and the robots.txt should be refetched.
   */
  isValid(cached: CachedRobot): boolean;
}
8 changes: 8 additions & 0 deletions src/domain/strategies/caching/IndefiniteCachingStrategy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import { ICachingStrategy } from './ICachingStrategy';
import { CachedRobot } from '../../models/CachedRobot';

/**
 * Caching strategy that never expires entries: once robots.txt has been
 * fetched for an origin, it is reused for the lifetime of the repository.
 */
export class IndefiniteCachingStrategy implements ICachingStrategy {
  /**
   * @param _cached The cache entry (unused — every entry is always valid;
   *   underscore prefix keeps @typescript-eslint/no-unused-vars quiet).
   * @returns Always true.
   */
  isValid(_cached: CachedRobot): boolean {
    return true;
  }
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

import { ICrawlDelayStrategy } from './ICrawlDelayStrategy';
import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase';
import { CalculateWaitTimeUseCase } from '../../usecases/CalculateWaitTimeUseCase';

export class AwaitCrawlDelayStrategy implements ICrawlDelayStrategy {
constructor(private calculateWaitTimeUseCase: CalculateWaitTimeUseCase) { }
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode';
import { CrawlDelayComplianceMode } from '../../models/CrawlDelayComplianceMode';
import { ICrawlDelayStrategy } from './ICrawlDelayStrategy';
import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase';
import { CalculateWaitTimeUseCase } from '../../usecases/CalculateWaitTimeUseCase';
import { AwaitCrawlDelayStrategy } from './AwaitCrawlDelayStrategy';
import { FailureCrawlDelayStrategy } from './FailureCrawlDelayStrategy';
import { IgnoreCrawlDelayStrategy } from './IgnoreCrawlDelayStrategy';
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

import { ICrawlDelayStrategy } from './ICrawlDelayStrategy';
import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase';
import { CrawlDelayError } from '../../errors/CrawlDelayError';
import { CalculateWaitTimeUseCase } from '../../usecases/CalculateWaitTimeUseCase';
import { CrawlDelayError } from '../../../errors/CrawlDelayError';

export class FailureCrawlDelayStrategy implements ICrawlDelayStrategy {
constructor(private calculateWaitTimeUseCase: CalculateWaitTimeUseCase) { }
Expand Down
6 changes: 6 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ export * from './domain/interfaces/IRobotsDataRepository';
export * from './domain/interfaces/IAllowService';
export * from './domain/interfaces/ICrawlDelayService';

export * from './domain/models/CachingPolicy';
export * from './domain/models/CachingPolicyType';
export * from './domain/strategies/caching/ICachingStrategy';
export * from './domain/strategies/caching/IndefiniteCachingStrategy';
export * from './domain/strategies/caching/ExpireAfterCachingStrategy';
export * from './domain/strategies/caching/CachingStrategyFactory';
/**
* Apply the robots exclusion protocol interceptor to an Axios instance.
* @param axiosInstance The axios instance to apply the interceptor to
Expand Down
2 changes: 1 addition & 1 deletion src/interceptor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ export class RobotsInterceptor {
this.userAgent = options.userAgent;
this.crawlDelayCompliance = options.crawlDelayCompliance ?? CrawlDelayComplianceMode.Await;

this.dataService = deps?.dataService ?? new RobotsDataRepository();
this.dataService = deps?.dataService ?? new RobotsDataRepository(options);
this.allowService = deps?.allowService ?? new AllowService(this.dataService);
this.crawlDelayService = deps?.crawlDelayService ?? new CrawlDelayService(this.dataService);
}
Expand Down
Loading