Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,12 @@ const timeBased = {
type: CachingPolicyType.ExpireAfter,
duration: '1h' // Supports strings ('5m', '1d', '200ms') or numbers (milliseconds)
};

// Option 3: Request-based Expiration
const requestBased = {
type: CachingPolicyType.RequestCount,
maxRequests: 10 // Expire after 10 requests
};
```

### Error Handling
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "axios-robots",
"version": "0.3.0",
"version": "0.4.0",
"description": "A lightweight Axios interceptor that enforces robots.txt compliance for web scrapers and bots",
"main": "dist/index.js",
"types": "dist/index.d.ts",
Expand Down
37 changes: 33 additions & 4 deletions src/data/repositories/RobotsDataRepository.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,52 @@ export class RobotsDataRepository implements IRobotsDataRepository {
const origin = new URL(url).origin;
let cached = this.cache.get(origin);

if (cached && this.strategyFactory.getStrategy(this.cachingPolicy).isValid(cached)) {
const strategy = this.strategyFactory.getStrategy(this.cachingPolicy);
const isValid = cached && strategy.isValid(cached);

if (cached && isValid) {
return cached;
}

const robot = await this.fetchRobotsTxt(origin, userAgent);
cached = { robot, fetchedAt: Date.now() };
const previousLastCrawled = cached?.lastCrawled;

cached = {
robot,
fetchedAt: Date.now(),
usageCount: 0
};

if (previousLastCrawled) {
cached.lastCrawled = previousLastCrawled;
}

this.cache.set(origin, cached);

return cached;
}

getCachedRobot(url: string): CachedRobot | undefined {
  // Plain cache lookup keyed by the URL's origin. Unlike getRobot, this
  // never fetches robots.txt and never consults the caching strategy, so
  // callers may receive an expired entry (or undefined when none exists).
  const { origin } = new URL(url);
  return this.cache.get(origin);
}

incrementUsage(url: string): void {
  // Bump the access counter on the cached entry for this URL's origin.
  // The counter feeds request-count based cache expiry; entries created
  // before the counter existed are treated as having zero prior uses.
  const entry = this.cache.get(new URL(url).origin);
  if (entry) {
    entry.usageCount = (entry.usageCount || 0) + 1;
  }
}

setLastCrawled(url: string, timestamp: number): void {
const origin = new URL(url).origin;
const cached = this.cache.get(origin);
if (cached) {
cached.lastCrawled = timestamp;
if (!cached) {
return;
}
cached.lastCrawled = timestamp;
}

private async fetchRobotsTxt(origin: string, userAgent: string): Promise<Robot> {
Expand Down
12 changes: 12 additions & 0 deletions src/domain/interfaces/IRobotsDataRepository.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,18 @@ export interface IRobotsDataRepository {
*/
getRobot(url: string, userAgent?: string): Promise<CachedRobot>;

/**
* Retrieves the robot from cache if available, without fetching or validating against strategy.
* @param url The URL to retrieve the robot for.
*/
getCachedRobot(url: string): CachedRobot | undefined;

/**
* Increments the usage count for the cached robot associated with the URL.
* @param url The URL identifying the domain.
*/
incrementUsage(url: string): void;

/**
* Updates the last crawled timestamp for the domain associated with the URL.
* @param url The URL identifying the domain.
Expand Down
4 changes: 4 additions & 0 deletions src/domain/models/CachedRobot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,8 @@ export interface CachedRobot {
* Timestamp of when the robots.txt was fetched.
*/
fetchedAt: number;
/**
* Number of times this cached robot has been accessed.
*/
usageCount?: number;
}
11 changes: 10 additions & 1 deletion src/domain/models/CachingPolicy.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { CachingPolicyType } from './CachingPolicyType';

export type CachingPolicy = IndefiniteCachingPolicy | ExpireAfterCachingPolicy;
export type CachingPolicy = IndefiniteCachingPolicy | ExpireAfterCachingPolicy | RequestCountCachingPolicy;

export interface IndefiniteCachingPolicy {
type: CachingPolicyType.Indefinite;
Expand All @@ -13,3 +13,12 @@ export interface ExpireAfterCachingPolicy {
*/
duration: string | number;
}

/**
 * Caching policy that expires a cached robots.txt entry after it has been
 * used for a fixed number of requests (see RequestCountCachingStrategy,
 * which compares the entry's usage count against maxRequests).
 */
export interface RequestCountCachingPolicy {
// Discriminant tag for the CachingPolicy union.
type: CachingPolicyType.RequestCount;
/**
* Maximum number of requests before the cache expires.
*/
maxRequests: number;
}

6 changes: 5 additions & 1 deletion src/domain/models/CachingPolicyType.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,9 @@ export enum CachingPolicyType {
/**
* Cache robots.txt data for a specific duration.
*/
ExpireAfter = 'expireAfter'
ExpireAfter = 'expireAfter',
/**
* Cache robots.txt data for a specific number of requests.
*/
RequestCount = 'requestCount'
}
1 change: 1 addition & 0 deletions src/domain/services/AllowService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export class AllowService implements IAllowService {

async isAllowed(url: string, userAgent: string = '*'): Promise<boolean> {
const robot = await this.dataService.getRobot(url, userAgent);
this.dataService.incrementUsage(url);

if (!robot || !robot.robot) {
return true;
Expand Down
3 changes: 3 additions & 0 deletions src/domain/strategies/caching/CachingStrategyFactory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { CachingPolicyType } from '../../models/CachingPolicyType';
import { ICachingStrategy } from './ICachingStrategy';
import { IndefiniteCachingStrategy } from './IndefiniteCachingStrategy';
import { ExpireAfterCachingStrategy } from './ExpireAfterCachingStrategy';
import { RequestCountCachingStrategy } from './RequestCountCachingStrategy';

export class CachingStrategyFactory {
getStrategy(policy: CachingPolicy): ICachingStrategy {
Expand All @@ -11,6 +12,8 @@ export class CachingStrategyFactory {
return new IndefiniteCachingStrategy();
case CachingPolicyType.ExpireAfter:
return new ExpireAfterCachingStrategy(policy.duration);
case CachingPolicyType.RequestCount:
return new RequestCountCachingStrategy(policy.maxRequests);
default:
return new IndefiniteCachingStrategy();
}
Expand Down
14 changes: 14 additions & 0 deletions src/domain/strategies/caching/RequestCountCachingStrategy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import { ICachingStrategy } from './ICachingStrategy';
import { CachedRobot } from '../../models/CachedRobot';

/**
 * Caching strategy that keeps a cached robots.txt entry valid only while
 * its recorded usage count stays below a configured request budget.
 */
export class RequestCountCachingStrategy implements ICachingStrategy {
  // Request budget: once usageCount reaches this value the entry expires.
  private readonly maxRequests: number;

  constructor(maxRequests: number) {
    this.maxRequests = maxRequests;
  }

  /**
   * @param cached The cache entry under evaluation.
   * @returns true while the entry's usage count is below the budget;
   * an entry with no usage count recorded is treated as unused.
   */
  isValid(cached: CachedRobot): boolean {
    const uses = cached.usageCount || 0;
    return uses < this.maxRequests;
  }
}
6 changes: 5 additions & 1 deletion src/domain/usecases/CalculateWaitTimeUseCase.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ export class CalculateWaitTimeUseCase {
constructor(private dataService: IRobotsDataRepository) { }

async execute(url: string, userAgent: string): Promise<{ waitTime: number; delay: number; }> {
const cachedRobot = await this.dataService.getRobot(url, userAgent);
let cachedRobot = this.dataService.getCachedRobot(url);

if (!cachedRobot) {
cachedRobot = await this.dataService.getRobot(url, userAgent);
}

if (!cachedRobot || !cachedRobot.robot) {
return { waitTime: 0, delay: 0 };
Expand Down
4 changes: 2 additions & 2 deletions src/interceptor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ export class RobotsInterceptor {
* Intercepts Axios requests to enforce the Robots Exclusion Protocol.
*/
public async intercept(config: InternalAxiosRequestConfig): Promise<InternalAxiosRequestConfig> {
if (!config.url) {
if (!config.url && !config.baseURL) {
return config;
}

Expand Down Expand Up @@ -108,7 +108,7 @@ export class RobotsInterceptor {
return new URL(config.url || '', config.baseURL);
}

return new URL(config.url || '');
return new URL(config.url as string);
} catch (e: any) {
throw new InvalidUrlError(e.message);
}
Expand Down
63 changes: 63 additions & 0 deletions tests/integration/caching.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,69 @@ describe('Caching Policy Integration', () => {

await client.get(`${DOMAIN}/second`);

expect(robotsScope.isDone()).toBe(true);
});
// Expiry side of the requestCount policy: with maxRequests = 2 the cached
// robots.txt becomes stale on the third request, so nock's .times(2)
// expectation is only satisfied if robots.txt is fetched twice.
test(`
GIVEN a requestCount caching policy of 2 requests
WHEN a third request is made
THEN robots.txt should be fetched again
`, async () => {
const initialTime = 1672531200000;
// Freeze Date.now so time-based caching logic cannot interfere with the
// request-count behaviour under test.
jest.spyOn(Date, 'now').mockReturnValue(initialTime);

client = axios.create();
applyRobotsInterceptor(client, {
userAgent: USER_AGENT,
cachingPolicy: {
type: CachingPolicyType.RequestCount,
maxRequests: 2
}
});

// Expect exactly two robots.txt fetches across the three page requests.
const robotsScope = nock(DOMAIN)
.get('/robots.txt')
.times(2)
.reply(200, `User-agent: *\nAllow: /`);

nock(DOMAIN).get('/first').reply(200, 'OK');
nock(DOMAIN).get('/second').reply(200, 'OK');
nock(DOMAIN).get('/third').reply(200, 'OK');

await client.get(`${DOMAIN}/first`);
await client.get(`${DOMAIN}/second`);
await client.get(`${DOMAIN}/third`);

// isDone() is true only when every expected intercept was consumed.
expect(robotsScope.isDone()).toBe(true);
});

// Reuse side of the requestCount policy: two requests fit within a budget
// of 2, so the single-use robots.txt mock must cover both page requests.
test(`
GIVEN a requestCount caching policy of 2 requests
WHEN a second request is made
THEN robots.txt should NOT be fetched again
`, async () => {
const initialTime = 1672531200000;
// Freeze Date.now so time-based caching logic cannot interfere with the
// request-count behaviour under test.
jest.spyOn(Date, 'now').mockReturnValue(initialTime);

client = axios.create();
applyRobotsInterceptor(client, {
userAgent: USER_AGENT,
cachingPolicy: {
type: CachingPolicyType.RequestCount,
maxRequests: 2
}
});

// Exactly one robots.txt fetch is expected for both page requests.
const robotsScope = nock(DOMAIN)
.get('/robots.txt')
.times(1)
.reply(200, `User-agent: *\nAllow: /`);

nock(DOMAIN).get('/first').reply(200, 'OK');
nock(DOMAIN).get('/second').reply(200, 'OK');

await client.get(`${DOMAIN}/first`);
await client.get(`${DOMAIN}/second`);

// isDone() is true only when every expected intercept was consumed.
expect(robotsScope.isDone()).toBe(true);
});
});
70 changes: 70 additions & 0 deletions tests/integration/interceptor.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,76 @@ THEN it should send the configured User-Agent header

expect(response.status).toBe(200);
});

// baseURL + relative URL resolution: the interceptor must combine the two
// before checking robots.txt.
test(`
GIVEN a baseURL and a relative URL
WHEN a request is made
THEN it should resolve the full URL correctly
`, async () => {
nock(DOMAIN)
.get('/robots.txt')
.reply(200, `User-agent: *\nAllow: /relative`);

nock(DOMAIN)
.get('/relative')
.reply(200, 'OK');

client.defaults.baseURL = DOMAIN;
const response = await client.get('/relative');

expect(response.status).toBe(200);
});

test(`
GIVEN a request with no URL
WHEN the interceptor runs
THEN it should return the config as-is
`, async () => {
// NOTE(review): reaches into axios internals (interceptors.…handlers) to
// invoke the interceptor directly — brittle across axios versions.
const interceptor = (client.interceptors.request as any).handlers[0].fulfilled;
const config = { headers: {} };
const result = await interceptor(config);
expect(result).toBe(config);
});

test(`
GIVEN a response with no config
WHEN the response interceptor runs
THEN it should return the response as-is
`, () => {
// Same direct-invocation technique as above, on the response side.
const interceptor = (client.interceptors.response as any).handlers[0].fulfilled;
const response = { data: 'ok' };
const result = interceptor(response);
expect(result).toBe(response);
});

test(`
GIVEN a config with no headers
WHEN the interceptor runs
THEN it should not throw and should proceed
`, async () => {
const interceptor = (client.interceptors.request as any).handlers[0].fulfilled;
const config = { url: 'https://example.com' }; // No headers
const result = await interceptor(config);
// Should not set header, but return config
expect(result).toBe(config);
expect(config).not.toHaveProperty('headers');
});

// Covers the `!config.url && !config.baseURL` guard change: an empty url
// with a baseURL must still trigger a robots.txt lookup against baseURL.
test(`
GIVEN a baseURL and an empty URL
WHEN a request is made
THEN it should resolve using just the baseURL
`, async () => {
const interceptor = (client.interceptors.request as any).handlers[0].fulfilled;
const config = { baseURL: DOMAIN, url: '' };

// Setup mock for the robots.txt request that the interceptor will trigger
const scope = nock(DOMAIN).get('/robots.txt').reply(200, 'User-agent: *\nAllow: /');

await interceptor(config);

expect(scope.isDone()).toBe(true);
});
});

describe('Caching', () => {
Expand Down
Loading