From 4eb8d3a47a287d3ec3219203929d42236dfdbab5 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:33:50 +0000 Subject: [PATCH 01/12] Add support for crawl delay --- README.md | 7 ++- src/domain/RobotsService.ts | 42 +++++++++++--- src/index.ts | 4 ++ src/interceptor.ts | 51 ++++++++++++++++- src/types.ts | 10 ++++ tests/crawl-delay.test.ts | 107 ++++++++++++++++++++++++++++++++++++ 6 files changed, 211 insertions(+), 10 deletions(-) create mode 100644 tests/crawl-delay.test.ts diff --git a/README.md b/README.md index 9288beb..0c591aa 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ Ensures your bot plays by the rules defined by website owners, preventing unauth ## Features - **🚀 Automated Compliance**: Validates every request against `robots.txt` rules (cached per origin). +- **⏱️ Crawl-Delay**: Option to automatically wait before requests if `Crawl-delay` is specified. - **🛡️ Strict Mode**: invalid URLs, non-HTTP/S protocols, or unreachable `robots.txt` files (non-4xx error) block requests by default. - **✨ Clean Architecture**: built with maintainability and separation of concerns in mind. - **🔌 Plug-and-Play**: easily attaches to any Axios instance. @@ -43,7 +44,8 @@ const client = axios.create(); // Apply the interceptor applyRobotsInterceptor(client, { - userAgent: 'MyCoolBot/1.0' + userAgent: 'MyCoolBot/1.0', + complyWithCrawlDelay: true // default is true }); async function crawl() { @@ -81,6 +83,7 @@ Attaches the interceptor to the provided Axios instance. ```typescript interface RobotsPluginOptions { userAgent: string; + complyWithCrawlDelay?: boolean; // default: true } ``` @@ -113,7 +116,7 @@ The interceptor throws a `RobotsError` in the following cases: - [x] **Wildcards**: Supports standard path matching including `*` and `$`. ### 🚧 Missing / TODO -- [ ] **Crawl-delay**: The interceptor currently does **not** enforce `Crawl-delay` directives (automatic throttling). 
+- [x] **Crawl-delay**: The interceptor enforces `Crawl-delay` directives (automatic throttling) if configured. - [ ] **Sitemap**: Does not currently expose or parse `Sitemap` directives for the consumer. - [ ] **Cache TTL**: Caching is currently indefinite for the lifecycle of the Axios instance. diff --git a/src/domain/RobotsService.ts b/src/domain/RobotsService.ts index fbbeb24..b646251 100644 --- a/src/domain/RobotsService.ts +++ b/src/domain/RobotsService.ts @@ -4,25 +4,53 @@ import { HEADER_USER_AGENT, ROBOTS_TXT_FILENAME, ALLOW_ALL_ROBOTS_TXT_CONTENT } import { RobotsError } from '../errors/RobotsError'; import { ERROR_MESSAGES } from '../errors/messages'; -import { IRobotsService } from '../types'; +import { IRobotsService, CachedRobot } from '../types'; export class RobotsService implements IRobotsService { - private cache: Map = new Map(); + private cache: Map = new Map(); /** * Checks if the given URL is allowed for the specified User-Agent. * Fetching and caching the robots.txt is handled automatically. */ async isAllowed(url: string, userAgent: string = '*'): Promise { - const origin = new URL(url).origin; - let robot = this.cache.get(origin); + const robot = await this.getRobot(url, userAgent); if (!robot) { - robot = await this.fetchRobotsTxt(origin, userAgent); - this.cache.set(origin, robot); + // Should not happen as getRobot handles fetching, but safety check + return true; } - return robot.isAllowed(url, userAgent) ?? true; + return robot.robot.isAllowed(url, userAgent) ?? true; + } + + /** + * Retrieves the cached robot rules for the given URL's origin. + * Fetches from the network if not already cached. 
+ */ + async getRobot(url: string, userAgent: string = '*'): Promise { + const origin = new URL(url).origin; + let cached = this.cache.get(origin); + + if (cached) + return cached; + + const robot = await this.fetchRobotsTxt(origin, userAgent); + cached = { robot }; + this.cache.set(origin, cached); + + return cached; + } + + /** + * Updates the last crawled timestamp for the given URL's origin. + */ + setLastCrawled(url: string, timestamp: number): void { + const origin = new URL(url).origin; + const cached = this.cache.get(origin); + if (cached) { + cached.lastCrawled = timestamp; + } } private async fetchRobotsTxt(origin: string, userAgent: string): Promise { diff --git a/src/index.ts b/src/index.ts index 1752e8d..bfb7494 100644 --- a/src/index.ts +++ b/src/index.ts @@ -15,4 +15,8 @@ export * from './types'; export function applyRobotsInterceptor(axiosInstance: AxiosInstance, options: RobotsPluginOptions): void { const interceptor = new RobotsInterceptor(options); axiosInstance.interceptors.request.use((config) => interceptor.intercept(config)); + axiosInstance.interceptors.response.use( + (response) => interceptor.interceptResponse(response), + (error) => interceptor.interceptResponseError(error) + ); } diff --git a/src/interceptor.ts b/src/interceptor.ts index b2c484e..c26f5a1 100644 --- a/src/interceptor.ts +++ b/src/interceptor.ts @@ -1,4 +1,4 @@ -import { InternalAxiosRequestConfig } from 'axios'; +import { InternalAxiosRequestConfig, AxiosResponse } from 'axios'; import { IRobotsService, RobotsPluginOptions } from './types'; import { RobotsService } from './domain/RobotsService'; import { RobotsError } from './errors/RobotsError'; @@ -8,10 +8,12 @@ import { ERROR_MESSAGES } from './errors/messages'; export class RobotsInterceptor { private robotsService: IRobotsService; private userAgent: string; + private complyWithCrawlDelay: boolean; constructor(options: RobotsPluginOptions, robotsService?: IRobotsService) { this.robotsService = robotsService || new 
RobotsService(); this.userAgent = options.userAgent; + this.complyWithCrawlDelay = options.complyWithCrawlDelay ?? true; } /** @@ -31,6 +33,10 @@ export class RobotsInterceptor { throw new RobotsError(ERROR_MESSAGES.ROBOTS_DENIED(url.toString(), this.userAgent)); } + if (this.complyWithCrawlDelay) { + await this.handleCrawlDelay(url.toString()); + } + if (config.headers) { config.headers.set(HEADER_USER_AGENT, this.userAgent); } @@ -38,6 +44,49 @@ export class RobotsInterceptor { return config; } + /** + * Intercepts Axios responses to update the last crawled timestamp. + */ + public interceptResponse(response: AxiosResponse): AxiosResponse { + if (response && response.config && response.config.url) { + try { + const fullUrl = this.resolveUrl(response.config as InternalAxiosRequestConfig).toString(); + this.robotsService.setLastCrawled(fullUrl, Date.now()); + } catch (_) { + } + } + return response; + } + + /** + * Intercepts Axios response errors to update the last crawled timestamp, + * ensuring we track attempts even if they fail. 
+ */ + public interceptResponseError(error: any): any { + if (error && error.config && error.config.url) { + try { + const fullUrl = this.resolveUrl(error.config as InternalAxiosRequestConfig).toString(); + this.robotsService.setLastCrawled(fullUrl, Date.now()); + } catch (_) { + } + } + return Promise.reject(error); + } + + private async handleCrawlDelay(url: string): Promise { + const robot = await this.robotsService.getRobot(url, this.userAgent); + if (robot && robot.robot) { + const delay = robot.robot.getCrawlDelay(this.userAgent); + if (delay && delay > 0 && robot.lastCrawled) { + const timeSinceLastCrawl = Date.now() - robot.lastCrawled; + const waitTime = (delay * 1000) - timeSinceLastCrawl; + if (waitTime > 0) { + await new Promise(resolve => setTimeout(resolve, waitTime)); + } + } + } + } + private resolveUrl(config: InternalAxiosRequestConfig): URL { try { if (config.url && (config.url.startsWith(PROTOCOL_HTTP) || config.url.startsWith(PROTOCOL_HTTPS))) { diff --git a/src/types.ts b/src/types.ts index 6241cd6..6ef34fe 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,3 +1,5 @@ +import { Robot } from 'robots-parser'; + /** * Options for the Robots Exclusion Protocol plugin. */ @@ -6,6 +8,12 @@ export interface RobotsPluginOptions { * The User-Agent string to use when checking robots.txt rules. 
*/ userAgent: string; + complyWithCrawlDelay?: boolean; +} + +export interface CachedRobot { + robot: Robot; + lastCrawled?: number; } /** @@ -13,4 +21,6 @@ export interface RobotsPluginOptions { */ export interface IRobotsService { isAllowed(url: string, userAgent?: string): Promise; + getRobot(url: string, userAgent?: string): Promise; + setLastCrawled(url: string, timestamp: number): void; } diff --git a/tests/crawl-delay.test.ts b/tests/crawl-delay.test.ts new file mode 100644 index 0000000..2f009cf --- /dev/null +++ b/tests/crawl-delay.test.ts @@ -0,0 +1,107 @@ +import axios from 'axios'; +import nock from 'nock'; +import { applyRobotsInterceptor } from '../src/index'; + +describe('Crawl-delay Compliance', () => { + let client: ReturnType; + const USER_AGENT = 'CrawlBot/1.0'; + const DOMAIN = 'https://crawl-delay.com'; + + beforeEach(() => { + nock.cleanAll(); + client = axios.create(); + jest.useFakeTimers({ + doNotFake: ['nextTick', 'setImmediate'] + }); + }); + + afterEach(() => { + jest.useRealTimers(); + }); + + test.each([ + [1, 1000], + [2, 2000], + [3, 3000] + ])('GIVEN a robots.txt with Crawl-delay: %i WHEN making consecutive requests THEN the second request should wait at least %i ms', async (delaySeconds, expectedDelayMs) => { + applyRobotsInterceptor(client, { userAgent: USER_AGENT, complyWithCrawlDelay: true }); + + nock(DOMAIN) + .get('/robots.txt') + .reply(200, ` + User-agent: * + Crawl-delay: ${delaySeconds} + Allow: / + `); + + nock(DOMAIN).get('/one').reply(200, 'One'); + nock(DOMAIN).get('/two').reply(200, 'Two'); + + await client.get(`${DOMAIN}/one`); + const afterFirst = Date.now(); + + const requestPromise = client.get(`${DOMAIN}/two`); + + jest.advanceTimersByTime(expectedDelayMs); + + await requestPromise; + const end = Date.now(); + + const duration = end - afterFirst; + expect(duration).toBeGreaterThanOrEqual(expectedDelayMs); + }); + + test('GIVEN a request fails WHEN making a subsequent request THEN it should still respect the 
Crawl-delay', async () => { + applyRobotsInterceptor(client, { userAgent: USER_AGENT, complyWithCrawlDelay: true }); + + nock(DOMAIN) + .get('/robots.txt') + .reply(200, ` + User-agent: * + Crawl-delay: 2 + Allow: / + `); + + nock(DOMAIN).get('/fail').reply(500, 'Server Error'); + nock(DOMAIN).get('/success').reply(200, 'Success'); + + try { + await client.get(`${DOMAIN}/fail`); + } catch (e) { + } + const afterFail = Date.now(); + + const requestPromise = client.get(`${DOMAIN}/success`); + + jest.advanceTimersByTime(2000); + + await requestPromise; + const end = Date.now(); + + const duration = end - afterFail; + expect(duration).toBeGreaterThanOrEqual(2000); + }); + + test('GIVEN complyWithCrawlDelay is false WHEN making consecutive requests THEN the second request should NOT wait', async () => { + applyRobotsInterceptor(client, { userAgent: USER_AGENT, complyWithCrawlDelay: false }); + + nock(DOMAIN) + .get('/robots.txt') + .reply(200, ` + User-agent: * + Crawl-delay: 5 + Allow: / + `); + + nock(DOMAIN).get('/one').reply(200, 'One'); + nock(DOMAIN).get('/two').reply(200, 'Two'); + + const start = Date.now(); + await client.get(`${DOMAIN}/one`); + await client.get(`${DOMAIN}/two`); + const end = Date.now(); + + const duration = end - start; + expect(duration).toBeLessThan(1000); + }); +}); From dd0f6dccee8bcd90414e86fc7ef3850d801e4495 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:43:42 +0000 Subject: [PATCH 02/12] Add crawl delay mode --- README.md | 10 ++++-- src/interceptor.ts | 72 +++++++++++++++++++++++---------------- src/types.ts | 17 ++++++++- tests/crawl-delay.test.ts | 31 ++++++++++++----- 4 files changed, 88 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 0c591aa..c1a64c5 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,6 @@ const client = axios.create(); // Apply the interceptor applyRobotsInterceptor(client, { userAgent: 'MyCoolBot/1.0', - 
complyWithCrawlDelay: true // default is true }); async function crawl() { @@ -83,7 +82,12 @@ Attaches the interceptor to the provided Axios instance. ```typescript interface RobotsPluginOptions { userAgent: string; - complyWithCrawlDelay?: boolean; // default: true + crawlDelayCompliance?: CrawlDelayComplianceMode; // default: CrawlDelayComplianceMode.Await +} + +enum CrawlDelayComplianceMode { + Await = 'await', // Respects delay by waiting + Ignore = 'ignore' // Ignores delay } ``` @@ -114,9 +118,9 @@ The interceptor throws a `RobotsError` in the following cases: - [x] **[RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html) Compliance**: Full support for the standard Robots Exclusion Protocol. - [x] **Standard Directives**: Supports `User-agent`, `Allow`, and `Disallow`. - [x] **Wildcards**: Supports standard path matching including `*` and `$`. +- [x] **Crawl-delay**: The interceptor enforces `Crawl-delay` directives (automatic throttling) if configured. ### 🚧 Missing / TODO -- [x] **Crawl-delay**: The interceptor enforces `Crawl-delay` directives (automatic throttling) if configured. - [ ] **Sitemap**: Does not currently expose or parse `Sitemap` directives for the consumer. - [ ] **Cache TTL**: Caching is currently indefinite for the lifecycle of the Axios instance. 
diff --git a/src/interceptor.ts b/src/interceptor.ts index c26f5a1..9524fe9 100644 --- a/src/interceptor.ts +++ b/src/interceptor.ts @@ -1,5 +1,5 @@ import { InternalAxiosRequestConfig, AxiosResponse } from 'axios'; -import { IRobotsService, RobotsPluginOptions } from './types'; +import { IRobotsService, RobotsPluginOptions, CrawlDelayComplianceMode } from './types'; import { RobotsService } from './domain/RobotsService'; import { RobotsError } from './errors/RobotsError'; import { HEADER_USER_AGENT, PROTOCOL_HTTP, PROTOCOL_HTTPS } from './constants'; @@ -8,12 +8,12 @@ import { ERROR_MESSAGES } from './errors/messages'; export class RobotsInterceptor { private robotsService: IRobotsService; private userAgent: string; - private complyWithCrawlDelay: boolean; + private crawlDelayCompliance: CrawlDelayComplianceMode; constructor(options: RobotsPluginOptions, robotsService?: IRobotsService) { this.robotsService = robotsService || new RobotsService(); this.userAgent = options.userAgent; - this.complyWithCrawlDelay = options.complyWithCrawlDelay ?? true; + this.crawlDelayCompliance = options.crawlDelayCompliance ?? CrawlDelayComplianceMode.Await; } /** @@ -33,7 +33,7 @@ export class RobotsInterceptor { throw new RobotsError(ERROR_MESSAGES.ROBOTS_DENIED(url.toString(), this.userAgent)); } - if (this.complyWithCrawlDelay) { + if (this.crawlDelayCompliance === CrawlDelayComplianceMode.Await) { await this.handleCrawlDelay(url.toString()); } @@ -48,13 +48,16 @@ export class RobotsInterceptor { * Intercepts Axios responses to update the last crawled timestamp. 
*/ public interceptResponse(response: AxiosResponse): AxiosResponse { - if (response && response.config && response.config.url) { - try { - const fullUrl = this.resolveUrl(response.config as InternalAxiosRequestConfig).toString(); - this.robotsService.setLastCrawled(fullUrl, Date.now()); - } catch (_) { - } + if (!response || !response.config || !response.config.url) { + return response; } + + try { + const fullUrl = this.resolveUrl(response.config as InternalAxiosRequestConfig).toString(); + this.robotsService.setLastCrawled(fullUrl, Date.now()); + } catch (_) { + } + return response; } @@ -63,28 +66,38 @@ export class RobotsInterceptor { * ensuring we track attempts even if they fail. */ public interceptResponseError(error: any): any { - if (error && error.config && error.config.url) { - try { - const fullUrl = this.resolveUrl(error.config as InternalAxiosRequestConfig).toString(); - this.robotsService.setLastCrawled(fullUrl, Date.now()); - } catch (_) { - } + if (!error || !error.config || !error.config.url) { + return Promise.reject(error); } + + try { + const fullUrl = this.resolveUrl(error.config as InternalAxiosRequestConfig).toString(); + this.robotsService.setLastCrawled(fullUrl, Date.now()); + } catch (_) { + } + return Promise.reject(error); } private async handleCrawlDelay(url: string): Promise { - const robot = await this.robotsService.getRobot(url, this.userAgent); - if (robot && robot.robot) { - const delay = robot.robot.getCrawlDelay(this.userAgent); - if (delay && delay > 0 && robot.lastCrawled) { - const timeSinceLastCrawl = Date.now() - robot.lastCrawled; - const waitTime = (delay * 1000) - timeSinceLastCrawl; - if (waitTime > 0) { - await new Promise(resolve => setTimeout(resolve, waitTime)); - } - } - } + const cachedRobot = await this.robotsService.getRobot(url, this.userAgent); + + if (!cachedRobot || !cachedRobot.robot) + return; + + + const delay = cachedRobot.robot.getCrawlDelay(this.userAgent); + if (!delay || delay <= 0 || 
!cachedRobot.lastCrawled) + return; + + + const timeSinceLastCrawl = Date.now() - cachedRobot.lastCrawled; + const waitTime = (delay * 1000) - timeSinceLastCrawl; + if (waitTime <= 0) + return; + + + await new Promise(resolve => setTimeout(resolve, waitTime)); } private resolveUrl(config: InternalAxiosRequestConfig): URL { @@ -104,8 +117,7 @@ export class RobotsInterceptor { } private validateProtocol(url: URL): void { - if (url.protocol !== PROTOCOL_HTTP && url.protocol !== PROTOCOL_HTTPS) { - throw new RobotsError(ERROR_MESSAGES.INVALID_PROTOCOL(url.protocol)); - } + if (url.protocol === PROTOCOL_HTTP || url.protocol === PROTOCOL_HTTPS) return; + throw new RobotsError(ERROR_MESSAGES.INVALID_PROTOCOL(url.protocol)); } } diff --git a/src/types.ts b/src/types.ts index 6ef34fe..a5ed150 100644 --- a/src/types.ts +++ b/src/types.ts @@ -3,12 +3,27 @@ import { Robot } from 'robots-parser'; /** * Options for the Robots Exclusion Protocol plugin. */ +export enum CrawlDelayComplianceMode { + /** + * Respects the Crawl-delay directive by waiting before making the request. + */ + Await = 'await', + /** + * Ignores the Crawl-delay directive. + */ + Ignore = 'ignore' +} + export interface RobotsPluginOptions { /** * The User-Agent string to use when checking robots.txt rules. */ userAgent: string; - complyWithCrawlDelay?: boolean; + /** + * How to handle Crawl-delay directives. 
+ * Defaults to CrawlDelayComplianceMode.Await + */ + crawlDelayCompliance?: CrawlDelayComplianceMode; } export interface CachedRobot { diff --git a/tests/crawl-delay.test.ts b/tests/crawl-delay.test.ts index 2f009cf..544933a 100644 --- a/tests/crawl-delay.test.ts +++ b/tests/crawl-delay.test.ts @@ -1,6 +1,7 @@ import axios from 'axios'; import nock from 'nock'; import { applyRobotsInterceptor } from '../src/index'; +import { CrawlDelayComplianceMode } from '../src/types'; describe('Crawl-delay Compliance', () => { let client: ReturnType; @@ -10,6 +11,7 @@ describe('Crawl-delay Compliance', () => { beforeEach(() => { nock.cleanAll(); client = axios.create(); + // Exclude nextTick and setImmediate to prevent hanging nock/axios promises jest.useFakeTimers({ doNotFake: ['nextTick', 'setImmediate'] }); @@ -24,7 +26,10 @@ describe('Crawl-delay Compliance', () => { [2, 2000], [3, 3000] ])('GIVEN a robots.txt with Crawl-delay: %i WHEN making consecutive requests THEN the second request should wait at least %i ms', async (delaySeconds, expectedDelayMs) => { - applyRobotsInterceptor(client, { userAgent: USER_AGENT, complyWithCrawlDelay: true }); + applyRobotsInterceptor(client, { + userAgent: USER_AGENT, + crawlDelayCompliance: CrawlDelayComplianceMode.Await + }); nock(DOMAIN) .get('/robots.txt') @@ -41,9 +46,10 @@ describe('Crawl-delay Compliance', () => { const afterFirst = Date.now(); const requestPromise = client.get(`${DOMAIN}/two`); - + + // Fast-forward time to simulate the delay jest.advanceTimersByTime(expectedDelayMs); - + await requestPromise; const end = Date.now(); @@ -52,7 +58,10 @@ describe('Crawl-delay Compliance', () => { }); test('GIVEN a request fails WHEN making a subsequent request THEN it should still respect the Crawl-delay', async () => { - applyRobotsInterceptor(client, { userAgent: USER_AGENT, complyWithCrawlDelay: true }); + applyRobotsInterceptor(client, { + userAgent: USER_AGENT, + crawlDelayCompliance: CrawlDelayComplianceMode.Await + }); 
nock(DOMAIN) .get('/robots.txt') @@ -65,16 +74,19 @@ describe('Crawl-delay Compliance', () => { nock(DOMAIN).get('/fail').reply(500, 'Server Error'); nock(DOMAIN).get('/success').reply(200, 'Success'); + // First request fails try { await client.get(`${DOMAIN}/fail`); } catch (e) { + // Expected error } const afterFail = Date.now(); const requestPromise = client.get(`${DOMAIN}/success`); - + + // Fast-forward time to simulate the delay (2000ms) jest.advanceTimersByTime(2000); - + await requestPromise; const end = Date.now(); @@ -82,8 +94,11 @@ describe('Crawl-delay Compliance', () => { expect(duration).toBeGreaterThanOrEqual(2000); }); - test('GIVEN complyWithCrawlDelay is false WHEN making consecutive requests THEN the second request should NOT wait', async () => { - applyRobotsInterceptor(client, { userAgent: USER_AGENT, complyWithCrawlDelay: false }); + test('GIVEN crawlDelayCompliance is Ignore WHEN making consecutive requests THEN the second request should NOT wait', async () => { + applyRobotsInterceptor(client, { + userAgent: USER_AGENT, + crawlDelayCompliance: CrawlDelayComplianceMode.Ignore + }); nock(DOMAIN) .get('/robots.txt') From 662907cc3c3c18fd5d5652439aba508ea9364e96 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:50:10 +0000 Subject: [PATCH 03/12] Add new compliance mode Separate errors --- README.md | 3 ++- src/domain/RobotsService.ts | 5 ++--- src/errors/CrawlDelayError.ts | 9 +++++++++ src/errors/InvalidProtocolError.ts | 9 +++++++++ src/errors/InvalidUrlError.ts | 9 +++++++++ src/errors/RobotsDeniedError.ts | 9 +++++++++ src/errors/RobotsUnreachableError.ts | 9 +++++++++ src/errors/index.ts | 7 +++++++ src/errors/messages.ts | 1 + src/index.ts | 2 +- src/interceptor.ts | 17 +++++++++++------ src/types.ts | 6 +++++- tests/crawl-delay.test.ts | 24 +++++++++++++++++++++++- 13 files changed, 97 insertions(+), 13 deletions(-) create mode 100644 src/errors/CrawlDelayError.ts create 
mode 100644 src/errors/InvalidProtocolError.ts create mode 100644 src/errors/InvalidUrlError.ts create mode 100644 src/errors/RobotsDeniedError.ts create mode 100644 src/errors/RobotsUnreachableError.ts create mode 100644 src/errors/index.ts diff --git a/README.md b/README.md index c1a64c5..3f0bc20 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,8 @@ interface RobotsPluginOptions { enum CrawlDelayComplianceMode { Await = 'await', // Respects delay by waiting - Ignore = 'ignore' // Ignores delay + Ignore = 'ignore', // Ignores delay + Failure = 'failure' // Throws Error if delay is not met } ``` diff --git a/src/domain/RobotsService.ts b/src/domain/RobotsService.ts index b646251..f48b41f 100644 --- a/src/domain/RobotsService.ts +++ b/src/domain/RobotsService.ts @@ -1,8 +1,7 @@ import robotsParser, { Robot } from 'robots-parser'; import axios from 'axios'; import { HEADER_USER_AGENT, ROBOTS_TXT_FILENAME, ALLOW_ALL_ROBOTS_TXT_CONTENT } from '../constants'; -import { RobotsError } from '../errors/RobotsError'; -import { ERROR_MESSAGES } from '../errors/messages'; +import { RobotsUnreachableError } from '../errors/RobotsUnreachableError'; import { IRobotsService, CachedRobot } from '../types'; @@ -70,7 +69,7 @@ export class RobotsService implements IRobotsService { return robotsParser(robotsUrl, ALLOW_ALL_ROBOTS_TXT_CONTENT); } - throw new RobotsError(ERROR_MESSAGES.ROBOTS_UNREACHABLE(error.message)); + throw new RobotsUnreachableError(error.message); } } diff --git a/src/errors/CrawlDelayError.ts b/src/errors/CrawlDelayError.ts new file mode 100644 index 0000000..331c7b4 --- /dev/null +++ b/src/errors/CrawlDelayError.ts @@ -0,0 +1,9 @@ +import { RobotsError } from './RobotsError'; +import { ERROR_MESSAGES } from './messages'; + +export class CrawlDelayError extends RobotsError { + constructor(delay: number) { + super(ERROR_MESSAGES.ROBOTS_CRAWL_DELAY(delay)); + this.name = 'CrawlDelayError'; + } +} diff --git a/src/errors/InvalidProtocolError.ts 
b/src/errors/InvalidProtocolError.ts new file mode 100644 index 0000000..5f4e9df --- /dev/null +++ b/src/errors/InvalidProtocolError.ts @@ -0,0 +1,9 @@ +import { RobotsError } from './RobotsError'; +import { ERROR_MESSAGES } from './messages'; + +export class InvalidProtocolError extends RobotsError { + constructor(protocol: string) { + super(ERROR_MESSAGES.INVALID_PROTOCOL(protocol)); + this.name = 'InvalidProtocolError'; + } +} diff --git a/src/errors/InvalidUrlError.ts b/src/errors/InvalidUrlError.ts new file mode 100644 index 0000000..c199ddc --- /dev/null +++ b/src/errors/InvalidUrlError.ts @@ -0,0 +1,9 @@ +import { RobotsError } from './RobotsError'; +import { ERROR_MESSAGES } from './messages'; + +export class InvalidUrlError extends RobotsError { + constructor(details: string) { + super(ERROR_MESSAGES.INVALID_URL(details)); + this.name = 'InvalidUrlError'; + } +} diff --git a/src/errors/RobotsDeniedError.ts b/src/errors/RobotsDeniedError.ts new file mode 100644 index 0000000..ebeb7a6 --- /dev/null +++ b/src/errors/RobotsDeniedError.ts @@ -0,0 +1,9 @@ +import { RobotsError } from './RobotsError'; +import { ERROR_MESSAGES } from './messages'; + +export class RobotsDeniedError extends RobotsError { + constructor(url: string, userAgent: string) { + super(ERROR_MESSAGES.ROBOTS_DENIED(url, userAgent)); + this.name = 'RobotsDeniedError'; + } +} diff --git a/src/errors/RobotsUnreachableError.ts b/src/errors/RobotsUnreachableError.ts new file mode 100644 index 0000000..d5b4828 --- /dev/null +++ b/src/errors/RobotsUnreachableError.ts @@ -0,0 +1,9 @@ +import { RobotsError } from './RobotsError'; +import { ERROR_MESSAGES } from './messages'; + +export class RobotsUnreachableError extends RobotsError { + constructor(details: string) { + super(ERROR_MESSAGES.ROBOTS_UNREACHABLE(details)); + this.name = 'RobotsUnreachableError'; + } +} diff --git a/src/errors/index.ts b/src/errors/index.ts new file mode 100644 index 0000000..20547b3 --- /dev/null +++ b/src/errors/index.ts 
@@ -0,0 +1,7 @@ +export * from './RobotsError'; +export * from './CrawlDelayError'; +export * from './InvalidUrlError'; +export * from './InvalidProtocolError'; +export * from './RobotsDeniedError'; +export * from './RobotsUnreachableError'; +export * from './messages'; diff --git a/src/errors/messages.ts b/src/errors/messages.ts index 6bb8c0a..ed2db9e 100644 --- a/src/errors/messages.ts +++ b/src/errors/messages.ts @@ -3,5 +3,6 @@ export const ERROR_MESSAGES = { INVALID_PROTOCOL: (protocol: string) => `Invalid protocol: ${protocol}. Only HTTP/S is supported for robots.txt compliance.`, ROBOTS_DENIED: (url: string, userAgent: string) => `URL ${url} is assumed to be disallowed by robots.txt for User-Agent ${userAgent}`, ROBOTS_UNREACHABLE: (details: string) => `Unable to fetch robots.txt: ${details}`, + ROBOTS_CRAWL_DELAY: (delay: number) => `Request blocked: Crawl-delay of ${delay}s has not been met.`, DEFAULT_BLOCK: 'Request blocked by robots.txt', }; diff --git a/src/index.ts b/src/index.ts index bfb7494..fc6acaf 100644 --- a/src/index.ts +++ b/src/index.ts @@ -3,7 +3,7 @@ import { RobotsInterceptor } from './interceptor'; import { RobotsPluginOptions } from './types'; export * from './domain/RobotsService'; -export * from './errors/RobotsError'; +export * from './errors'; export * from './interceptor'; export * from './types'; diff --git a/src/interceptor.ts b/src/interceptor.ts index 9524fe9..efd9d9a 100644 --- a/src/interceptor.ts +++ b/src/interceptor.ts @@ -1,9 +1,11 @@ import { InternalAxiosRequestConfig, AxiosResponse } from 'axios'; import { IRobotsService, RobotsPluginOptions, CrawlDelayComplianceMode } from './types'; import { RobotsService } from './domain/RobotsService'; -import { RobotsError } from './errors/RobotsError'; +import { CrawlDelayError } from './errors/CrawlDelayError'; +import { InvalidUrlError } from './errors/InvalidUrlError'; +import { InvalidProtocolError } from './errors/InvalidProtocolError'; +import { RobotsDeniedError } from 
'./errors/RobotsDeniedError'; import { HEADER_USER_AGENT, PROTOCOL_HTTP, PROTOCOL_HTTPS } from './constants'; -import { ERROR_MESSAGES } from './errors/messages'; export class RobotsInterceptor { private robotsService: IRobotsService; @@ -30,10 +32,10 @@ export class RobotsInterceptor { const isAllowed = await this.robotsService.isAllowed(url.toString(), this.userAgent); if (!isAllowed) { - throw new RobotsError(ERROR_MESSAGES.ROBOTS_DENIED(url.toString(), this.userAgent)); + throw new RobotsDeniedError(url.toString(), this.userAgent); } - if (this.crawlDelayCompliance === CrawlDelayComplianceMode.Await) { + if (this.crawlDelayCompliance !== CrawlDelayComplianceMode.Ignore) { await this.handleCrawlDelay(url.toString()); } @@ -96,6 +98,9 @@ export class RobotsInterceptor { if (waitTime <= 0) return; + if (this.crawlDelayCompliance === CrawlDelayComplianceMode.Failure) { + throw new CrawlDelayError(delay); + } await new Promise(resolve => setTimeout(resolve, waitTime)); } @@ -112,12 +117,12 @@ export class RobotsInterceptor { return new URL(config.url || ''); } catch (e: any) { - throw new RobotsError(ERROR_MESSAGES.INVALID_URL(e.message)); + throw new InvalidUrlError(e.message); } } private validateProtocol(url: URL): void { if (url.protocol === PROTOCOL_HTTP || url.protocol === PROTOCOL_HTTPS) return; - throw new RobotsError(ERROR_MESSAGES.INVALID_PROTOCOL(url.protocol)); + throw new InvalidProtocolError(url.protocol); } } diff --git a/src/types.ts b/src/types.ts index a5ed150..d74ffd1 100644 --- a/src/types.ts +++ b/src/types.ts @@ -11,7 +11,11 @@ export enum CrawlDelayComplianceMode { /** * Ignores the Crawl-delay directive. */ - Ignore = 'ignore' + Ignore = 'ignore', + /** + * Throws an error if the request violates the Crawl-delay. 
+ */ + Failure = 'failure' } export interface RobotsPluginOptions { diff --git a/tests/crawl-delay.test.ts b/tests/crawl-delay.test.ts index 544933a..ea746a2 100644 --- a/tests/crawl-delay.test.ts +++ b/tests/crawl-delay.test.ts @@ -117,6 +117,28 @@ describe('Crawl-delay Compliance', () => { const end = Date.now(); const duration = end - start; - expect(duration).toBeLessThan(1000); + expect(duration).toBeLessThan(1000); + }); + + test('GIVEN crawlDelayCompliance is Failure WHEN making consecutive requests THEN the second request should throw', async () => { + applyRobotsInterceptor(client, { + userAgent: USER_AGENT, + crawlDelayCompliance: CrawlDelayComplianceMode.Failure + }); + + nock(DOMAIN) + .get('/robots.txt') + .reply(200, ` + User-agent: * + Crawl-delay: 5 + Allow: / + `); + + nock(DOMAIN).get('/one').reply(200, 'One'); + nock(DOMAIN).get('/two').reply(200, 'Two'); + + await client.get(`${DOMAIN}/one`); + + await expect(client.get(`${DOMAIN}/two`)).rejects.toThrow('Crawl-delay of 5s has not been met'); }); }); From 551be1ba49eaeda0bbce284cc071f7054bcc474a Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 14 Jan 2026 19:44:42 +0000 Subject: [PATCH 04/12] split RobotsService into smaller services --- src/domain/AllowService.ts | 19 +++++++ src/domain/CrawlDelayService.ts | 41 +++++++++++++ ...{RobotsService.ts => RobotsDataService.ts} | 22 +------ src/index.ts | 4 +- src/interceptor.ts | 57 ++++++++----------- src/types.ts | 16 ++++-- 6 files changed, 99 insertions(+), 60 deletions(-) create mode 100644 src/domain/AllowService.ts create mode 100644 src/domain/CrawlDelayService.ts rename src/domain/{RobotsService.ts => RobotsDataService.ts} (75%) diff --git a/src/domain/AllowService.ts b/src/domain/AllowService.ts new file mode 100644 index 0000000..2b281ed --- /dev/null +++ b/src/domain/AllowService.ts @@ -0,0 +1,19 @@ +import { IAllowService, IRobotsDataService } from '../types'; + +export class 
AllowService implements IAllowService { + constructor(private dataService: IRobotsDataService) { } + + /** + * Checks if the given URL is allowed for the specified User-Agent. + * Fetching and caching the robots.txt is handled automatically by the data service. + */ + async isAllowed(url: string, userAgent: string = '*'): Promise { + const robot = await this.dataService.getRobot(url, userAgent); + + if (!robot || !robot.robot) { + return true; + } + + return robot.robot.isAllowed(url, userAgent) ?? true; + } +} diff --git a/src/domain/CrawlDelayService.ts b/src/domain/CrawlDelayService.ts new file mode 100644 index 0000000..0388255 --- /dev/null +++ b/src/domain/CrawlDelayService.ts @@ -0,0 +1,41 @@ +import { RobotsDataService } from './RobotsDataService'; +import { CrawlDelayComplianceMode, ICrawlDelayService, IRobotsDataService } from '../types'; +import { CrawlDelayError } from '../errors/CrawlDelayError'; + +export class CrawlDelayService implements ICrawlDelayService { + constructor(private dataService: IRobotsDataService) {} + + /** + * Handles the crawl delay for the given URL and user agent. + * Enforces the delay based on the compliance mode. 
+ */ + async handleCrawlDelay( + url: string, + userAgent: string, + complianceMode: CrawlDelayComplianceMode + ): Promise { + if (complianceMode === CrawlDelayComplianceMode.Ignore) { + return; + } + + const cachedRobot = await this.dataService.getRobot(url, userAgent); + + if (!cachedRobot || !cachedRobot.robot) + return; + + const delay = cachedRobot.robot.getCrawlDelay(userAgent); + if (!delay || delay <= 0 || !cachedRobot.lastCrawled) + return; + + const timeSinceLastCrawl = Date.now() - cachedRobot.lastCrawled; + const waitTime = (delay * 1000) - timeSinceLastCrawl; + if (waitTime <= 0) + return; + + if (complianceMode === CrawlDelayComplianceMode.Failure) { + throw new CrawlDelayError(delay); + } + + await new Promise(resolve => setTimeout(resolve, waitTime)); + } +} diff --git a/src/domain/RobotsService.ts b/src/domain/RobotsDataService.ts similarity index 75% rename from src/domain/RobotsService.ts rename to src/domain/RobotsDataService.ts index f48b41f..349c2b7 100644 --- a/src/domain/RobotsService.ts +++ b/src/domain/RobotsDataService.ts @@ -2,32 +2,16 @@ import robotsParser, { Robot } from 'robots-parser'; import axios from 'axios'; import { HEADER_USER_AGENT, ROBOTS_TXT_FILENAME, ALLOW_ALL_ROBOTS_TXT_CONTENT } from '../constants'; import { RobotsUnreachableError } from '../errors/RobotsUnreachableError'; +import { IRobotsDataService, CachedRobot } from '../types'; -import { IRobotsService, CachedRobot } from '../types'; - -export class RobotsService implements IRobotsService { +export class RobotsDataService implements IRobotsDataService { private cache: Map = new Map(); - /** - * Checks if the given URL is allowed for the specified User-Agent. - * Fetching and caching the robots.txt is handled automatically. 
- */ - async isAllowed(url: string, userAgent: string = '*'): Promise { - const robot = await this.getRobot(url, userAgent); - - if (!robot) { - // Should not happen as getRobot handles fetching, but safety check - return true; - } - - return robot.robot.isAllowed(url, userAgent) ?? true; - } - /** * Retrieves the cached robot rules for the given URL's origin. * Fetches from the network if not already cached. */ - async getRobot(url: string, userAgent: string = '*'): Promise { + async getRobot(url: string, userAgent: string = '*'): Promise { const origin = new URL(url).origin; let cached = this.cache.get(origin); diff --git a/src/index.ts b/src/index.ts index fc6acaf..e6d082e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,7 +2,9 @@ import { AxiosInstance } from 'axios'; import { RobotsInterceptor } from './interceptor'; import { RobotsPluginOptions } from './types'; -export * from './domain/RobotsService'; +export * from './domain/RobotsDataService'; +export * from './domain/AllowService'; +export * from './domain/CrawlDelayService'; export * from './errors'; export * from './interceptor'; export * from './types'; diff --git a/src/interceptor.ts b/src/interceptor.ts index efd9d9a..b5ff4c2 100644 --- a/src/interceptor.ts +++ b/src/interceptor.ts @@ -1,21 +1,34 @@ import { InternalAxiosRequestConfig, AxiosResponse } from 'axios'; -import { IRobotsService, RobotsPluginOptions, CrawlDelayComplianceMode } from './types'; -import { RobotsService } from './domain/RobotsService'; -import { CrawlDelayError } from './errors/CrawlDelayError'; +import { RobotsPluginOptions, CrawlDelayComplianceMode, IRobotsDataService, IAllowService, ICrawlDelayService } from './types'; +import { RobotsDataService } from './domain/RobotsDataService'; +import { AllowService } from './domain/AllowService'; +import { CrawlDelayService } from './domain/CrawlDelayService'; import { InvalidUrlError } from './errors/InvalidUrlError'; import { InvalidProtocolError } from 
'./errors/InvalidProtocolError'; import { RobotsDeniedError } from './errors/RobotsDeniedError'; import { HEADER_USER_AGENT, PROTOCOL_HTTP, PROTOCOL_HTTPS } from './constants'; export class RobotsInterceptor { - private robotsService: IRobotsService; + private dataService: IRobotsDataService; + private allowService: IAllowService; + private crawlDelayService: ICrawlDelayService; private userAgent: string; private crawlDelayCompliance: CrawlDelayComplianceMode; - constructor(options: RobotsPluginOptions, robotsService?: IRobotsService) { - this.robotsService = robotsService || new RobotsService(); + constructor( + options: RobotsPluginOptions, + deps?: { + dataService?: IRobotsDataService, + allowService?: IAllowService, + crawlDelayService?: ICrawlDelayService; + } + ) { this.userAgent = options.userAgent; this.crawlDelayCompliance = options.crawlDelayCompliance ?? CrawlDelayComplianceMode.Await; + + this.dataService = deps?.dataService ?? new RobotsDataService(); + this.allowService = deps?.allowService ?? new AllowService(this.dataService); + this.crawlDelayService = deps?.crawlDelayService ?? 
new CrawlDelayService(this.dataService); } /** @@ -29,14 +42,14 @@ export class RobotsInterceptor { const url = this.resolveUrl(config); this.validateProtocol(url); - const isAllowed = await this.robotsService.isAllowed(url.toString(), this.userAgent); + const isAllowed = await this.allowService.isAllowed(url.toString(), this.userAgent); if (!isAllowed) { throw new RobotsDeniedError(url.toString(), this.userAgent); } if (this.crawlDelayCompliance !== CrawlDelayComplianceMode.Ignore) { - await this.handleCrawlDelay(url.toString()); + await this.crawlDelayService.handleCrawlDelay(url.toString(), this.userAgent, this.crawlDelayCompliance); } if (config.headers) { @@ -56,7 +69,7 @@ export class RobotsInterceptor { try { const fullUrl = this.resolveUrl(response.config as InternalAxiosRequestConfig).toString(); - this.robotsService.setLastCrawled(fullUrl, Date.now()); + this.dataService.setLastCrawled(fullUrl, Date.now()); } catch (_) { } @@ -74,37 +87,13 @@ export class RobotsInterceptor { try { const fullUrl = this.resolveUrl(error.config as InternalAxiosRequestConfig).toString(); - this.robotsService.setLastCrawled(fullUrl, Date.now()); + this.dataService.setLastCrawled(fullUrl, Date.now()); } catch (_) { } return Promise.reject(error); } - private async handleCrawlDelay(url: string): Promise { - const cachedRobot = await this.robotsService.getRobot(url, this.userAgent); - - if (!cachedRobot || !cachedRobot.robot) - return; - - - const delay = cachedRobot.robot.getCrawlDelay(this.userAgent); - if (!delay || delay <= 0 || !cachedRobot.lastCrawled) - return; - - - const timeSinceLastCrawl = Date.now() - cachedRobot.lastCrawled; - const waitTime = (delay * 1000) - timeSinceLastCrawl; - if (waitTime <= 0) - return; - - if (this.crawlDelayCompliance === CrawlDelayComplianceMode.Failure) { - throw new CrawlDelayError(delay); - } - - await new Promise(resolve => setTimeout(resolve, waitTime)); - } - private resolveUrl(config: InternalAxiosRequestConfig): URL { try { if 
(config.url && (config.url.startsWith(PROTOCOL_HTTP) || config.url.startsWith(PROTOCOL_HTTPS))) { diff --git a/src/types.ts b/src/types.ts index d74ffd1..6016024 100644 --- a/src/types.ts +++ b/src/types.ts @@ -35,11 +35,15 @@ export interface CachedRobot { lastCrawled?: number; } -/** - * Interface for the Robots Service. - */ -export interface IRobotsService { - isAllowed(url: string, userAgent?: string): Promise; - getRobot(url: string, userAgent?: string): Promise; +export interface IRobotsDataService { + getRobot(url: string, userAgent?: string): Promise; setLastCrawled(url: string, timestamp: number): void; } + +export interface IAllowService { + isAllowed(url: string, userAgent?: string): Promise; +} + +export interface ICrawlDelayService { + handleCrawlDelay(url: string, userAgent: string, complianceMode: CrawlDelayComplianceMode): Promise; +} From 3c137ca86a98381e8d63550bf98d3e90a4abbd2c Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 14 Jan 2026 19:52:44 +0000 Subject: [PATCH 05/12] move services --- src/domain/CrawlDelayService.ts | 41 ------------------- src/domain/{ => services}/AllowService.ts | 2 +- src/domain/services/CrawlDelayService.ts | 26 ++++++++++++ .../{ => services}/RobotsDataService.ts | 6 +-- 4 files changed, 30 insertions(+), 45 deletions(-) delete mode 100644 src/domain/CrawlDelayService.ts rename src/domain/{ => services}/AllowService.ts (90%) create mode 100644 src/domain/services/CrawlDelayService.ts rename src/domain/{ => services}/RobotsDataService.ts (91%) diff --git a/src/domain/CrawlDelayService.ts b/src/domain/CrawlDelayService.ts deleted file mode 100644 index 0388255..0000000 --- a/src/domain/CrawlDelayService.ts +++ /dev/null @@ -1,41 +0,0 @@ -import { RobotsDataService } from './RobotsDataService'; -import { CrawlDelayComplianceMode, ICrawlDelayService, IRobotsDataService } from '../types'; -import { CrawlDelayError } from '../errors/CrawlDelayError'; - -export class 
CrawlDelayService implements ICrawlDelayService { - constructor(private dataService: IRobotsDataService) {} - - /** - * Handles the crawl delay for the given URL and user agent. - * Enforces the delay based on the compliance mode. - */ - async handleCrawlDelay( - url: string, - userAgent: string, - complianceMode: CrawlDelayComplianceMode - ): Promise { - if (complianceMode === CrawlDelayComplianceMode.Ignore) { - return; - } - - const cachedRobot = await this.dataService.getRobot(url, userAgent); - - if (!cachedRobot || !cachedRobot.robot) - return; - - const delay = cachedRobot.robot.getCrawlDelay(userAgent); - if (!delay || delay <= 0 || !cachedRobot.lastCrawled) - return; - - const timeSinceLastCrawl = Date.now() - cachedRobot.lastCrawled; - const waitTime = (delay * 1000) - timeSinceLastCrawl; - if (waitTime <= 0) - return; - - if (complianceMode === CrawlDelayComplianceMode.Failure) { - throw new CrawlDelayError(delay); - } - - await new Promise(resolve => setTimeout(resolve, waitTime)); - } -} diff --git a/src/domain/AllowService.ts b/src/domain/services/AllowService.ts similarity index 90% rename from src/domain/AllowService.ts rename to src/domain/services/AllowService.ts index 2b281ed..0da4ec9 100644 --- a/src/domain/AllowService.ts +++ b/src/domain/services/AllowService.ts @@ -1,4 +1,4 @@ -import { IAllowService, IRobotsDataService } from '../types'; +import { IAllowService, IRobotsDataService } from '../../types'; export class AllowService implements IAllowService { constructor(private dataService: IRobotsDataService) { } diff --git a/src/domain/services/CrawlDelayService.ts b/src/domain/services/CrawlDelayService.ts new file mode 100644 index 0000000..5364113 --- /dev/null +++ b/src/domain/services/CrawlDelayService.ts @@ -0,0 +1,26 @@ +import { CrawlDelayComplianceMode, ICrawlDelayService, IRobotsDataService } from '../../types'; +import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase'; +import { CrawlDelayStrategyFactory } 
from '../strategies/CrawlDelayStrategyFactory'; + +export class CrawlDelayService implements ICrawlDelayService { + private calculateWaitTimeUseCase: CalculateWaitTimeUseCase; + private strategyFactory: CrawlDelayStrategyFactory; + + constructor(private dataService: IRobotsDataService) { + this.calculateWaitTimeUseCase = new CalculateWaitTimeUseCase(dataService); + this.strategyFactory = new CrawlDelayStrategyFactory(this.calculateWaitTimeUseCase); + } + + /** + * Handles the crawl delay for the given URL and user agent. + * Enforces the delay based on the compliance mode. + */ + async handleCrawlDelay( + url: string, + userAgent: string, + complianceMode: CrawlDelayComplianceMode + ): Promise { + const strategy = this.strategyFactory.getStrategy(complianceMode); + await strategy.execute(url, userAgent); + } +} diff --git a/src/domain/RobotsDataService.ts b/src/domain/services/RobotsDataService.ts similarity index 91% rename from src/domain/RobotsDataService.ts rename to src/domain/services/RobotsDataService.ts index 349c2b7..3c7a475 100644 --- a/src/domain/RobotsDataService.ts +++ b/src/domain/services/RobotsDataService.ts @@ -1,8 +1,8 @@ import robotsParser, { Robot } from 'robots-parser'; import axios from 'axios'; -import { HEADER_USER_AGENT, ROBOTS_TXT_FILENAME, ALLOW_ALL_ROBOTS_TXT_CONTENT } from '../constants'; -import { RobotsUnreachableError } from '../errors/RobotsUnreachableError'; -import { IRobotsDataService, CachedRobot } from '../types'; +import { HEADER_USER_AGENT, ROBOTS_TXT_FILENAME, ALLOW_ALL_ROBOTS_TXT_CONTENT } from '../../constants'; +import { RobotsUnreachableError } from '../../errors/RobotsUnreachableError'; +import { IRobotsDataService, CachedRobot } from '../../types'; export class RobotsDataService implements IRobotsDataService { private cache: Map = new Map(); From 8f386ea688d973afa136d3a5d6ef172306a6aef0 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 14 Jan 2026 19:54:18 +0000 
Subject: [PATCH 06/12] Add crawl delay strategies --- .../strategies/AwaitCrawlDelayStrategy.ts | 17 ++++++++ .../strategies/CrawlDelayStrategyFactory.ts | 22 ++++++++++ .../strategies/FailureCrawlDelayStrategy.ts | 16 ++++++++ src/domain/strategies/ICrawlDelayStrategy.ts | 4 ++ .../strategies/IgnoreCrawlDelayStrategy.ts | 8 ++++ .../usecases/CalculateWaitTimeUseCase.ts | 27 +++++++++++++ src/index.ts | 6 +-- src/interceptor.ts | 6 +-- src/types.ts | 40 +++++++++++++++++++ 9 files changed, 140 insertions(+), 6 deletions(-) create mode 100644 src/domain/strategies/AwaitCrawlDelayStrategy.ts create mode 100644 src/domain/strategies/CrawlDelayStrategyFactory.ts create mode 100644 src/domain/strategies/FailureCrawlDelayStrategy.ts create mode 100644 src/domain/strategies/ICrawlDelayStrategy.ts create mode 100644 src/domain/strategies/IgnoreCrawlDelayStrategy.ts create mode 100644 src/domain/usecases/CalculateWaitTimeUseCase.ts diff --git a/src/domain/strategies/AwaitCrawlDelayStrategy.ts b/src/domain/strategies/AwaitCrawlDelayStrategy.ts new file mode 100644 index 0000000..5c27ca2 --- /dev/null +++ b/src/domain/strategies/AwaitCrawlDelayStrategy.ts @@ -0,0 +1,17 @@ + +import { ICrawlDelayStrategy } from './ICrawlDelayStrategy'; +import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase'; + +export class AwaitCrawlDelayStrategy implements ICrawlDelayStrategy { + constructor(private calculateWaitTimeUseCase: CalculateWaitTimeUseCase) { } + + async execute(url: string, userAgent: string): Promise { + const { waitTime } = await this.calculateWaitTimeUseCase.execute(url, userAgent); + + if (waitTime <= 0) + return; + + + await new Promise(resolve => setTimeout(resolve, waitTime)); + } +} diff --git a/src/domain/strategies/CrawlDelayStrategyFactory.ts b/src/domain/strategies/CrawlDelayStrategyFactory.ts new file mode 100644 index 0000000..1c72c58 --- /dev/null +++ b/src/domain/strategies/CrawlDelayStrategyFactory.ts @@ -0,0 +1,22 @@ + +import { 
CrawlDelayComplianceMode } from '../../types'; +import { ICrawlDelayStrategy } from './ICrawlDelayStrategy'; +import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase'; +import { AwaitCrawlDelayStrategy } from './AwaitCrawlDelayStrategy'; +import { FailureCrawlDelayStrategy } from './FailureCrawlDelayStrategy'; +import { IgnoreCrawlDelayStrategy } from './IgnoreCrawlDelayStrategy'; + +export class CrawlDelayStrategyFactory { + constructor(private calculateWaitTimeUseCase: CalculateWaitTimeUseCase) { } + + getStrategy(mode: CrawlDelayComplianceMode): ICrawlDelayStrategy { + switch (mode) { + case CrawlDelayComplianceMode.Failure: + return new FailureCrawlDelayStrategy(this.calculateWaitTimeUseCase); + case CrawlDelayComplianceMode.Ignore: + return new IgnoreCrawlDelayStrategy(); + case CrawlDelayComplianceMode.Await: + return new AwaitCrawlDelayStrategy(this.calculateWaitTimeUseCase); + } + } +} diff --git a/src/domain/strategies/FailureCrawlDelayStrategy.ts b/src/domain/strategies/FailureCrawlDelayStrategy.ts new file mode 100644 index 0000000..5b6c25e --- /dev/null +++ b/src/domain/strategies/FailureCrawlDelayStrategy.ts @@ -0,0 +1,16 @@ + +import { ICrawlDelayStrategy } from './ICrawlDelayStrategy'; +import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase'; +import { CrawlDelayError } from '../../errors/CrawlDelayError'; + +export class FailureCrawlDelayStrategy implements ICrawlDelayStrategy { + constructor(private calculateWaitTimeUseCase: CalculateWaitTimeUseCase) { } + + async execute(url: string, userAgent: string): Promise { + const { waitTime, delay } = await this.calculateWaitTimeUseCase.execute(url, userAgent); + + if (waitTime <= 0) return; + + throw new CrawlDelayError(delay); + } +} diff --git a/src/domain/strategies/ICrawlDelayStrategy.ts b/src/domain/strategies/ICrawlDelayStrategy.ts new file mode 100644 index 0000000..41f299b --- /dev/null +++ b/src/domain/strategies/ICrawlDelayStrategy.ts @@ -0,0 +1,4 
@@ + +export interface ICrawlDelayStrategy { + execute(url: string, userAgent: string): Promise; +} diff --git a/src/domain/strategies/IgnoreCrawlDelayStrategy.ts b/src/domain/strategies/IgnoreCrawlDelayStrategy.ts new file mode 100644 index 0000000..bb141a7 --- /dev/null +++ b/src/domain/strategies/IgnoreCrawlDelayStrategy.ts @@ -0,0 +1,8 @@ + +import { ICrawlDelayStrategy } from './ICrawlDelayStrategy'; + +export class IgnoreCrawlDelayStrategy implements ICrawlDelayStrategy { + async execute(url: string, userAgent: string): Promise { + return; + } +} diff --git a/src/domain/usecases/CalculateWaitTimeUseCase.ts b/src/domain/usecases/CalculateWaitTimeUseCase.ts new file mode 100644 index 0000000..7d9a2bb --- /dev/null +++ b/src/domain/usecases/CalculateWaitTimeUseCase.ts @@ -0,0 +1,27 @@ + +import { IRobotsDataService } from '../../types'; + +export class CalculateWaitTimeUseCase { + constructor(private dataService: IRobotsDataService) { } + + async execute(url: string, userAgent: string): Promise<{ waitTime: number; delay: number; }> { + const cachedRobot = await this.dataService.getRobot(url, userAgent); + + if (!cachedRobot || !cachedRobot.robot) { + return { waitTime: 0, delay: 0 }; + } + + const delay = cachedRobot.robot.getCrawlDelay(userAgent); + if (!delay || delay <= 0 || !cachedRobot.lastCrawled) { + return { waitTime: 0, delay: 0 }; + } + + const timeSinceLastCrawl = Date.now() - cachedRobot.lastCrawled; + const waitTime = (delay * 1000) - timeSinceLastCrawl; + + return { + waitTime: waitTime > 0 ? 
waitTime : 0, + delay + }; + } +} diff --git a/src/index.ts b/src/index.ts index e6d082e..bdd07bc 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,9 +2,9 @@ import { AxiosInstance } from 'axios'; import { RobotsInterceptor } from './interceptor'; import { RobotsPluginOptions } from './types'; -export * from './domain/RobotsDataService'; -export * from './domain/AllowService'; -export * from './domain/CrawlDelayService'; +export * from './domain/services/RobotsDataService'; +export * from './domain/services/AllowService'; +export * from './domain/services/CrawlDelayService'; export * from './errors'; export * from './interceptor'; export * from './types'; diff --git a/src/interceptor.ts b/src/interceptor.ts index b5ff4c2..c27a7f9 100644 --- a/src/interceptor.ts +++ b/src/interceptor.ts @@ -1,8 +1,8 @@ import { InternalAxiosRequestConfig, AxiosResponse } from 'axios'; import { RobotsPluginOptions, CrawlDelayComplianceMode, IRobotsDataService, IAllowService, ICrawlDelayService } from './types'; -import { RobotsDataService } from './domain/RobotsDataService'; -import { AllowService } from './domain/AllowService'; -import { CrawlDelayService } from './domain/CrawlDelayService'; +import { RobotsDataService } from './domain/services/RobotsDataService'; +import { AllowService } from './domain/services/AllowService'; +import { CrawlDelayService } from './domain/services/CrawlDelayService'; import { InvalidUrlError } from './errors/InvalidUrlError'; import { InvalidProtocolError } from './errors/InvalidProtocolError'; import { RobotsDeniedError } from './errors/RobotsDeniedError'; diff --git a/src/types.ts b/src/types.ts index 6016024..b03040d 100644 --- a/src/types.ts +++ b/src/types.ts @@ -31,19 +31,59 @@ export interface RobotsPluginOptions { } export interface CachedRobot { + /** + * The parsed robots.txt object. + */ robot: Robot; + /** + * Timestamp of the last crawl for this domain. 
+ */ lastCrawled?: number; } +/** + * Service for managing robots.txt data and crawl timestamps independently of the protocol logic. + */ export interface IRobotsDataService { + /** + * Retrieves the cached robot instance for a given URL. + * @param url The URL to get the robot for (used to extract the domain/origin). + * @param userAgent Optional user agent to use for fetching robots.txt if not cached. + * @returns A promise resolving to the CachedRobot containing the parsed rules. + */ getRobot(url: string, userAgent?: string): Promise; + + /** + * Updates the last crawled timestamp for the domain associated with the URL. + * @param url The URL identifying the domain. + * @param timestamp The timestamp to set. + */ setLastCrawled(url: string, timestamp: number): void; } +/** + * Service for checking if a URL is allowed to be crawled according to robots.txt rules. + */ export interface IAllowService { + /** + * Checks if the given URL is allowed for the specified user agent. + * @param url The URL to check. + * @param userAgent The user agent to check against. + * @returns A promise resolving to true if allowed, false otherwise. + */ isAllowed(url: string, userAgent?: string): Promise; } +/** + * Service for handling Crawl-delay directives from robots.txt. + */ export interface ICrawlDelayService { + /** + * Enforces the crawl delay for a given URL based on the compliance mode. + * @param url The URL about to be requested. + * @param userAgent The user agent to check rules for. + * @param complianceMode The mode determining how to handle the delay (Await, Ignore, Failure). + * @returns A promise that resolves when it is safe to proceed (or throws if in Failure mode). 
+ */ handleCrawlDelay(url: string, userAgent: string, complianceMode: CrawlDelayComplianceMode): Promise; } From 284174a451f87ab510353252c7f7b570fcd5fc5c Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 14 Jan 2026 20:01:53 +0000 Subject: [PATCH 07/12] move files around --- .../services/RobotsDataService.ts | 3 +- src/domain/interfaces/IAllowService.ts | 12 +++ src/domain/interfaces/ICrawlDelayService.ts | 15 ++++ src/domain/interfaces/IRobotsDataService.ts | 21 +++++ src/domain/models/CachedRobot.ts | 12 +++ src/domain/models/CrawlDelayComplianceMode.ts | 17 ++++ src/domain/models/RobotsPluginOptions.ts | 13 +++ src/domain/services/AllowService.ts | 3 +- src/domain/services/CrawlDelayService.ts | 4 +- .../strategies/CrawlDelayStrategyFactory.ts | 2 +- .../usecases/CalculateWaitTimeUseCase.ts | 2 +- src/index.ts | 11 ++- src/interceptor.ts | 8 +- src/types.ts | 89 ------------------- tests/crawl-delay.test.ts | 3 +- 15 files changed, 114 insertions(+), 101 deletions(-) rename src/{domain => data}/services/RobotsDataService.ts (93%) create mode 100644 src/domain/interfaces/IAllowService.ts create mode 100644 src/domain/interfaces/ICrawlDelayService.ts create mode 100644 src/domain/interfaces/IRobotsDataService.ts create mode 100644 src/domain/models/CachedRobot.ts create mode 100644 src/domain/models/CrawlDelayComplianceMode.ts create mode 100644 src/domain/models/RobotsPluginOptions.ts delete mode 100644 src/types.ts diff --git a/src/domain/services/RobotsDataService.ts b/src/data/services/RobotsDataService.ts similarity index 93% rename from src/domain/services/RobotsDataService.ts rename to src/data/services/RobotsDataService.ts index 3c7a475..11ab384 100644 --- a/src/domain/services/RobotsDataService.ts +++ b/src/data/services/RobotsDataService.ts @@ -2,7 +2,8 @@ import robotsParser, { Robot } from 'robots-parser'; import axios from 'axios'; import { HEADER_USER_AGENT, ROBOTS_TXT_FILENAME, 
ALLOW_ALL_ROBOTS_TXT_CONTENT } from '../../constants'; import { RobotsUnreachableError } from '../../errors/RobotsUnreachableError'; -import { IRobotsDataService, CachedRobot } from '../../types'; +import { IRobotsDataService } from '../../domain/interfaces/IRobotsDataService'; +import { CachedRobot } from '../../domain/models/CachedRobot'; export class RobotsDataService implements IRobotsDataService { private cache: Map = new Map(); diff --git a/src/domain/interfaces/IAllowService.ts b/src/domain/interfaces/IAllowService.ts new file mode 100644 index 0000000..9d1f030 --- /dev/null +++ b/src/domain/interfaces/IAllowService.ts @@ -0,0 +1,12 @@ +/** + * Service for checking if a URL is allowed to be crawled according to robots.txt rules. + */ +export interface IAllowService { + /** + * Checks if the given URL is allowed for the specified user agent. + * @param url The URL to check. + * @param userAgent The user agent to check against. + * @returns A promise resolving to true if allowed, false otherwise. + */ + isAllowed(url: string, userAgent?: string): Promise; +} diff --git a/src/domain/interfaces/ICrawlDelayService.ts b/src/domain/interfaces/ICrawlDelayService.ts new file mode 100644 index 0000000..f701299 --- /dev/null +++ b/src/domain/interfaces/ICrawlDelayService.ts @@ -0,0 +1,15 @@ +import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode'; + +/** + * Service for handling Crawl-delay directives from robots.txt. + */ +export interface ICrawlDelayService { + /** + * Enforces the crawl delay for a given URL based on the compliance mode. + * @param url The URL about to be requested. + * @param userAgent The user agent to check rules for. + * @param complianceMode The mode determining how to handle the delay (Await, Ignore, Failure). + * @returns A promise that resolves when it is safe to proceed (or throws if in Failure mode). 
+ */ handleCrawlDelay(url: string, userAgent: string, complianceMode: CrawlDelayComplianceMode): Promise; +} diff --git a/src/domain/interfaces/IRobotsDataService.ts b/src/domain/interfaces/IRobotsDataService.ts new file mode 100644 index 0000000..21df2de --- /dev/null +++ b/src/domain/interfaces/IRobotsDataService.ts @@ -0,0 +1,21 @@ +import { CachedRobot } from '../models/CachedRobot'; + +/** + * Service for managing robots.txt data and crawl timestamps independently of the protocol logic. + */ +export interface IRobotsDataService { + /** + * Retrieves the cached robot instance for a given URL. + * @param url The URL to get the robot for (used to extract the domain/origin). + * @param userAgent Optional user agent to use for fetching robots.txt if not cached. + * @returns A promise resolving to the CachedRobot containing the parsed rules. + */ + getRobot(url: string, userAgent?: string): Promise; + + /** + * Updates the last crawled timestamp for the domain associated with the URL. + * @param url The URL identifying the domain. + * @param timestamp The timestamp to set. + */ + setLastCrawled(url: string, timestamp: number): void; +} diff --git a/src/domain/models/CachedRobot.ts b/src/domain/models/CachedRobot.ts new file mode 100644 index 0000000..cf1dc81 --- /dev/null +++ b/src/domain/models/CachedRobot.ts @@ -0,0 +1,12 @@ +import { Robot } from 'robots-parser'; + +export interface CachedRobot { + /** + * The parsed robots.txt object. + */ + robot: Robot; + /** + * Timestamp of the last crawl for this domain. + */ + lastCrawled?: number; +} diff --git a/src/domain/models/CrawlDelayComplianceMode.ts b/src/domain/models/CrawlDelayComplianceMode.ts new file mode 100644 index 0000000..5a8763d --- /dev/null +++ b/src/domain/models/CrawlDelayComplianceMode.ts @@ -0,0 +1,17 @@ +/** + * Determines how Crawl-delay directives from robots.txt are handled. + */ +export enum CrawlDelayComplianceMode { + /** + * Respects the Crawl-delay directive by waiting before making the request.
+ */ + Await = 'await', + /** + * Ignores the Crawl-delay directive. + */ + Ignore = 'ignore', + /** + * Throws an error if the request violates the Crawl-delay. + */ + Failure = 'failure' +} diff --git a/src/domain/models/RobotsPluginOptions.ts b/src/domain/models/RobotsPluginOptions.ts new file mode 100644 index 0000000..7522d11 --- /dev/null +++ b/src/domain/models/RobotsPluginOptions.ts @@ -0,0 +1,13 @@ +import { CrawlDelayComplianceMode } from './CrawlDelayComplianceMode'; + +export interface RobotsPluginOptions { + /** + * The User-Agent string to use when checking robots.txt rules. + */ + userAgent: string; + /** + * How to handle Crawl-delay directives. + * Defaults to CrawlDelayComplianceMode.Await + */ + crawlDelayCompliance?: CrawlDelayComplianceMode; +} diff --git a/src/domain/services/AllowService.ts b/src/domain/services/AllowService.ts index 0da4ec9..dc2c3e8 100644 --- a/src/domain/services/AllowService.ts +++ b/src/domain/services/AllowService.ts @@ -1,4 +1,5 @@ -import { IAllowService, IRobotsDataService } from '../../types'; +import { IAllowService } from '../interfaces/IAllowService'; +import { IRobotsDataService } from '../interfaces/IRobotsDataService'; export class AllowService implements IAllowService { constructor(private dataService: IRobotsDataService) { } diff --git a/src/domain/services/CrawlDelayService.ts b/src/domain/services/CrawlDelayService.ts index 5364113..d2d0e9d 100644 --- a/src/domain/services/CrawlDelayService.ts +++ b/src/domain/services/CrawlDelayService.ts @@ -1,4 +1,6 @@ -import { CrawlDelayComplianceMode, ICrawlDelayService, IRobotsDataService } from '../../types'; +import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode'; +import { ICrawlDelayService } from '../interfaces/ICrawlDelayService'; +import { IRobotsDataService } from '../interfaces/IRobotsDataService'; import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase'; import { CrawlDelayStrategyFactory } from 
'../strategies/CrawlDelayStrategyFactory'; diff --git a/src/domain/strategies/CrawlDelayStrategyFactory.ts b/src/domain/strategies/CrawlDelayStrategyFactory.ts index 1c72c58..0443d44 100644 --- a/src/domain/strategies/CrawlDelayStrategyFactory.ts +++ b/src/domain/strategies/CrawlDelayStrategyFactory.ts @@ -1,5 +1,5 @@ -import { CrawlDelayComplianceMode } from '../../types'; +import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode'; import { ICrawlDelayStrategy } from './ICrawlDelayStrategy'; import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase'; import { AwaitCrawlDelayStrategy } from './AwaitCrawlDelayStrategy'; diff --git a/src/domain/usecases/CalculateWaitTimeUseCase.ts b/src/domain/usecases/CalculateWaitTimeUseCase.ts index 7d9a2bb..7bbb05a 100644 --- a/src/domain/usecases/CalculateWaitTimeUseCase.ts +++ b/src/domain/usecases/CalculateWaitTimeUseCase.ts @@ -1,5 +1,5 @@ -import { IRobotsDataService } from '../../types'; +import { IRobotsDataService } from '../interfaces/IRobotsDataService'; export class CalculateWaitTimeUseCase { constructor(private dataService: IRobotsDataService) { } diff --git a/src/index.ts b/src/index.ts index bdd07bc..d914e49 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,13 +1,18 @@ import { AxiosInstance } from 'axios'; import { RobotsInterceptor } from './interceptor'; -import { RobotsPluginOptions } from './types'; +import { RobotsPluginOptions } from './domain/models/RobotsPluginOptions'; -export * from './domain/services/RobotsDataService'; +export * from './data/services/RobotsDataService'; export * from './domain/services/AllowService'; export * from './domain/services/CrawlDelayService'; export * from './errors'; export * from './interceptor'; -export * from './types'; +export * from './domain/models/RobotsPluginOptions'; +export * from './domain/models/CrawlDelayComplianceMode'; +export * from './domain/models/CachedRobot'; +export * from './domain/interfaces/IRobotsDataService'; 
+export * from './domain/interfaces/IAllowService'; +export * from './domain/interfaces/ICrawlDelayService'; /** * Apply the robots exclusion protocol interceptor to an Axios instance. diff --git a/src/interceptor.ts b/src/interceptor.ts index c27a7f9..38a4797 100644 --- a/src/interceptor.ts +++ b/src/interceptor.ts @@ -1,6 +1,10 @@ import { InternalAxiosRequestConfig, AxiosResponse } from 'axios'; -import { RobotsPluginOptions, CrawlDelayComplianceMode, IRobotsDataService, IAllowService, ICrawlDelayService } from './types'; -import { RobotsDataService } from './domain/services/RobotsDataService'; +import { RobotsPluginOptions } from './domain/models/RobotsPluginOptions'; +import { CrawlDelayComplianceMode } from './domain/models/CrawlDelayComplianceMode'; +import { IRobotsDataService } from './domain/interfaces/IRobotsDataService'; +import { IAllowService } from './domain/interfaces/IAllowService'; +import { ICrawlDelayService } from './domain/interfaces/ICrawlDelayService'; +import { RobotsDataService } from './data/services/RobotsDataService'; import { AllowService } from './domain/services/AllowService'; import { CrawlDelayService } from './domain/services/CrawlDelayService'; import { InvalidUrlError } from './errors/InvalidUrlError'; diff --git a/src/types.ts b/src/types.ts deleted file mode 100644 index b03040d..0000000 --- a/src/types.ts +++ /dev/null @@ -1,89 +0,0 @@ -import { Robot } from 'robots-parser'; - -/** - * Options for the Robots Exclusion Protocol plugin. - */ -export enum CrawlDelayComplianceMode { - /** - * Respects the Crawl-delay directive by waiting before making the request. - */ - Await = 'await', - /** - * Ignores the Crawl-delay directive. - */ - Ignore = 'ignore', - /** - * Throws an error if the request violates the Crawl-delay. - */ - Failure = 'failure' -} - -export interface RobotsPluginOptions { - /** - * The User-Agent string to use when checking robots.txt rules. 
- */ - userAgent: string; - /** - * How to handle Crawl-delay directives. - * Defaults to CrawlDelayComplianceMode.Await - */ - crawlDelayCompliance?: CrawlDelayComplianceMode; -} - -export interface CachedRobot { - /** - * The parsed robots.txt object. - */ - robot: Robot; - /** - * Timestamp of the last crawl for this domain. - */ - lastCrawled?: number; -} - -/** - * Service for managing robots.txt data and crawl timestamps independently of the protocol logic. - */ -export interface IRobotsDataService { - /** - * Retrieves the cached robot instance for a given URL. - * @param url The URL to get the robot for (used to extract the domain/origin). - * @param userAgent Optional user agent to use for fetching robots.txt if not cached. - * @returns A promise resolving to the CachedRobot containing the parsed rules. - */ - getRobot(url: string, userAgent?: string): Promise; - - /** - * Updates the last crawled timestamp for the domain associated with the URL. - * @param url The URL identifying the domain. - * @param timestamp The timestamp to set. - */ - setLastCrawled(url: string, timestamp: number): void; -} - -/** - * Service for checking if a URL is allowed to be crawled according to robots.txt rules. - */ -export interface IAllowService { - /** - * Checks if the given URL is allowed for the specified user agent. - * @param url The URL to check. - * @param userAgent The user agent to check against. - * @returns A promise resolving to true if allowed, false otherwise. - */ - isAllowed(url: string, userAgent?: string): Promise; -} - -/** - * Service for handling Crawl-delay directives from robots.txt. - */ -export interface ICrawlDelayService { - /** - * Enforces the crawl delay for a given URL based on the compliance mode. - * @param url The URL about to be requested. - * @param userAgent The user agent to check rules for. - * @param complianceMode The mode determining how to handle the delay (Await, Ignore, Failure). 
- * @returns A promise that resolves when it is safe to proceed (or throws if in Failure mode). - */ - handleCrawlDelay(url: string, userAgent: string, complianceMode: CrawlDelayComplianceMode): Promise; -} diff --git a/tests/crawl-delay.test.ts b/tests/crawl-delay.test.ts index ea746a2..458dd00 100644 --- a/tests/crawl-delay.test.ts +++ b/tests/crawl-delay.test.ts @@ -1,7 +1,6 @@ import axios from 'axios'; import nock from 'nock'; -import { applyRobotsInterceptor } from '../src/index'; -import { CrawlDelayComplianceMode } from '../src/types'; +import { applyRobotsInterceptor, CrawlDelayComplianceMode } from '../src/index'; describe('Crawl-delay Compliance', () => { let client: ReturnType; From e4041682430a3c26cafaaf4986b26c7561346efe Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 14 Jan 2026 20:06:07 +0000 Subject: [PATCH 08/12] remove redundant comments --- src/data/services/RobotsDataService.ts | 7 ------- src/domain/services/AllowService.ts | 4 ---- src/domain/services/CrawlDelayService.ts | 4 ---- src/domain/strategies/ICrawlDelayStrategy.ts | 8 ++++++++ 4 files changed, 8 insertions(+), 15 deletions(-) diff --git a/src/data/services/RobotsDataService.ts b/src/data/services/RobotsDataService.ts index 11ab384..131708d 100644 --- a/src/data/services/RobotsDataService.ts +++ b/src/data/services/RobotsDataService.ts @@ -8,10 +8,6 @@ import { CachedRobot } from '../../domain/models/CachedRobot'; export class RobotsDataService implements IRobotsDataService { private cache: Map = new Map(); - /** - * Retrieves the cached robot rules for the given URL's origin. - * Fetches from the network if not already cached. 
- */ async getRobot(url: string, userAgent: string = '*'): Promise { const origin = new URL(url).origin; let cached = this.cache.get(origin); @@ -26,9 +22,6 @@ export class RobotsDataService implements IRobotsDataService { return cached; } - /** - * Updates the last crawled timestamp for the given URL's origin. - */ setLastCrawled(url: string, timestamp: number): void { const origin = new URL(url).origin; const cached = this.cache.get(origin); diff --git a/src/domain/services/AllowService.ts b/src/domain/services/AllowService.ts index dc2c3e8..1eda14e 100644 --- a/src/domain/services/AllowService.ts +++ b/src/domain/services/AllowService.ts @@ -4,10 +4,6 @@ import { IRobotsDataService } from '../interfaces/IRobotsDataService'; export class AllowService implements IAllowService { constructor(private dataService: IRobotsDataService) { } - /** - * Checks if the given URL is allowed for the specified User-Agent. - * Fetching and caching the robots.txt is handled automatically by the data service. - */ async isAllowed(url: string, userAgent: string = '*'): Promise { const robot = await this.dataService.getRobot(url, userAgent); diff --git a/src/domain/services/CrawlDelayService.ts b/src/domain/services/CrawlDelayService.ts index d2d0e9d..2e83df7 100644 --- a/src/domain/services/CrawlDelayService.ts +++ b/src/domain/services/CrawlDelayService.ts @@ -13,10 +13,6 @@ export class CrawlDelayService implements ICrawlDelayService { this.strategyFactory = new CrawlDelayStrategyFactory(this.calculateWaitTimeUseCase); } - /** - * Handles the crawl delay for the given URL and user agent. - * Enforces the delay based on the compliance mode. 
- */ async handleCrawlDelay( url: string, userAgent: string, diff --git a/src/domain/strategies/ICrawlDelayStrategy.ts b/src/domain/strategies/ICrawlDelayStrategy.ts index 41f299b..12c0677 100644 --- a/src/domain/strategies/ICrawlDelayStrategy.ts +++ b/src/domain/strategies/ICrawlDelayStrategy.ts @@ -1,4 +1,12 @@ +/** + * Strategy for ensuring compliance with Crawl-delay rules. + */ export interface ICrawlDelayStrategy { + /** + * Executes the strategy for a given URL and user agent. + * @param url The URL about to be crawled. + * @param userAgent The user agent for which to check the rules. + */ execute(url: string, userAgent: string): Promise; } From dd08a0a8c36c78641ae3dd7bdb849b7a12ad5de3 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 14 Jan 2026 20:09:55 +0000 Subject: [PATCH 09/12] rename service to repository --- .../RobotsDataRepository.ts} | 4 ++-- ...{IRobotsDataService.ts => IRobotsDataRepository.ts} | 4 ++-- src/domain/services/AllowService.ts | 4 ++-- src/domain/services/CrawlDelayService.ts | 4 ++-- src/domain/usecases/CalculateWaitTimeUseCase.ts | 4 ++-- src/index.ts | 4 ++-- src/interceptor.ts | 10 +++++----- 7 files changed, 17 insertions(+), 17 deletions(-) rename src/data/{services/RobotsDataService.ts => repositories/RobotsDataRepository.ts} (92%) rename src/domain/interfaces/{IRobotsDataService.ts => IRobotsDataRepository.ts} (83%) diff --git a/src/data/services/RobotsDataService.ts b/src/data/repositories/RobotsDataRepository.ts similarity index 92% rename from src/data/services/RobotsDataService.ts rename to src/data/repositories/RobotsDataRepository.ts index 131708d..4bd0ac7 100644 --- a/src/data/services/RobotsDataService.ts +++ b/src/data/repositories/RobotsDataRepository.ts @@ -2,10 +2,10 @@ import robotsParser, { Robot } from 'robots-parser'; import axios from 'axios'; import { HEADER_USER_AGENT, ROBOTS_TXT_FILENAME, ALLOW_ALL_ROBOTS_TXT_CONTENT } from '../../constants'; import { 
RobotsUnreachableError } from '../../errors/RobotsUnreachableError'; -import { IRobotsDataService } from '../../domain/interfaces/IRobotsDataService'; +import { IRobotsDataRepository } from '../../domain/interfaces/IRobotsDataRepository'; import { CachedRobot } from '../../domain/models/CachedRobot'; -export class RobotsDataService implements IRobotsDataService { +export class RobotsDataRepository implements IRobotsDataRepository { private cache: Map = new Map(); async getRobot(url: string, userAgent: string = '*'): Promise { diff --git a/src/domain/interfaces/IRobotsDataService.ts b/src/domain/interfaces/IRobotsDataRepository.ts similarity index 83% rename from src/domain/interfaces/IRobotsDataService.ts rename to src/domain/interfaces/IRobotsDataRepository.ts index 21df2de..851d534 100644 --- a/src/domain/interfaces/IRobotsDataService.ts +++ b/src/domain/interfaces/IRobotsDataRepository.ts @@ -1,9 +1,9 @@ import { CachedRobot } from '../models/CachedRobot'; /** - * Service for managing robots.txt data and crawl timestamps independently of the protocol logic. + * Repository for managing robots.txt data and crawl timestamps independently of the protocol logic. */ -export interface IRobotsDataService { +export interface IRobotsDataRepository { /** * Retrieves the cached robot instance for a given URL. * @param url The URL to get the robot for (used to extract the domain/origin). 
diff --git a/src/domain/services/AllowService.ts b/src/domain/services/AllowService.ts index 1eda14e..580b40b 100644 --- a/src/domain/services/AllowService.ts +++ b/src/domain/services/AllowService.ts @@ -1,8 +1,8 @@ import { IAllowService } from '../interfaces/IAllowService'; -import { IRobotsDataService } from '../interfaces/IRobotsDataService'; +import { IRobotsDataRepository } from '../interfaces/IRobotsDataRepository'; export class AllowService implements IAllowService { - constructor(private dataService: IRobotsDataService) { } + constructor(private dataService: IRobotsDataRepository) { } async isAllowed(url: string, userAgent: string = '*'): Promise { const robot = await this.dataService.getRobot(url, userAgent); diff --git a/src/domain/services/CrawlDelayService.ts b/src/domain/services/CrawlDelayService.ts index 2e83df7..3753912 100644 --- a/src/domain/services/CrawlDelayService.ts +++ b/src/domain/services/CrawlDelayService.ts @@ -1,6 +1,6 @@ import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode'; import { ICrawlDelayService } from '../interfaces/ICrawlDelayService'; -import { IRobotsDataService } from '../interfaces/IRobotsDataService'; +import { IRobotsDataRepository } from '../interfaces/IRobotsDataRepository'; import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase'; import { CrawlDelayStrategyFactory } from '../strategies/CrawlDelayStrategyFactory'; @@ -8,7 +8,7 @@ export class CrawlDelayService implements ICrawlDelayService { private calculateWaitTimeUseCase: CalculateWaitTimeUseCase; private strategyFactory: CrawlDelayStrategyFactory; - constructor(private dataService: IRobotsDataService) { + constructor(private dataService: IRobotsDataRepository) { this.calculateWaitTimeUseCase = new CalculateWaitTimeUseCase(dataService); this.strategyFactory = new CrawlDelayStrategyFactory(this.calculateWaitTimeUseCase); } diff --git a/src/domain/usecases/CalculateWaitTimeUseCase.ts 
b/src/domain/usecases/CalculateWaitTimeUseCase.ts index 7bbb05a..9d0f640 100644 --- a/src/domain/usecases/CalculateWaitTimeUseCase.ts +++ b/src/domain/usecases/CalculateWaitTimeUseCase.ts @@ -1,8 +1,8 @@ -import { IRobotsDataService } from '../interfaces/IRobotsDataService'; +import { IRobotsDataRepository } from '../interfaces/IRobotsDataRepository'; export class CalculateWaitTimeUseCase { - constructor(private dataService: IRobotsDataService) { } + constructor(private dataService: IRobotsDataRepository) { } async execute(url: string, userAgent: string): Promise<{ waitTime: number; delay: number; }> { const cachedRobot = await this.dataService.getRobot(url, userAgent); diff --git a/src/index.ts b/src/index.ts index d914e49..004ed75 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,7 +2,7 @@ import { AxiosInstance } from 'axios'; import { RobotsInterceptor } from './interceptor'; import { RobotsPluginOptions } from './domain/models/RobotsPluginOptions'; -export * from './data/services/RobotsDataService'; +export * from './data/repositories/RobotsDataRepository'; export * from './domain/services/AllowService'; export * from './domain/services/CrawlDelayService'; export * from './errors'; @@ -10,7 +10,7 @@ export * from './interceptor'; export * from './domain/models/RobotsPluginOptions'; export * from './domain/models/CrawlDelayComplianceMode'; export * from './domain/models/CachedRobot'; -export * from './domain/interfaces/IRobotsDataService'; +export * from './domain/interfaces/IRobotsDataRepository'; export * from './domain/interfaces/IAllowService'; export * from './domain/interfaces/ICrawlDelayService'; diff --git a/src/interceptor.ts b/src/interceptor.ts index 38a4797..f6b09e8 100644 --- a/src/interceptor.ts +++ b/src/interceptor.ts @@ -1,10 +1,10 @@ import { InternalAxiosRequestConfig, AxiosResponse } from 'axios'; import { RobotsPluginOptions } from './domain/models/RobotsPluginOptions'; import { CrawlDelayComplianceMode } from 
'./domain/models/CrawlDelayComplianceMode'; -import { IRobotsDataService } from './domain/interfaces/IRobotsDataService'; +import { IRobotsDataRepository } from './domain/interfaces/IRobotsDataRepository'; import { IAllowService } from './domain/interfaces/IAllowService'; import { ICrawlDelayService } from './domain/interfaces/ICrawlDelayService'; -import { RobotsDataService } from './data/services/RobotsDataService'; +import { RobotsDataRepository } from './data/repositories/RobotsDataRepository'; import { AllowService } from './domain/services/AllowService'; import { CrawlDelayService } from './domain/services/CrawlDelayService'; import { InvalidUrlError } from './errors/InvalidUrlError'; @@ -13,7 +13,7 @@ import { RobotsDeniedError } from './errors/RobotsDeniedError'; import { HEADER_USER_AGENT, PROTOCOL_HTTP, PROTOCOL_HTTPS } from './constants'; export class RobotsInterceptor { - private dataService: IRobotsDataService; + private dataService: IRobotsDataRepository; private allowService: IAllowService; private crawlDelayService: ICrawlDelayService; private userAgent: string; @@ -22,7 +22,7 @@ export class RobotsInterceptor { constructor( options: RobotsPluginOptions, deps?: { - dataService?: IRobotsDataService, + dataService?: IRobotsDataRepository, allowService?: IAllowService, crawlDelayService?: ICrawlDelayService; } @@ -30,7 +30,7 @@ export class RobotsInterceptor { this.userAgent = options.userAgent; this.crawlDelayCompliance = options.crawlDelayCompliance ?? CrawlDelayComplianceMode.Await; - this.dataService = deps?.dataService ?? new RobotsDataService(); + this.dataService = deps?.dataService ?? new RobotsDataRepository(); this.allowService = deps?.allowService ?? new AllowService(this.dataService); this.crawlDelayService = deps?.crawlDelayService ?? 
new CrawlDelayService(this.dataService); } From a613ea5c4e12cd75d54aa15fa48f7ecf09ea3cfa Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 14 Jan 2026 20:12:57 +0000 Subject: [PATCH 10/12] bump version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index da769c7..715fd8a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "axios-robots", - "version": "0.1.0", + "version": "0.2.0", "description": "A lightweight Axios interceptor that enforces robots.txt compliance for web scrapers and bots", "main": "dist/index.js", "types": "dist/index.d.ts", From 3068cac9a0451ce61c9c6a4063dff24b7c5a0083 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 14 Jan 2026 20:22:02 +0000 Subject: [PATCH 11/12] refactor tests --- tests/{ => integration}/crawl-delay.test.ts | 31 +++++--- .../interceptor.test.ts} | 72 +++++++++++++++---- 2 files changed, 79 insertions(+), 24 deletions(-) rename tests/{ => integration}/crawl-delay.test.ts (80%) rename tests/{index.test.ts => integration/interceptor.test.ts} (68%) diff --git a/tests/crawl-delay.test.ts b/tests/integration/crawl-delay.test.ts similarity index 80% rename from tests/crawl-delay.test.ts rename to tests/integration/crawl-delay.test.ts index 458dd00..f3ff404 100644 --- a/tests/crawl-delay.test.ts +++ b/tests/integration/crawl-delay.test.ts @@ -1,6 +1,6 @@ import axios from 'axios'; import nock from 'nock'; -import { applyRobotsInterceptor, CrawlDelayComplianceMode } from '../src/index'; +import { applyRobotsInterceptor, CrawlDelayComplianceMode } from '../../src/index'; describe('Crawl-delay Compliance', () => { let client: ReturnType; @@ -10,7 +10,6 @@ describe('Crawl-delay Compliance', () => { beforeEach(() => { nock.cleanAll(); client = axios.create(); - // Exclude nextTick and setImmediate to prevent hanging nock/axios promises jest.useFakeTimers({ 
doNotFake: ['nextTick', 'setImmediate'] }); @@ -24,7 +23,11 @@ describe('Crawl-delay Compliance', () => { [1, 1000], [2, 2000], [3, 3000] - ])('GIVEN a robots.txt with Crawl-delay: %i WHEN making consecutive requests THEN the second request should wait at least %i ms', async (delaySeconds, expectedDelayMs) => { + ])(` +GIVEN a robots.txt with Crawl-delay: %i +WHEN making consecutive requests +THEN the second request should wait at least %i ms + `, async (delaySeconds, expectedDelayMs) => { applyRobotsInterceptor(client, { userAgent: USER_AGENT, crawlDelayCompliance: CrawlDelayComplianceMode.Await @@ -46,7 +49,6 @@ describe('Crawl-delay Compliance', () => { const requestPromise = client.get(`${DOMAIN}/two`); - // Fast-forward time to simulate the delay jest.advanceTimersByTime(expectedDelayMs); await requestPromise; @@ -56,7 +58,11 @@ describe('Crawl-delay Compliance', () => { expect(duration).toBeGreaterThanOrEqual(expectedDelayMs); }); - test('GIVEN a request fails WHEN making a subsequent request THEN it should still respect the Crawl-delay', async () => { + test(` +GIVEN a request fails +WHEN making a subsequent request +THEN it should still respect the Crawl-delay + `, async () => { applyRobotsInterceptor(client, { userAgent: USER_AGENT, crawlDelayCompliance: CrawlDelayComplianceMode.Await @@ -73,17 +79,14 @@ describe('Crawl-delay Compliance', () => { nock(DOMAIN).get('/fail').reply(500, 'Server Error'); nock(DOMAIN).get('/success').reply(200, 'Success'); - // First request fails try { await client.get(`${DOMAIN}/fail`); } catch (e) { - // Expected error } const afterFail = Date.now(); const requestPromise = client.get(`${DOMAIN}/success`); - // Fast-forward time to simulate the delay (2000ms) jest.advanceTimersByTime(2000); await requestPromise; @@ -93,7 +96,11 @@ describe('Crawl-delay Compliance', () => { expect(duration).toBeGreaterThanOrEqual(2000); }); - test('GIVEN crawlDelayCompliance is Ignore WHEN making consecutive requests THEN the second request 
should NOT wait', async () => { + test(` +GIVEN crawlDelayCompliance is Ignore +WHEN making consecutive requests +THEN the second request should NOT wait + `, async () => { applyRobotsInterceptor(client, { userAgent: USER_AGENT, crawlDelayCompliance: CrawlDelayComplianceMode.Ignore @@ -119,7 +126,11 @@ describe('Crawl-delay Compliance', () => { expect(duration).toBeLessThan(1000); }); - test('GIVEN crawlDelayCompliance is Failure WHEN making consecutive requests THEN the second request should throw', async () => { + test(` +GIVEN crawlDelayCompliance is Failure +WHEN making consecutive requests +THEN the second request should throw + `, async () => { applyRobotsInterceptor(client, { userAgent: USER_AGENT, crawlDelayCompliance: CrawlDelayComplianceMode.Failure diff --git a/tests/index.test.ts b/tests/integration/interceptor.test.ts similarity index 68% rename from tests/index.test.ts rename to tests/integration/interceptor.test.ts index 5504ff6..29179af 100644 --- a/tests/index.test.ts +++ b/tests/integration/interceptor.test.ts @@ -1,8 +1,8 @@ import axios from 'axios'; import nock from 'nock'; -import { applyRobotsInterceptor } from '../src/index'; -import { RobotsError } from '../src/errors/RobotsError'; -import { HEADER_USER_AGENT } from '../src/constants'; +import { applyRobotsInterceptor } from '../../src/index'; +import { RobotsError } from '../../src/errors/RobotsError'; +import { HEADER_USER_AGENT } from '../../src/constants'; describe('Axios Robots Interceptor', () => { let client: ReturnType; @@ -16,7 +16,11 @@ describe('Axios Robots Interceptor', () => { }); describe('RFC Compliance: Access Rules', () => { - test('GIVEN a robots.txt with a specific Disallow rule WHEN the bot requests a matching path THEN it should throw a RobotsError', async () => { + test(` +GIVEN a robots.txt with a specific Disallow rule +WHEN the bot requests a matching path +THEN it should throw a RobotsError + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -29,7 +33,11 @@ 
describe('Axios Robots Interceptor', () => { await expect(client.get(`${DOMAIN}/private`)).rejects.toThrow(RobotsError); }); - test('GIVEN a robots.txt with a specific Allow rule WHEN the bot requests a matching path THEN it should allow the request', async () => { + test(` +GIVEN a robots.txt with a specific Allow rule +WHEN the bot requests a matching path +THEN it should allow the request + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -45,7 +53,11 @@ describe('Axios Robots Interceptor', () => { expect(response.data).toBe('Public Data'); }); - test('GIVEN a robots.txt with a wildcard Disallow rule WHEN the bot requests a matching file THEN it should throw a RobotsError', async () => { + test(` +GIVEN a robots.txt with a wildcard Disallow rule +WHEN the bot requests a matching file +THEN it should throw a RobotsError + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -56,7 +68,11 @@ describe('Axios Robots Interceptor', () => { }); describe('RFC Compliance: User-Agent Matching', () => { - test('GIVEN a robots.txt with specific rules for TestBot WHEN TestBot requests a URL THEN it should follow the specific rules', async () => { + test(` +GIVEN a robots.txt with specific rules for TestBot +WHEN TestBot requests a URL +THEN it should follow the specific rules + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -76,7 +92,11 @@ describe('Axios Robots Interceptor', () => { }); describe('RFC Compliance: Status Codes (Availability)', () => { - test('GIVEN the robots.txt endpoint returns 404 (Not Found) WHEN a request is made THEN it should allow access', async () => { + test(` +GIVEN the robots.txt endpoint returns 404 (Not Found) +WHEN a request is made +THEN it should allow access + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -91,7 +111,11 @@ describe('Axios Robots Interceptor', () => { expect(response.status).toBe(200); }); - test('GIVEN the robots.txt endpoint returns 403 (Forbidden) WHEN a request is made THEN it should allow access 
(Unavailable = Allow)', async () => { + test(` +GIVEN the robots.txt endpoint returns 403 (Forbidden) +WHEN a request is made +THEN it should allow access (Unavailable = Allow) + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -106,7 +130,11 @@ describe('Axios Robots Interceptor', () => { expect(response.status).toBe(200); }); - test('GIVEN the robots.txt endpoint returns 500 (Internal Server Error) WHEN a request is made THEN it should throw a RobotsError (Unreachable = Disallow)', async () => { + test(` +GIVEN the robots.txt endpoint returns 500 (Internal Server Error) +WHEN a request is made +THEN it should throw a RobotsError (Unreachable = Disallow) + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -117,17 +145,29 @@ describe('Axios Robots Interceptor', () => { }); describe('Interceptor Logic & Safety', () => { - test('GIVEN an invalid URL WHEN a request is made THEN it should throw a RobotsError', async () => { + test(` +GIVEN an invalid URL +WHEN a request is made +THEN it should throw a RobotsError + `, async () => { await expect(client.get('not-a-url')).rejects.toThrow(/Invalid URL/); }); - test('GIVEN a non-HTTP protocol WHEN a request is made THEN it should throw a RobotsError', async () => { + test(` +GIVEN a non-HTTP protocol +WHEN a request is made +THEN it should throw a RobotsError + `, async () => { await expect(client.get('ftp://example.com/file')).rejects.toThrow(/Invalid protocol/); }); - test('GIVEN a valid config WHEN fetching robots.txt THEN it should send the configured User-Agent header', async () => { + test(` +GIVEN a valid config +WHEN fetching robots.txt +THEN it should send the configured User-Agent header + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -145,7 +185,11 @@ describe('Axios Robots Interceptor', () => { }); describe('Caching', () => { - test('GIVEN a cached robots.txt WHEN making a second request to the same origin THEN it should not make a second network request for robots.txt', async () => { + test(` 
+GIVEN a cached robots.txt +WHEN making a second request to the same origin +THEN it should not make a second network request for robots.txt + `, async () => { const scope = nock(DOMAIN) .get('/robots.txt') From ceedcffa30afe9682efc74d6d73b0022966c0c57 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 14 Jan 2026 20:22:29 +0000 Subject: [PATCH 12/12] add unit tests --- .../unit/domain/services/AllowService.test.ts | 86 +++++++++++++ .../AwaitCrawlDelayStrategy.test.ts | 52 ++++++++ .../CrawlDelayStrategyFactory.test.ts | 43 +++++++ .../FailureCrawlDelayStrategy.test.ts | 35 ++++++ .../IgnoreCrawlDelayStrategy.test.ts | 17 +++ .../usecases/CalculateWaitTimeUseCase.test.ts | 115 ++++++++++++++++++ 6 files changed, 348 insertions(+) create mode 100644 tests/unit/domain/services/AllowService.test.ts create mode 100644 tests/unit/domain/strategies/AwaitCrawlDelayStrategy.test.ts create mode 100644 tests/unit/domain/strategies/CrawlDelayStrategyFactory.test.ts create mode 100644 tests/unit/domain/strategies/FailureCrawlDelayStrategy.test.ts create mode 100644 tests/unit/domain/strategies/IgnoreCrawlDelayStrategy.test.ts create mode 100644 tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts diff --git a/tests/unit/domain/services/AllowService.test.ts b/tests/unit/domain/services/AllowService.test.ts new file mode 100644 index 0000000..0f63fa9 --- /dev/null +++ b/tests/unit/domain/services/AllowService.test.ts @@ -0,0 +1,86 @@ +import { AllowService } from '../../../../src/domain/services/AllowService'; +import { IRobotsDataRepository } from '../../../../src/domain/interfaces/IRobotsDataRepository'; +import { CachedRobot } from '../../../../src/domain/models/CachedRobot'; + +describe('AllowService', () => { + let service: AllowService; + let mockDataRepository: jest.Mocked; + + beforeEach(() => { + mockDataRepository = { + getRobot: jest.fn(), + setLastCrawled: jest.fn(), + }; + service = new 
AllowService(mockDataRepository); + }); + + test(` +GIVEN no robot data is found +WHEN checking if a URL is allowed +THEN it should return true (default allow) + `, async () => { + mockDataRepository.getRobot.mockResolvedValue(null as any); + + const result = await service.isAllowed('https://example.com/foo'); + + expect(result).toBe(true); + }); + + test(` +GIVEN robot data exists but has no robot object +WHEN checking if a URL is allowed +THEN it should return true + `, async () => { + mockDataRepository.getRobot.mockResolvedValue({ robot: null } as unknown as CachedRobot); + + const result = await service.isAllowed('https://example.com/foo'); + + expect(result).toBe(true); + }); + + test(` +GIVEN robot rules exist and allow the URL +WHEN checking if a URL is allowed +THEN it should return true + `, async () => { + const mockRobot = { + isAllowed: jest.fn().mockReturnValue(true) + }; + mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot } as unknown as CachedRobot); + + const result = await service.isAllowed('https://example.com/foo'); + + expect(result).toBe(true); + expect(mockRobot.isAllowed).toHaveBeenCalledWith('https://example.com/foo', '*'); + }); + + test(` +GIVEN robot rules exist and disallow the URL +WHEN checking if a URL is allowed +THEN it should return false + `, async () => { + const mockRobot = { + isAllowed: jest.fn().mockReturnValue(false) + }; + mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot } as unknown as CachedRobot); + + const result = await service.isAllowed('https://example.com/private'); + + expect(result).toBe(false); + }); + + test(` +GIVEN robot rules exist but isAllowed returns undefined +WHEN checking if a URL is allowed +THEN it should return true (default to allowed) + `, async () => { + const mockRobot = { + isAllowed: jest.fn().mockReturnValue(undefined) + }; + mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot } as unknown as CachedRobot); + + const result = await 
service.isAllowed('https://example.com/foo'); + + expect(result).toBe(true); + }); +}); diff --git a/tests/unit/domain/strategies/AwaitCrawlDelayStrategy.test.ts b/tests/unit/domain/strategies/AwaitCrawlDelayStrategy.test.ts new file mode 100644 index 0000000..d8a8a7f --- /dev/null +++ b/tests/unit/domain/strategies/AwaitCrawlDelayStrategy.test.ts @@ -0,0 +1,52 @@ +import { AwaitCrawlDelayStrategy } from '../../../../src/domain/strategies/AwaitCrawlDelayStrategy'; +import { CalculateWaitTimeUseCase } from '../../../../src/domain/usecases/CalculateWaitTimeUseCase'; + +describe('AwaitCrawlDelayStrategy', () => { + let strategy: AwaitCrawlDelayStrategy; + let mockUseCase: jest.Mocked; + + beforeEach(() => { + mockUseCase = { + execute: jest.fn(), + } as unknown as jest.Mocked; + strategy = new AwaitCrawlDelayStrategy(mockUseCase); + jest.useFakeTimers({ + doNotFake: ['nextTick', 'setImmediate'] + }); + }); + + afterEach(() => { + jest.useRealTimers(); + }); + + test(` +GIVEN wait time is 0 +WHEN executing strategy +THEN it should return immediately without waiting + `, async () => { + mockUseCase.execute.mockResolvedValue({ waitTime: 0, delay: 5 }); + const setTimeoutSpy = jest.spyOn(global, 'setTimeout'); + + await strategy.execute('https://example.com', '*'); + + expect(setTimeoutSpy).not.toHaveBeenCalled(); + }); + + test(` +GIVEN wait time is greater than 0 +WHEN executing strategy +THEN it should wait for the specified time + `, async () => { + const waitTime = 1000; + mockUseCase.execute.mockResolvedValue({ waitTime, delay: 5 }); + + const executePromise = strategy.execute('https://example.com', '*'); + + await Promise.resolve(); + await Promise.resolve(); + + jest.advanceTimersByTime(waitTime); + + await executePromise; + }); +}); diff --git a/tests/unit/domain/strategies/CrawlDelayStrategyFactory.test.ts b/tests/unit/domain/strategies/CrawlDelayStrategyFactory.test.ts new file mode 100644 index 0000000..7d6602c --- /dev/null +++ 
b/tests/unit/domain/strategies/CrawlDelayStrategyFactory.test.ts @@ -0,0 +1,43 @@ +import { CrawlDelayStrategyFactory } from '../../../../src/domain/strategies/CrawlDelayStrategyFactory'; +import { AwaitCrawlDelayStrategy } from '../../../../src/domain/strategies/AwaitCrawlDelayStrategy'; +import { FailureCrawlDelayStrategy } from '../../../../src/domain/strategies/FailureCrawlDelayStrategy'; +import { IgnoreCrawlDelayStrategy } from '../../../../src/domain/strategies/IgnoreCrawlDelayStrategy'; +import { CrawlDelayComplianceMode } from '../../../../src/domain/models/CrawlDelayComplianceMode'; +import { CalculateWaitTimeUseCase } from '../../../../src/domain/usecases/CalculateWaitTimeUseCase'; + +describe('CrawlDelayStrategyFactory', () => { + let factory: CrawlDelayStrategyFactory; + let mockUseCase: CalculateWaitTimeUseCase; + + beforeEach(() => { + mockUseCase = {} as CalculateWaitTimeUseCase; + factory = new CrawlDelayStrategyFactory(mockUseCase); + }); + + test(` +GIVEN mode is Await +WHEN getting strategy +THEN it should return AwaitCrawlDelayStrategy + `, () => { + const strategy = factory.getStrategy(CrawlDelayComplianceMode.Await); + expect(strategy).toBeInstanceOf(AwaitCrawlDelayStrategy); + }); + + test(` +GIVEN mode is Failure +WHEN getting strategy +THEN it should return FailureCrawlDelayStrategy + `, () => { + const strategy = factory.getStrategy(CrawlDelayComplianceMode.Failure); + expect(strategy).toBeInstanceOf(FailureCrawlDelayStrategy); + }); + + test(` +GIVEN mode is Ignore +WHEN getting strategy +THEN it should return IgnoreCrawlDelayStrategy + `, () => { + const strategy = factory.getStrategy(CrawlDelayComplianceMode.Ignore); + expect(strategy).toBeInstanceOf(IgnoreCrawlDelayStrategy); + }); +}); diff --git a/tests/unit/domain/strategies/FailureCrawlDelayStrategy.test.ts b/tests/unit/domain/strategies/FailureCrawlDelayStrategy.test.ts new file mode 100644 index 0000000..eddbbad --- /dev/null +++ 
b/tests/unit/domain/strategies/FailureCrawlDelayStrategy.test.ts
@@ -0,0 +1,40 @@
+import { FailureCrawlDelayStrategy } from '../../../../src/domain/strategies/FailureCrawlDelayStrategy';
+import { CalculateWaitTimeUseCase } from '../../../../src/domain/usecases/CalculateWaitTimeUseCase';
+import { CrawlDelayError } from '../../../../src/errors/CrawlDelayError';
+
+/**
+ * Unit tests for FailureCrawlDelayStrategy, with the wait-time use case mocked:
+ * a zero wait time must resolve, a positive one must reject with CrawlDelayError.
+ */
+describe('FailureCrawlDelayStrategy', () => {
+  let strategy: FailureCrawlDelayStrategy;
+  let mockUseCase: jest.Mocked<CalculateWaitTimeUseCase>;
+
+  beforeEach(() => {
+    // Only `execute` is stubbed, hence the double cast to the mocked type.
+    mockUseCase = {
+      execute: jest.fn(),
+    } as unknown as jest.Mocked<CalculateWaitTimeUseCase>;
+    strategy = new FailureCrawlDelayStrategy(mockUseCase);
+  });
+
+  test(`
+GIVEN wait time is 0
+WHEN executing strategy
+THEN it should return successfully
+  `, async () => {
+    mockUseCase.execute.mockResolvedValue({ waitTime: 0, delay: 5 });
+
+    await expect(strategy.execute('https://example.com', '*')).resolves.not.toThrow();
+  });
+
+  test(`
+GIVEN wait time is greater than 0
+WHEN executing strategy
+THEN it should throw a CrawlDelayError
+  `, async () => {
+    mockUseCase.execute.mockResolvedValue({ waitTime: 1000, delay: 5 });
+
+    await expect(strategy.execute('https://example.com', '*')).rejects.toThrow(CrawlDelayError);
+  });
+});
diff --git a/tests/unit/domain/strategies/IgnoreCrawlDelayStrategy.test.ts b/tests/unit/domain/strategies/IgnoreCrawlDelayStrategy.test.ts
new file mode 100644
index 0000000..351e4d5
--- /dev/null
+++ b/tests/unit/domain/strategies/IgnoreCrawlDelayStrategy.test.ts
@@ -0,0 +1,21 @@
+import { IgnoreCrawlDelayStrategy } from '../../../../src/domain/strategies/IgnoreCrawlDelayStrategy';
+
+/**
+ * Unit test for IgnoreCrawlDelayStrategy: execute() is expected to resolve
+ * without any collaborator being supplied to the strategy.
+ */
+describe('IgnoreCrawlDelayStrategy', () => {
+  let strategy: IgnoreCrawlDelayStrategy;
+
+  beforeEach(() => {
+    strategy = new IgnoreCrawlDelayStrategy();
+  });
+
+  test(`
+GIVEN any conditions
+WHEN executing strategy
+THEN it should return successfully (no op)
+  `, async () => {
+    await expect(strategy.execute('https://example.com', '*')).resolves.not.toThrow();
+  });
+});
diff --git a/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts b/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts
new file mode 100644
index 0000000..215dd1f
--- /dev/null
+++ b/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts
@@ -0,0 +1,128 @@
+import { CalculateWaitTimeUseCase } from '../../../../src/domain/usecases/CalculateWaitTimeUseCase';
+import { IRobotsDataRepository } from '../../../../src/domain/interfaces/IRobotsDataRepository';
+import { CachedRobot } from '../../../../src/domain/models/CachedRobot';
+
+/**
+ * Unit tests for CalculateWaitTimeUseCase against a mocked robots data
+ * repository. `getCrawlDelay` is mocked in seconds while `waitTime` and
+ * `lastCrawled` are compared in milliseconds.
+ */
+describe('CalculateWaitTimeUseCase', () => {
+  let useCase: CalculateWaitTimeUseCase;
+  let mockDataRepository: jest.Mocked<IRobotsDataRepository>;
+
+  beforeEach(() => {
+    mockDataRepository = {
+      getRobot: jest.fn(),
+      setLastCrawled: jest.fn(),
+    };
+    useCase = new CalculateWaitTimeUseCase(mockDataRepository);
+  });
+
+  afterEach(() => {
+    // Undo the Date.now spies so a frozen clock cannot leak into other tests.
+    jest.restoreAllMocks();
+  });
+
+  test(`
+GIVEN no robot data
+WHEN calculating wait time
+THEN it should return 0 wait time
+  `, async () => {
+    mockDataRepository.getRobot.mockResolvedValue(null as unknown as CachedRobot);
+
+    const result = await useCase.execute('https://example.com', '*');
+
+    expect(result).toEqual({ waitTime: 0, delay: 0 });
+  });
+
+  test(`
+GIVEN robot data with no crawl delay
+WHEN calculating wait time
+THEN it should return 0 wait time
+  `, async () => {
+    const mockRobot = {
+      getCrawlDelay: jest.fn().mockReturnValue(undefined)
+    };
+    mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot } as unknown as CachedRobot);
+
+    const result = await useCase.execute('https://example.com', '*');
+
+    expect(result).toEqual({ waitTime: 0, delay: 0 });
+  });
+
+  test(`
+GIVEN robot data with crawl delay but never crawled before
+WHEN calculating wait time
+THEN it should return 0 wait time
+  `, async () => {
+    const mockRobot = {
+      getCrawlDelay: jest.fn().mockReturnValue(5)
+    };
+    mockDataRepository.getRobot.mockResolvedValue({
+      robot: mockRobot,
+      lastCrawled: undefined
+    } as unknown as CachedRobot);
+
+    const result = await useCase.execute('https://example.com', '*');
+
+    // NOTE(review): `delay` is expected to be 0 here although the mocked
+    // crawl delay is 5 - confirm this is the intended contract.
+    expect(result).toEqual({ waitTime: 0, delay: 0 });
+  });
+
+  test(`
+GIVEN robot data with crawl delay and previously crawled recently
+WHEN calculating wait time
+THEN it should return the remaining wait time
+  `, async () => {
+    const delaySeconds = 2;
+    const mockRobot = {
+      getCrawlDelay: jest.fn().mockReturnValue(delaySeconds)
+    };
+
+    // Crawled 1000ms ago, delay is 2000ms, should wait 1000ms
+    const now = Date.now();
+    const lastCrawled = now - 1000;
+
+    // Freeze the clock so the expected remainder is exact.
+    jest.spyOn(Date, 'now').mockReturnValue(now);
+
+    mockDataRepository.getRobot.mockResolvedValue({
+      robot: mockRobot,
+      lastCrawled
+    } as unknown as CachedRobot);
+
+    const result = await useCase.execute('https://example.com', '*');
+
+    expect(result.waitTime).toBe(1000);
+    expect(result.delay).toBe(delaySeconds);
+  });
+
+  test(`
+GIVEN robot data with crawl delay and previously crawled long ago
+WHEN calculating wait time
+THEN it should return 0 wait time
+  `, async () => {
+    const delaySeconds = 2;
+    const mockRobot = {
+      getCrawlDelay: jest.fn().mockReturnValue(delaySeconds)
+    };
+
+    // Crawled 3000ms ago, delay is 2000ms, should wait 0ms
+    const now = Date.now();
+    const lastCrawled = now - 3000;
+
+    jest.spyOn(Date, 'now').mockReturnValue(now);
+
+    mockDataRepository.getRobot.mockResolvedValue({
+      robot: mockRobot,
+      lastCrawled
+    } as unknown as CachedRobot);
+
+    const result = await useCase.execute('https://example.com', '*');
+
+    expect(result.waitTime).toBe(0);
+    expect(result.delay).toBe(delaySeconds);
+  });
+});