diff --git a/README.md b/README.md index 9288beb..3f0bc20 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ Ensures your bot plays by the rules defined by website owners, preventing unauth ## Features - **🚀 Automated Compliance**: Validates every request against `robots.txt` rules (cached per origin). +- **⏱️ Crawl-Delay**: Option to automatically wait before requests if `Crawl-delay` is specified. - **🛡️ Strict Mode**: invalid URLs, non-HTTP/S protocols, or unreachable `robots.txt` files (non-4xx error) block requests by default. - **✨ Clean Architecture**: built with maintainability and separation of concerns in mind. - **🔌 Plug-and-Play**: easily attaches to any Axios instance. @@ -43,7 +44,7 @@ const client = axios.create(); // Apply the interceptor applyRobotsInterceptor(client, { - userAgent: 'MyCoolBot/1.0' + userAgent: 'MyCoolBot/1.0', }); async function crawl() { @@ -81,6 +82,13 @@ Attaches the interceptor to the provided Axios instance. ```typescript interface RobotsPluginOptions { userAgent: string; + crawlDelayCompliance?: CrawlDelayComplianceMode; // default: CrawlDelayComplianceMode.Await +} + +enum CrawlDelayComplianceMode { + Await = 'await', // Respects delay by waiting + Ignore = 'ignore', // Ignores delay + Failure = 'failure' // Throws Error if delay is not met } ``` @@ -111,9 +119,9 @@ The interceptor throws a `RobotsError` in the following cases: - [x] **[RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html) Compliance**: Full support for the standard Robots Exclusion Protocol. - [x] **Standard Directives**: Supports `User-agent`, `Allow`, and `Disallow`. - [x] **Wildcards**: Supports standard path matching including `*` and `$`. +- [x] **Crawl-delay**: The interceptor enforces `Crawl-delay` directives (automatic throttling) if configured. ### 🚧 Missing / TODO -- [ ] **Crawl-delay**: The interceptor currently does **not** enforce `Crawl-delay` directives (automatic throttling). 
- [ ] **Sitemap**: Does not currently expose or parse `Sitemap` directives for the consumer. - [ ] **Cache TTL**: Caching is currently indefinite for the lifecycle of the Axios instance. diff --git a/package.json b/package.json index da769c7..715fd8a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "axios-robots", - "version": "0.1.0", + "version": "0.2.0", "description": "A lightweight Axios interceptor that enforces robots.txt compliance for web scrapers and bots", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/src/data/repositories/RobotsDataRepository.ts b/src/data/repositories/RobotsDataRepository.ts new file mode 100644 index 0000000..4bd0ac7 --- /dev/null +++ b/src/data/repositories/RobotsDataRepository.ts @@ -0,0 +1,57 @@ +import robotsParser, { Robot } from 'robots-parser'; +import axios from 'axios'; +import { HEADER_USER_AGENT, ROBOTS_TXT_FILENAME, ALLOW_ALL_ROBOTS_TXT_CONTENT } from '../../constants'; +import { RobotsUnreachableError } from '../../errors/RobotsUnreachableError'; +import { IRobotsDataRepository } from '../../domain/interfaces/IRobotsDataRepository'; +import { CachedRobot } from '../../domain/models/CachedRobot'; + +export class RobotsDataRepository implements IRobotsDataRepository { + private cache: Map = new Map(); + + async getRobot(url: string, userAgent: string = '*'): Promise { + const origin = new URL(url).origin; + let cached = this.cache.get(origin); + + if (cached) + return cached; + + const robot = await this.fetchRobotsTxt(origin, userAgent); + cached = { robot }; + this.cache.set(origin, cached); + + return cached; + } + + setLastCrawled(url: string, timestamp: number): void { + const origin = new URL(url).origin; + const cached = this.cache.get(origin); + if (cached) { + cached.lastCrawled = timestamp; + } + } + + private async fetchRobotsTxt(origin: string, userAgent: string): Promise { + const robotsUrl = `${origin}/${ROBOTS_TXT_FILENAME}`; + + const internalClient = axios.create({ 
+ headers: { + [HEADER_USER_AGENT]: userAgent, + } + }); + + try { + const response = await internalClient.get(robotsUrl); + return robotsParser(robotsUrl, response.data); + } catch (error: any) { + if (this.isUnavailable(error)) { + return robotsParser(robotsUrl, ALLOW_ALL_ROBOTS_TXT_CONTENT); + } + + throw new RobotsUnreachableError(error.message); + } + } + + private isUnavailable(error: any): boolean { + return error.response && error.response.status >= 400 && error.response.status < 500; + } +} diff --git a/src/domain/RobotsService.ts b/src/domain/RobotsService.ts deleted file mode 100644 index fbbeb24..0000000 --- a/src/domain/RobotsService.ts +++ /dev/null @@ -1,52 +0,0 @@ -import robotsParser, { Robot } from 'robots-parser'; -import axios from 'axios'; -import { HEADER_USER_AGENT, ROBOTS_TXT_FILENAME, ALLOW_ALL_ROBOTS_TXT_CONTENT } from '../constants'; -import { RobotsError } from '../errors/RobotsError'; -import { ERROR_MESSAGES } from '../errors/messages'; - -import { IRobotsService } from '../types'; - -export class RobotsService implements IRobotsService { - private cache: Map = new Map(); - - /** - * Checks if the given URL is allowed for the specified User-Agent. - * Fetching and caching the robots.txt is handled automatically. - */ - async isAllowed(url: string, userAgent: string = '*'): Promise { - const origin = new URL(url).origin; - let robot = this.cache.get(origin); - - if (!robot) { - robot = await this.fetchRobotsTxt(origin, userAgent); - this.cache.set(origin, robot); - } - - return robot.isAllowed(url, userAgent) ?? 
true; - } - - private async fetchRobotsTxt(origin: string, userAgent: string): Promise { - const robotsUrl = `${origin}/${ROBOTS_TXT_FILENAME}`; - - const internalClient = axios.create({ - headers: { - [HEADER_USER_AGENT]: userAgent, - } - }); - - try { - const response = await internalClient.get(robotsUrl); - return robotsParser(robotsUrl, response.data); - } catch (error: any) { - if (this.isUnavailable(error)) { - return robotsParser(robotsUrl, ALLOW_ALL_ROBOTS_TXT_CONTENT); - } - - throw new RobotsError(ERROR_MESSAGES.ROBOTS_UNREACHABLE(error.message)); - } - } - - private isUnavailable(error: any): boolean { - return error.response && error.response.status >= 400 && error.response.status < 500; - } -} diff --git a/src/domain/interfaces/IAllowService.ts b/src/domain/interfaces/IAllowService.ts new file mode 100644 index 0000000..9d1f030 --- /dev/null +++ b/src/domain/interfaces/IAllowService.ts @@ -0,0 +1,12 @@ +/** + * Service for checking if a URL is allowed to be crawled according to robots.txt rules. + */ +export interface IAllowService { + /** + * Checks if the given URL is allowed for the specified user agent. + * @param url The URL to check. + * @param userAgent The user agent to check against. + * @returns A promise resolving to true if allowed, false otherwise. + */ + isAllowed(url: string, userAgent?: string): Promise; +} diff --git a/src/domain/interfaces/ICrawlDelayService.ts b/src/domain/interfaces/ICrawlDelayService.ts new file mode 100644 index 0000000..f701299 --- /dev/null +++ b/src/domain/interfaces/ICrawlDelayService.ts @@ -0,0 +1,15 @@ +import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode'; + +/** + * Service for handling Crawl-delay directives from robots.txt. + */ +export interface ICrawlDelayService { + /** + * Enforces the crawl delay for a given URL based on the compliance mode. + * @param url The URL about to be requested. + * @param userAgent The user agent to check rules for. 
+ * @param complianceMode The mode determining how to handle the delay (Await, Ignore, Failure). + * @returns A promise that resolves when it is safe to proceed (or throws if in Failure mode). + */ + handleCrawlDelay(url: string, userAgent: string, complianceMode: CrawlDelayComplianceMode): Promise; +} diff --git a/src/domain/interfaces/IRobotsDataRepository.ts b/src/domain/interfaces/IRobotsDataRepository.ts new file mode 100644 index 0000000..851d534 --- /dev/null +++ b/src/domain/interfaces/IRobotsDataRepository.ts @@ -0,0 +1,21 @@ +import { CachedRobot } from '../models/CachedRobot'; + +/** + * Repository for managing robots.txt data and crawl timestamps independently of the protocol logic. + */ +export interface IRobotsDataRepository { + /** + * Retrieves the cached robot instance for a given URL. + * @param url The URL to get the robot for (used to extract the domain/origin). + * @param userAgent Optional user agent to use for fetching robots.txt if not cached. + * @returns A promise resolving to the CachedRobot containing the parsed rules. + */ + getRobot(url: string, userAgent?: string): Promise; + + /** + * Updates the last crawled timestamp for the domain associated with the URL. + * @param url The URL identifying the domain. + * @param timestamp The timestamp to set. + */ + setLastCrawled(url: string, timestamp: number): void; +} diff --git a/src/domain/models/CachedRobot.ts b/src/domain/models/CachedRobot.ts new file mode 100644 index 0000000..cf1dc81 --- /dev/null +++ b/src/domain/models/CachedRobot.ts @@ -0,0 +1,12 @@ +import { Robot } from 'robots-parser'; + +export interface CachedRobot { + /** + * The parsed robots.txt object. + */ + robot: Robot; + /** + * Timestamp of the last crawl for this domain. 
+ */ + lastCrawled?: number; +} diff --git a/src/domain/models/CrawlDelayComplianceMode.ts b/src/domain/models/CrawlDelayComplianceMode.ts new file mode 100644 index 0000000..5a8763d --- /dev/null +++ b/src/domain/models/CrawlDelayComplianceMode.ts @@ -0,0 +1,17 @@ +/** + * Compliance modes for handling the Crawl-delay directive from robots.txt. + */ +export enum CrawlDelayComplianceMode { + /** + * Respects the Crawl-delay directive by waiting before making the request. + */ + Await = 'await', + /** + * Ignores the Crawl-delay directive. + */ + Ignore = 'ignore', + /** + * Throws an error if the request violates the Crawl-delay. + */ + Failure = 'failure' +} diff --git a/src/domain/models/RobotsPluginOptions.ts b/src/domain/models/RobotsPluginOptions.ts new file mode 100644 index 0000000..7522d11 --- /dev/null +++ b/src/domain/models/RobotsPluginOptions.ts @@ -0,0 +1,13 @@ +import { CrawlDelayComplianceMode } from './CrawlDelayComplianceMode'; + +export interface RobotsPluginOptions { + /** + * The User-Agent string to use when checking robots.txt rules. + */ + userAgent: string; + /** + * How to handle Crawl-delay directives. + * Defaults to CrawlDelayComplianceMode.Await + */ + crawlDelayCompliance?: CrawlDelayComplianceMode; +} diff --git a/src/domain/services/AllowService.ts b/src/domain/services/AllowService.ts new file mode 100644 index 0000000..580b40b --- /dev/null +++ b/src/domain/services/AllowService.ts @@ -0,0 +1,16 @@ +import { IAllowService } from '../interfaces/IAllowService'; +import { IRobotsDataRepository } from '../interfaces/IRobotsDataRepository'; + +export class AllowService implements IAllowService { + constructor(private dataService: IRobotsDataRepository) { } + + async isAllowed(url: string, userAgent: string = '*'): Promise { + const robot = await this.dataService.getRobot(url, userAgent); + + if (!robot || !robot.robot) { + return true; + } + + return robot.robot.isAllowed(url, userAgent) ?? 
true; + } +} diff --git a/src/domain/services/CrawlDelayService.ts b/src/domain/services/CrawlDelayService.ts new file mode 100644 index 0000000..3753912 --- /dev/null +++ b/src/domain/services/CrawlDelayService.ts @@ -0,0 +1,24 @@ +import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode'; +import { ICrawlDelayService } from '../interfaces/ICrawlDelayService'; +import { IRobotsDataRepository } from '../interfaces/IRobotsDataRepository'; +import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase'; +import { CrawlDelayStrategyFactory } from '../strategies/CrawlDelayStrategyFactory'; + +export class CrawlDelayService implements ICrawlDelayService { + private calculateWaitTimeUseCase: CalculateWaitTimeUseCase; + private strategyFactory: CrawlDelayStrategyFactory; + + constructor(private dataService: IRobotsDataRepository) { + this.calculateWaitTimeUseCase = new CalculateWaitTimeUseCase(dataService); + this.strategyFactory = new CrawlDelayStrategyFactory(this.calculateWaitTimeUseCase); + } + + async handleCrawlDelay( + url: string, + userAgent: string, + complianceMode: CrawlDelayComplianceMode + ): Promise { + const strategy = this.strategyFactory.getStrategy(complianceMode); + await strategy.execute(url, userAgent); + } +} diff --git a/src/domain/strategies/AwaitCrawlDelayStrategy.ts b/src/domain/strategies/AwaitCrawlDelayStrategy.ts new file mode 100644 index 0000000..5c27ca2 --- /dev/null +++ b/src/domain/strategies/AwaitCrawlDelayStrategy.ts @@ -0,0 +1,17 @@ + +import { ICrawlDelayStrategy } from './ICrawlDelayStrategy'; +import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase'; + +export class AwaitCrawlDelayStrategy implements ICrawlDelayStrategy { + constructor(private calculateWaitTimeUseCase: CalculateWaitTimeUseCase) { } + + async execute(url: string, userAgent: string): Promise { + const { waitTime } = await this.calculateWaitTimeUseCase.execute(url, userAgent); + + if (waitTime <= 0) + return; 
+ + + await new Promise(resolve => setTimeout(resolve, waitTime)); + } +} diff --git a/src/domain/strategies/CrawlDelayStrategyFactory.ts b/src/domain/strategies/CrawlDelayStrategyFactory.ts new file mode 100644 index 0000000..0443d44 --- /dev/null +++ b/src/domain/strategies/CrawlDelayStrategyFactory.ts @@ -0,0 +1,22 @@ + +import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode'; +import { ICrawlDelayStrategy } from './ICrawlDelayStrategy'; +import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase'; +import { AwaitCrawlDelayStrategy } from './AwaitCrawlDelayStrategy'; +import { FailureCrawlDelayStrategy } from './FailureCrawlDelayStrategy'; +import { IgnoreCrawlDelayStrategy } from './IgnoreCrawlDelayStrategy'; + +export class CrawlDelayStrategyFactory { + constructor(private calculateWaitTimeUseCase: CalculateWaitTimeUseCase) { } + + getStrategy(mode: CrawlDelayComplianceMode): ICrawlDelayStrategy { + switch (mode) { + case CrawlDelayComplianceMode.Failure: + return new FailureCrawlDelayStrategy(this.calculateWaitTimeUseCase); + case CrawlDelayComplianceMode.Ignore: + return new IgnoreCrawlDelayStrategy(); + case CrawlDelayComplianceMode.Await: + return new AwaitCrawlDelayStrategy(this.calculateWaitTimeUseCase); + } + } +} diff --git a/src/domain/strategies/FailureCrawlDelayStrategy.ts b/src/domain/strategies/FailureCrawlDelayStrategy.ts new file mode 100644 index 0000000..5b6c25e --- /dev/null +++ b/src/domain/strategies/FailureCrawlDelayStrategy.ts @@ -0,0 +1,16 @@ + +import { ICrawlDelayStrategy } from './ICrawlDelayStrategy'; +import { CalculateWaitTimeUseCase } from '../usecases/CalculateWaitTimeUseCase'; +import { CrawlDelayError } from '../../errors/CrawlDelayError'; + +export class FailureCrawlDelayStrategy implements ICrawlDelayStrategy { + constructor(private calculateWaitTimeUseCase: CalculateWaitTimeUseCase) { } + + async execute(url: string, userAgent: string): Promise { + const { waitTime, delay } = await 
this.calculateWaitTimeUseCase.execute(url, userAgent); + + if (waitTime <= 0) return; + + throw new CrawlDelayError(delay); + } +} diff --git a/src/domain/strategies/ICrawlDelayStrategy.ts b/src/domain/strategies/ICrawlDelayStrategy.ts new file mode 100644 index 0000000..12c0677 --- /dev/null +++ b/src/domain/strategies/ICrawlDelayStrategy.ts @@ -0,0 +1,12 @@ + +/** + * Strategy for ensuring compliance with Crawl-delay rules. + */ +export interface ICrawlDelayStrategy { + /** + * Executes the strategy for a given URL and user agent. + * @param url The URL about to be crawled. + * @param userAgent The user agent for which to check the rules. + */ + execute(url: string, userAgent: string): Promise; +} diff --git a/src/domain/strategies/IgnoreCrawlDelayStrategy.ts b/src/domain/strategies/IgnoreCrawlDelayStrategy.ts new file mode 100644 index 0000000..bb141a7 --- /dev/null +++ b/src/domain/strategies/IgnoreCrawlDelayStrategy.ts @@ -0,0 +1,8 @@ + +import { ICrawlDelayStrategy } from './ICrawlDelayStrategy'; + +export class IgnoreCrawlDelayStrategy implements ICrawlDelayStrategy { + async execute(url: string, userAgent: string): Promise { + return; + } +} diff --git a/src/domain/usecases/CalculateWaitTimeUseCase.ts b/src/domain/usecases/CalculateWaitTimeUseCase.ts new file mode 100644 index 0000000..9d0f640 --- /dev/null +++ b/src/domain/usecases/CalculateWaitTimeUseCase.ts @@ -0,0 +1,27 @@ + +import { IRobotsDataRepository } from '../interfaces/IRobotsDataRepository'; + +export class CalculateWaitTimeUseCase { + constructor(private dataService: IRobotsDataRepository) { } + + async execute(url: string, userAgent: string): Promise<{ waitTime: number; delay: number; }> { + const cachedRobot = await this.dataService.getRobot(url, userAgent); + + if (!cachedRobot || !cachedRobot.robot) { + return { waitTime: 0, delay: 0 }; + } + + const delay = cachedRobot.robot.getCrawlDelay(userAgent); + if (!delay || delay <= 0 || !cachedRobot.lastCrawled) { + return { waitTime: 0, delay: 
0 }; + } + + const timeSinceLastCrawl = Date.now() - cachedRobot.lastCrawled; + const waitTime = (delay * 1000) - timeSinceLastCrawl; + + return { + waitTime: waitTime > 0 ? waitTime : 0, + delay + }; + } +} diff --git a/src/errors/CrawlDelayError.ts b/src/errors/CrawlDelayError.ts new file mode 100644 index 0000000..331c7b4 --- /dev/null +++ b/src/errors/CrawlDelayError.ts @@ -0,0 +1,9 @@ +import { RobotsError } from './RobotsError'; +import { ERROR_MESSAGES } from './messages'; + +export class CrawlDelayError extends RobotsError { + constructor(delay: number) { + super(ERROR_MESSAGES.ROBOTS_CRAWL_DELAY(delay)); + this.name = 'CrawlDelayError'; + } +} diff --git a/src/errors/InvalidProtocolError.ts b/src/errors/InvalidProtocolError.ts new file mode 100644 index 0000000..5f4e9df --- /dev/null +++ b/src/errors/InvalidProtocolError.ts @@ -0,0 +1,9 @@ +import { RobotsError } from './RobotsError'; +import { ERROR_MESSAGES } from './messages'; + +export class InvalidProtocolError extends RobotsError { + constructor(protocol: string) { + super(ERROR_MESSAGES.INVALID_PROTOCOL(protocol)); + this.name = 'InvalidProtocolError'; + } +} diff --git a/src/errors/InvalidUrlError.ts b/src/errors/InvalidUrlError.ts new file mode 100644 index 0000000..c199ddc --- /dev/null +++ b/src/errors/InvalidUrlError.ts @@ -0,0 +1,9 @@ +import { RobotsError } from './RobotsError'; +import { ERROR_MESSAGES } from './messages'; + +export class InvalidUrlError extends RobotsError { + constructor(details: string) { + super(ERROR_MESSAGES.INVALID_URL(details)); + this.name = 'InvalidUrlError'; + } +} diff --git a/src/errors/RobotsDeniedError.ts b/src/errors/RobotsDeniedError.ts new file mode 100644 index 0000000..ebeb7a6 --- /dev/null +++ b/src/errors/RobotsDeniedError.ts @@ -0,0 +1,9 @@ +import { RobotsError } from './RobotsError'; +import { ERROR_MESSAGES } from './messages'; + +export class RobotsDeniedError extends RobotsError { + constructor(url: string, userAgent: string) { + 
super(ERROR_MESSAGES.ROBOTS_DENIED(url, userAgent)); + this.name = 'RobotsDeniedError'; + } +} diff --git a/src/errors/RobotsUnreachableError.ts b/src/errors/RobotsUnreachableError.ts new file mode 100644 index 0000000..d5b4828 --- /dev/null +++ b/src/errors/RobotsUnreachableError.ts @@ -0,0 +1,9 @@ +import { RobotsError } from './RobotsError'; +import { ERROR_MESSAGES } from './messages'; + +export class RobotsUnreachableError extends RobotsError { + constructor(details: string) { + super(ERROR_MESSAGES.ROBOTS_UNREACHABLE(details)); + this.name = 'RobotsUnreachableError'; + } +} diff --git a/src/errors/index.ts b/src/errors/index.ts new file mode 100644 index 0000000..20547b3 --- /dev/null +++ b/src/errors/index.ts @@ -0,0 +1,7 @@ +export * from './RobotsError'; +export * from './CrawlDelayError'; +export * from './InvalidUrlError'; +export * from './InvalidProtocolError'; +export * from './RobotsDeniedError'; +export * from './RobotsUnreachableError'; +export * from './messages'; diff --git a/src/errors/messages.ts b/src/errors/messages.ts index 6bb8c0a..ed2db9e 100644 --- a/src/errors/messages.ts +++ b/src/errors/messages.ts @@ -3,5 +3,6 @@ export const ERROR_MESSAGES = { INVALID_PROTOCOL: (protocol: string) => `Invalid protocol: ${protocol}. 
Only HTTP/S is supported for robots.txt compliance.`, ROBOTS_DENIED: (url: string, userAgent: string) => `URL ${url} is assumed to be disallowed by robots.txt for User-Agent ${userAgent}`, ROBOTS_UNREACHABLE: (details: string) => `Unable to fetch robots.txt: ${details}`, + ROBOTS_CRAWL_DELAY: (delay: number) => `Request blocked: Crawl-delay of ${delay}s has not been met.`, DEFAULT_BLOCK: 'Request blocked by robots.txt', }; diff --git a/src/index.ts b/src/index.ts index 1752e8d..004ed75 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,11 +1,18 @@ import { AxiosInstance } from 'axios'; import { RobotsInterceptor } from './interceptor'; -import { RobotsPluginOptions } from './types'; +import { RobotsPluginOptions } from './domain/models/RobotsPluginOptions'; -export * from './domain/RobotsService'; -export * from './errors/RobotsError'; +export * from './data/repositories/RobotsDataRepository'; +export * from './domain/services/AllowService'; +export * from './domain/services/CrawlDelayService'; +export * from './errors'; export * from './interceptor'; -export * from './types'; +export * from './domain/models/RobotsPluginOptions'; +export * from './domain/models/CrawlDelayComplianceMode'; +export * from './domain/models/CachedRobot'; +export * from './domain/interfaces/IRobotsDataRepository'; +export * from './domain/interfaces/IAllowService'; +export * from './domain/interfaces/ICrawlDelayService'; /** * Apply the robots exclusion protocol interceptor to an Axios instance. 
@@ -15,4 +22,8 @@ export * from './types'; export function applyRobotsInterceptor(axiosInstance: AxiosInstance, options: RobotsPluginOptions): void { const interceptor = new RobotsInterceptor(options); axiosInstance.interceptors.request.use((config) => interceptor.intercept(config)); + axiosInstance.interceptors.response.use( + (response) => interceptor.interceptResponse(response), + (error) => interceptor.interceptResponseError(error) + ); } diff --git a/src/interceptor.ts b/src/interceptor.ts index b2c484e..f6b09e8 100644 --- a/src/interceptor.ts +++ b/src/interceptor.ts @@ -1,17 +1,38 @@ -import { InternalAxiosRequestConfig } from 'axios'; -import { IRobotsService, RobotsPluginOptions } from './types'; -import { RobotsService } from './domain/RobotsService'; -import { RobotsError } from './errors/RobotsError'; +import { InternalAxiosRequestConfig, AxiosResponse } from 'axios'; +import { RobotsPluginOptions } from './domain/models/RobotsPluginOptions'; +import { CrawlDelayComplianceMode } from './domain/models/CrawlDelayComplianceMode'; +import { IRobotsDataRepository } from './domain/interfaces/IRobotsDataRepository'; +import { IAllowService } from './domain/interfaces/IAllowService'; +import { ICrawlDelayService } from './domain/interfaces/ICrawlDelayService'; +import { RobotsDataRepository } from './data/repositories/RobotsDataRepository'; +import { AllowService } from './domain/services/AllowService'; +import { CrawlDelayService } from './domain/services/CrawlDelayService'; +import { InvalidUrlError } from './errors/InvalidUrlError'; +import { InvalidProtocolError } from './errors/InvalidProtocolError'; +import { RobotsDeniedError } from './errors/RobotsDeniedError'; import { HEADER_USER_AGENT, PROTOCOL_HTTP, PROTOCOL_HTTPS } from './constants'; -import { ERROR_MESSAGES } from './errors/messages'; export class RobotsInterceptor { - private robotsService: IRobotsService; + private dataService: IRobotsDataRepository; + private allowService: IAllowService; + 
private crawlDelayService: ICrawlDelayService; private userAgent: string; + private crawlDelayCompliance: CrawlDelayComplianceMode; - constructor(options: RobotsPluginOptions, robotsService?: IRobotsService) { - this.robotsService = robotsService || new RobotsService(); + constructor( + options: RobotsPluginOptions, + deps?: { + dataService?: IRobotsDataRepository, + allowService?: IAllowService, + crawlDelayService?: ICrawlDelayService; + } + ) { this.userAgent = options.userAgent; + this.crawlDelayCompliance = options.crawlDelayCompliance ?? CrawlDelayComplianceMode.Await; + + this.dataService = deps?.dataService ?? new RobotsDataRepository(); + this.allowService = deps?.allowService ?? new AllowService(this.dataService); + this.crawlDelayService = deps?.crawlDelayService ?? new CrawlDelayService(this.dataService); } /** @@ -25,10 +46,14 @@ export class RobotsInterceptor { const url = this.resolveUrl(config); this.validateProtocol(url); - const isAllowed = await this.robotsService.isAllowed(url.toString(), this.userAgent); + const isAllowed = await this.allowService.isAllowed(url.toString(), this.userAgent); if (!isAllowed) { - throw new RobotsError(ERROR_MESSAGES.ROBOTS_DENIED(url.toString(), this.userAgent)); + throw new RobotsDeniedError(url.toString(), this.userAgent); + } + + if (this.crawlDelayCompliance !== CrawlDelayComplianceMode.Ignore) { + await this.crawlDelayService.handleCrawlDelay(url.toString(), this.userAgent, this.crawlDelayCompliance); } if (config.headers) { @@ -38,6 +63,41 @@ export class RobotsInterceptor { return config; } + /** + * Intercepts Axios responses to update the last crawled timestamp. 
+ */ + public interceptResponse(response: AxiosResponse): AxiosResponse { + if (!response || !response.config || !response.config.url) { + return response; + } + + try { + const fullUrl = this.resolveUrl(response.config as InternalAxiosRequestConfig).toString(); + this.dataService.setLastCrawled(fullUrl, Date.now()); + } catch (_) { + } + + return response; + } + + /** + * Intercepts Axios response errors to update the last crawled timestamp, + * ensuring we track attempts even if they fail. + */ + public interceptResponseError(error: any): any { + if (!error || !error.config || !error.config.url) { + return Promise.reject(error); + } + + try { + const fullUrl = this.resolveUrl(error.config as InternalAxiosRequestConfig).toString(); + this.dataService.setLastCrawled(fullUrl, Date.now()); + } catch (_) { + } + + return Promise.reject(error); + } + private resolveUrl(config: InternalAxiosRequestConfig): URL { try { if (config.url && (config.url.startsWith(PROTOCOL_HTTP) || config.url.startsWith(PROTOCOL_HTTPS))) { @@ -50,13 +110,12 @@ export class RobotsInterceptor { return new URL(config.url || ''); } catch (e: any) { - throw new RobotsError(ERROR_MESSAGES.INVALID_URL(e.message)); + throw new InvalidUrlError(e.message); } } private validateProtocol(url: URL): void { - if (url.protocol !== PROTOCOL_HTTP && url.protocol !== PROTOCOL_HTTPS) { - throw new RobotsError(ERROR_MESSAGES.INVALID_PROTOCOL(url.protocol)); - } + if (url.protocol === PROTOCOL_HTTP || url.protocol === PROTOCOL_HTTPS) return; + throw new InvalidProtocolError(url.protocol); } } diff --git a/src/types.ts b/src/types.ts deleted file mode 100644 index 6241cd6..0000000 --- a/src/types.ts +++ /dev/null @@ -1,16 +0,0 @@ -/** - * Options for the Robots Exclusion Protocol plugin. - */ -export interface RobotsPluginOptions { - /** - * The User-Agent string to use when checking robots.txt rules. - */ - userAgent: string; -} - -/** - * Interface for the Robots Service. 
- */ -export interface IRobotsService { - isAllowed(url: string, userAgent?: string): Promise; -} diff --git a/tests/integration/crawl-delay.test.ts b/tests/integration/crawl-delay.test.ts new file mode 100644 index 0000000..f3ff404 --- /dev/null +++ b/tests/integration/crawl-delay.test.ts @@ -0,0 +1,154 @@ +import axios from 'axios'; +import nock from 'nock'; +import { applyRobotsInterceptor, CrawlDelayComplianceMode } from '../../src/index'; + +describe('Crawl-delay Compliance', () => { + let client: ReturnType; + const USER_AGENT = 'CrawlBot/1.0'; + const DOMAIN = 'https://crawl-delay.com'; + + beforeEach(() => { + nock.cleanAll(); + client = axios.create(); + jest.useFakeTimers({ + doNotFake: ['nextTick', 'setImmediate'] + }); + }); + + afterEach(() => { + jest.useRealTimers(); + }); + + test.each([ + [1, 1000], + [2, 2000], + [3, 3000] + ])(` +GIVEN a robots.txt with Crawl-delay: %i +WHEN making consecutive requests +THEN the second request should wait at least %i ms + `, async (delaySeconds, expectedDelayMs) => { + applyRobotsInterceptor(client, { + userAgent: USER_AGENT, + crawlDelayCompliance: CrawlDelayComplianceMode.Await + }); + + nock(DOMAIN) + .get('/robots.txt') + .reply(200, ` + User-agent: * + Crawl-delay: ${delaySeconds} + Allow: / + `); + + nock(DOMAIN).get('/one').reply(200, 'One'); + nock(DOMAIN).get('/two').reply(200, 'Two'); + + await client.get(`${DOMAIN}/one`); + const afterFirst = Date.now(); + + const requestPromise = client.get(`${DOMAIN}/two`); + + jest.advanceTimersByTime(expectedDelayMs); + + await requestPromise; + const end = Date.now(); + + const duration = end - afterFirst; + expect(duration).toBeGreaterThanOrEqual(expectedDelayMs); + }); + + test(` +GIVEN a request fails +WHEN making a subsequent request +THEN it should still respect the Crawl-delay + `, async () => { + applyRobotsInterceptor(client, { + userAgent: USER_AGENT, + crawlDelayCompliance: CrawlDelayComplianceMode.Await + }); + + nock(DOMAIN) + .get('/robots.txt') + 
.reply(200, ` + User-agent: * + Crawl-delay: 2 + Allow: / + `); + + nock(DOMAIN).get('/fail').reply(500, 'Server Error'); + nock(DOMAIN).get('/success').reply(200, 'Success'); + + try { + await client.get(`${DOMAIN}/fail`); + } catch (e) { + } + const afterFail = Date.now(); + + const requestPromise = client.get(`${DOMAIN}/success`); + + jest.advanceTimersByTime(2000); + + await requestPromise; + const end = Date.now(); + + const duration = end - afterFail; + expect(duration).toBeGreaterThanOrEqual(2000); + }); + + test(` +GIVEN crawlDelayCompliance is Ignore +WHEN making consecutive requests +THEN the second request should NOT wait + `, async () => { + applyRobotsInterceptor(client, { + userAgent: USER_AGENT, + crawlDelayCompliance: CrawlDelayComplianceMode.Ignore + }); + + nock(DOMAIN) + .get('/robots.txt') + .reply(200, ` + User-agent: * + Crawl-delay: 5 + Allow: / + `); + + nock(DOMAIN).get('/one').reply(200, 'One'); + nock(DOMAIN).get('/two').reply(200, 'Two'); + + const start = Date.now(); + await client.get(`${DOMAIN}/one`); + await client.get(`${DOMAIN}/two`); + const end = Date.now(); + + const duration = end - start; + expect(duration).toBeLessThan(1000); + }); + + test(` +GIVEN crawlDelayCompliance is Failure +WHEN making consecutive requests +THEN the second request should throw + `, async () => { + applyRobotsInterceptor(client, { + userAgent: USER_AGENT, + crawlDelayCompliance: CrawlDelayComplianceMode.Failure + }); + + nock(DOMAIN) + .get('/robots.txt') + .reply(200, ` + User-agent: * + Crawl-delay: 5 + Allow: / + `); + + nock(DOMAIN).get('/one').reply(200, 'One'); + nock(DOMAIN).get('/two').reply(200, 'Two'); + + await client.get(`${DOMAIN}/one`); + + await expect(client.get(`${DOMAIN}/two`)).rejects.toThrow('Crawl-delay of 5s has not been met'); + }); +}); diff --git a/tests/index.test.ts b/tests/integration/interceptor.test.ts similarity index 68% rename from tests/index.test.ts rename to tests/integration/interceptor.test.ts index 
5504ff6..29179af 100644 --- a/tests/index.test.ts +++ b/tests/integration/interceptor.test.ts @@ -1,8 +1,8 @@ import axios from 'axios'; import nock from 'nock'; -import { applyRobotsInterceptor } from '../src/index'; -import { RobotsError } from '../src/errors/RobotsError'; -import { HEADER_USER_AGENT } from '../src/constants'; +import { applyRobotsInterceptor } from '../../src/index'; +import { RobotsError } from '../../src/errors/RobotsError'; +import { HEADER_USER_AGENT } from '../../src/constants'; describe('Axios Robots Interceptor', () => { let client: ReturnType; @@ -16,7 +16,11 @@ describe('Axios Robots Interceptor', () => { }); describe('RFC Compliance: Access Rules', () => { - test('GIVEN a robots.txt with a specific Disallow rule WHEN the bot requests a matching path THEN it should throw a RobotsError', async () => { + test(` +GIVEN a robots.txt with a specific Disallow rule +WHEN the bot requests a matching path +THEN it should throw a RobotsError + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -29,7 +33,11 @@ describe('Axios Robots Interceptor', () => { await expect(client.get(`${DOMAIN}/private`)).rejects.toThrow(RobotsError); }); - test('GIVEN a robots.txt with a specific Allow rule WHEN the bot requests a matching path THEN it should allow the request', async () => { + test(` +GIVEN a robots.txt with a specific Allow rule +WHEN the bot requests a matching path +THEN it should allow the request + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -45,7 +53,11 @@ describe('Axios Robots Interceptor', () => { expect(response.data).toBe('Public Data'); }); - test('GIVEN a robots.txt with a wildcard Disallow rule WHEN the bot requests a matching file THEN it should throw a RobotsError', async () => { + test(` +GIVEN a robots.txt with a wildcard Disallow rule +WHEN the bot requests a matching file +THEN it should throw a RobotsError + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -56,7 +68,11 @@ describe('Axios Robots Interceptor', () => { 
}); describe('RFC Compliance: User-Agent Matching', () => { - test('GIVEN a robots.txt with specific rules for TestBot WHEN TestBot requests a URL THEN it should follow the specific rules', async () => { + test(` +GIVEN a robots.txt with specific rules for TestBot +WHEN TestBot requests a URL +THEN it should follow the specific rules + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -76,7 +92,11 @@ describe('Axios Robots Interceptor', () => { }); describe('RFC Compliance: Status Codes (Availability)', () => { - test('GIVEN the robots.txt endpoint returns 404 (Not Found) WHEN a request is made THEN it should allow access', async () => { + test(` +GIVEN the robots.txt endpoint returns 404 (Not Found) +WHEN a request is made +THEN it should allow access + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -91,7 +111,11 @@ describe('Axios Robots Interceptor', () => { expect(response.status).toBe(200); }); - test('GIVEN the robots.txt endpoint returns 403 (Forbidden) WHEN a request is made THEN it should allow access (Unavailable = Allow)', async () => { + test(` +GIVEN the robots.txt endpoint returns 403 (Forbidden) +WHEN a request is made +THEN it should allow access (Unavailable = Allow) + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -106,7 +130,11 @@ describe('Axios Robots Interceptor', () => { expect(response.status).toBe(200); }); - test('GIVEN the robots.txt endpoint returns 500 (Internal Server Error) WHEN a request is made THEN it should throw a RobotsError (Unreachable = Disallow)', async () => { + test(` +GIVEN the robots.txt endpoint returns 500 (Internal Server Error) +WHEN a request is made +THEN it should throw a RobotsError (Unreachable = Disallow) + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -117,17 +145,29 @@ describe('Axios Robots Interceptor', () => { }); describe('Interceptor Logic & Safety', () => { - test('GIVEN an invalid URL WHEN a request is made THEN it should throw a RobotsError', async () => { + test(` +GIVEN an 
invalid URL +WHEN a request is made +THEN it should throw a RobotsError + `, async () => { await expect(client.get('not-a-url')).rejects.toThrow(/Invalid URL/); }); - test('GIVEN a non-HTTP protocol WHEN a request is made THEN it should throw a RobotsError', async () => { + test(` +GIVEN a non-HTTP protocol +WHEN a request is made +THEN it should throw a RobotsError + `, async () => { await expect(client.get('ftp://example.com/file')).rejects.toThrow(/Invalid protocol/); }); - test('GIVEN a valid config WHEN fetching robots.txt THEN it should send the configured User-Agent header', async () => { + test(` +GIVEN a valid config +WHEN fetching robots.txt +THEN it should send the configured User-Agent header + `, async () => { nock(DOMAIN) .get('/robots.txt') @@ -145,7 +185,11 @@ describe('Axios Robots Interceptor', () => { }); describe('Caching', () => { - test('GIVEN a cached robots.txt WHEN making a second request to the same origin THEN it should not make a second network request for robots.txt', async () => { + test(` +GIVEN a cached robots.txt +WHEN making a second request to the same origin +THEN it should not make a second network request for robots.txt + `, async () => { const scope = nock(DOMAIN) .get('/robots.txt') diff --git a/tests/unit/domain/services/AllowService.test.ts b/tests/unit/domain/services/AllowService.test.ts new file mode 100644 index 0000000..0f63fa9 --- /dev/null +++ b/tests/unit/domain/services/AllowService.test.ts @@ -0,0 +1,86 @@ +import { AllowService } from '../../../../src/domain/services/AllowService'; +import { IRobotsDataRepository } from '../../../../src/domain/interfaces/IRobotsDataRepository'; +import { CachedRobot } from '../../../../src/domain/models/CachedRobot'; + +describe('AllowService', () => { + let service: AllowService; + let mockDataRepository: jest.Mocked; + + beforeEach(() => { + mockDataRepository = { + getRobot: jest.fn(), + setLastCrawled: jest.fn(), + }; + service = new AllowService(mockDataRepository); + }); + 
+ test(` +GIVEN no robot data is found +WHEN checking if a URL is allowed +THEN it should return true (default allow) + `, async () => { + mockDataRepository.getRobot.mockResolvedValue(null as any); + + const result = await service.isAllowed('https://example.com/foo'); + + expect(result).toBe(true); + }); + + test(` +GIVEN robot data exists but has no robot object +WHEN checking if a URL is allowed +THEN it should return true + `, async () => { + mockDataRepository.getRobot.mockResolvedValue({ robot: null } as unknown as CachedRobot); + + const result = await service.isAllowed('https://example.com/foo'); + + expect(result).toBe(true); + }); + + test(` +GIVEN robot rules exist and allow the URL +WHEN checking if a URL is allowed +THEN it should return true + `, async () => { + const mockRobot = { + isAllowed: jest.fn().mockReturnValue(true) + }; + mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot } as unknown as CachedRobot); + + const result = await service.isAllowed('https://example.com/foo'); + + expect(result).toBe(true); + expect(mockRobot.isAllowed).toHaveBeenCalledWith('https://example.com/foo', '*'); + }); + + test(` +GIVEN robot rules exist and disallow the URL +WHEN checking if a URL is allowed +THEN it should return false + `, async () => { + const mockRobot = { + isAllowed: jest.fn().mockReturnValue(false) + }; + mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot } as unknown as CachedRobot); + + const result = await service.isAllowed('https://example.com/private'); + + expect(result).toBe(false); + }); + + test(` +GIVEN robot rules exist but isAllowed returns undefined +WHEN checking if a URL is allowed +THEN it should return true (default to allowed) + `, async () => { + const mockRobot = { + isAllowed: jest.fn().mockReturnValue(undefined) + }; + mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot } as unknown as CachedRobot); + + const result = await service.isAllowed('https://example.com/foo'); + + 
expect(result).toBe(true); + }); +}); diff --git a/tests/unit/domain/strategies/AwaitCrawlDelayStrategy.test.ts b/tests/unit/domain/strategies/AwaitCrawlDelayStrategy.test.ts new file mode 100644 index 0000000..d8a8a7f --- /dev/null +++ b/tests/unit/domain/strategies/AwaitCrawlDelayStrategy.test.ts @@ -0,0 +1,52 @@ +import { AwaitCrawlDelayStrategy } from '../../../../src/domain/strategies/AwaitCrawlDelayStrategy'; +import { CalculateWaitTimeUseCase } from '../../../../src/domain/usecases/CalculateWaitTimeUseCase'; + +describe('AwaitCrawlDelayStrategy', () => { + let strategy: AwaitCrawlDelayStrategy; + let mockUseCase: jest.Mocked; + + beforeEach(() => { + mockUseCase = { + execute: jest.fn(), + } as unknown as jest.Mocked; + strategy = new AwaitCrawlDelayStrategy(mockUseCase); + jest.useFakeTimers({ + doNotFake: ['nextTick', 'setImmediate'] + }); + }); + + afterEach(() => { + jest.useRealTimers(); + }); + + test(` +GIVEN wait time is 0 +WHEN executing strategy +THEN it should return immediately without waiting + `, async () => { + mockUseCase.execute.mockResolvedValue({ waitTime: 0, delay: 5 }); + const setTimeoutSpy = jest.spyOn(global, 'setTimeout'); + + await strategy.execute('https://example.com', '*'); + + expect(setTimeoutSpy).not.toHaveBeenCalled(); + }); + + test(` +GIVEN wait time is greater than 0 +WHEN executing strategy +THEN it should wait for the specified time + `, async () => { + const waitTime = 1000; + mockUseCase.execute.mockResolvedValue({ waitTime, delay: 5 }); + + const executePromise = strategy.execute('https://example.com', '*'); + + await Promise.resolve(); + await Promise.resolve(); + + jest.advanceTimersByTime(waitTime); + + await executePromise; + }); +}); diff --git a/tests/unit/domain/strategies/CrawlDelayStrategyFactory.test.ts b/tests/unit/domain/strategies/CrawlDelayStrategyFactory.test.ts new file mode 100644 index 0000000..7d6602c --- /dev/null +++ b/tests/unit/domain/strategies/CrawlDelayStrategyFactory.test.ts @@ -0,0 +1,43 @@ 
+import { CrawlDelayStrategyFactory } from '../../../../src/domain/strategies/CrawlDelayStrategyFactory'; +import { AwaitCrawlDelayStrategy } from '../../../../src/domain/strategies/AwaitCrawlDelayStrategy'; +import { FailureCrawlDelayStrategy } from '../../../../src/domain/strategies/FailureCrawlDelayStrategy'; +import { IgnoreCrawlDelayStrategy } from '../../../../src/domain/strategies/IgnoreCrawlDelayStrategy'; +import { CrawlDelayComplianceMode } from '../../../../src/domain/models/CrawlDelayComplianceMode'; +import { CalculateWaitTimeUseCase } from '../../../../src/domain/usecases/CalculateWaitTimeUseCase'; + +describe('CrawlDelayStrategyFactory', () => { + let factory: CrawlDelayStrategyFactory; + let mockUseCase: CalculateWaitTimeUseCase; + + beforeEach(() => { + mockUseCase = {} as CalculateWaitTimeUseCase; + factory = new CrawlDelayStrategyFactory(mockUseCase); + }); + + test(` +GIVEN mode is Await +WHEN getting strategy +THEN it should return AwaitCrawlDelayStrategy + `, () => { + const strategy = factory.getStrategy(CrawlDelayComplianceMode.Await); + expect(strategy).toBeInstanceOf(AwaitCrawlDelayStrategy); + }); + + test(` +GIVEN mode is Failure +WHEN getting strategy +THEN it should return FailureCrawlDelayStrategy + `, () => { + const strategy = factory.getStrategy(CrawlDelayComplianceMode.Failure); + expect(strategy).toBeInstanceOf(FailureCrawlDelayStrategy); + }); + + test(` +GIVEN mode is Ignore +WHEN getting strategy +THEN it should return IgnoreCrawlDelayStrategy + `, () => { + const strategy = factory.getStrategy(CrawlDelayComplianceMode.Ignore); + expect(strategy).toBeInstanceOf(IgnoreCrawlDelayStrategy); + }); +}); diff --git a/tests/unit/domain/strategies/FailureCrawlDelayStrategy.test.ts b/tests/unit/domain/strategies/FailureCrawlDelayStrategy.test.ts new file mode 100644 index 0000000..eddbbad --- /dev/null +++ b/tests/unit/domain/strategies/FailureCrawlDelayStrategy.test.ts @@ -0,0 +1,35 @@ +import { FailureCrawlDelayStrategy } from 
'../../../../src/domain/strategies/FailureCrawlDelayStrategy'; +import { CalculateWaitTimeUseCase } from '../../../../src/domain/usecases/CalculateWaitTimeUseCase'; +import { CrawlDelayError } from '../../../../src/errors/CrawlDelayError'; + +describe('FailureCrawlDelayStrategy', () => { + let strategy: FailureCrawlDelayStrategy; + let mockUseCase: jest.Mocked; + + beforeEach(() => { + mockUseCase = { + execute: jest.fn(), + } as unknown as jest.Mocked; + strategy = new FailureCrawlDelayStrategy(mockUseCase); + }); + + test(` +GIVEN wait time is 0 +WHEN executing strategy +THEN it should return successfully + `, async () => { + mockUseCase.execute.mockResolvedValue({ waitTime: 0, delay: 5 }); + + await expect(strategy.execute('https://example.com', '*')).resolves.not.toThrow(); + }); + + test(` +GIVEN wait time is greater than 0 +WHEN executing strategy +THEN it should throw a CrawlDelayError + `, async () => { + mockUseCase.execute.mockResolvedValue({ waitTime: 1000, delay: 5 }); + + await expect(strategy.execute('https://example.com', '*')).rejects.toThrow(CrawlDelayError); + }); +}); diff --git a/tests/unit/domain/strategies/IgnoreCrawlDelayStrategy.test.ts b/tests/unit/domain/strategies/IgnoreCrawlDelayStrategy.test.ts new file mode 100644 index 0000000..351e4d5 --- /dev/null +++ b/tests/unit/domain/strategies/IgnoreCrawlDelayStrategy.test.ts @@ -0,0 +1,17 @@ +import { IgnoreCrawlDelayStrategy } from '../../../../src/domain/strategies/IgnoreCrawlDelayStrategy'; + +describe('IgnoreCrawlDelayStrategy', () => { + let strategy: IgnoreCrawlDelayStrategy; + + beforeEach(() => { + strategy = new IgnoreCrawlDelayStrategy(); + }); + + test(` +GIVEN any conditions +WHEN executing strategy +THEN it should return successfully (no op) + `, async () => { + await expect(strategy.execute('https://example.com', '*')).resolves.not.toThrow(); + }); +}); diff --git a/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts 
b/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts new file mode 100644 index 0000000..215dd1f --- /dev/null +++ b/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts @@ -0,0 +1,115 @@ +import { CalculateWaitTimeUseCase } from '../../../../src/domain/usecases/CalculateWaitTimeUseCase'; +import { IRobotsDataRepository } from '../../../../src/domain/interfaces/IRobotsDataRepository'; +import { CachedRobot } from '../../../../src/domain/models/CachedRobot'; + +describe('CalculateWaitTimeUseCase', () => { + let useCase: CalculateWaitTimeUseCase; + let mockDataRepository: jest.Mocked; + + beforeEach(() => { + mockDataRepository = { + getRobot: jest.fn(), + setLastCrawled: jest.fn(), + }; + useCase = new CalculateWaitTimeUseCase(mockDataRepository); + }); + + test(` +GIVEN no robot data +WHEN calculating wait time +THEN it should return 0 wait time + `, async () => { + mockDataRepository.getRobot.mockResolvedValue(null as any); + + const result = await useCase.execute('https://example.com', '*'); + + expect(result).toEqual({ waitTime: 0, delay: 0 }); + }); + + test(` +GIVEN robot data with no crawl delay +WHEN calculating wait time +THEN it should return 0 wait time + `, async () => { + const mockRobot = { + getCrawlDelay: jest.fn().mockReturnValue(undefined) + }; + mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot } as unknown as CachedRobot); + + const result = await useCase.execute('https://example.com', '*'); + + expect(result).toEqual({ waitTime: 0, delay: 0 }); + }); + + test(` +GIVEN robot data with crawl delay but never crawled before +WHEN calculating wait time +THEN it should return 0 wait time + `, async () => { + const mockRobot = { + getCrawlDelay: jest.fn().mockReturnValue(5) + }; + mockDataRepository.getRobot.mockResolvedValue({ + robot: mockRobot, + lastCrawled: undefined + } as unknown as CachedRobot); + + const result = await useCase.execute('https://example.com', '*'); + + expect(result).toEqual({ waitTime: 0, delay: 0 
}); + }); + + test(` +GIVEN robot data with crawl delay and previously crawled recently +WHEN calculating wait time +THEN it should return the remaining wait time + `, async () => { + const delaySeconds = 2; + const mockRobot = { + getCrawlDelay: jest.fn().mockReturnValue(delaySeconds) + }; + + // Crawled 1000ms ago, delay is 2000ms, should wait 1000ms + const now = Date.now(); + const lastCrawled = now - 1000; + + jest.spyOn(Date, 'now').mockReturnValue(now); + + mockDataRepository.getRobot.mockResolvedValue({ + robot: mockRobot, + lastCrawled: lastCrawled + } as unknown as CachedRobot); + + const result = await useCase.execute('https://example.com', '*'); + + expect(result.waitTime).toBe(1000); + expect(result.delay).toBe(delaySeconds); + }); + + test(` +GIVEN robot data with crawl delay and previously crawled long ago +WHEN calculating wait time +THEN it should return 0 wait time + `, async () => { + const delaySeconds = 2; + const mockRobot = { + getCrawlDelay: jest.fn().mockReturnValue(delaySeconds) + }; + + // Crawled 3000ms ago, delay is 2000ms, should wait 0ms + const now = Date.now(); + const lastCrawled = now - 3000; + + jest.spyOn(Date, 'now').mockReturnValue(now); + + mockDataRepository.getRobot.mockResolvedValue({ + robot: mockRobot, + lastCrawled: lastCrawled + } as unknown as CachedRobot); + + const result = await useCase.execute('https://example.com', '*'); + + expect(result.waitTime).toBe(0); + expect(result.delay).toBe(delaySeconds); + }); +});