From 29c36cc63fee66e5016f83e194b05d732ba9ee10 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 21 Jan 2026 00:24:09 +0900 Subject: [PATCH 01/10] add new caching strategy --- src/data/repositories/RobotsDataRepository.ts | 17 ++++++++++++++--- src/domain/interfaces/IRobotsDataRepository.ts | 3 ++- src/domain/models/CachedRobot.ts | 4 ++++ src/domain/models/CachingPolicy.ts | 11 ++++++++++- src/domain/models/CachingPolicyType.ts | 6 +++++- .../caching/CachingStrategyFactory.ts | 3 +++ .../caching/RequestCountCachingStrategy.ts | 14 ++++++++++++++ src/domain/usecases/CalculateWaitTimeUseCase.ts | 2 +- 8 files changed, 53 insertions(+), 7 deletions(-) create mode 100644 src/domain/strategies/caching/RequestCountCachingStrategy.ts diff --git a/src/data/repositories/RobotsDataRepository.ts b/src/data/repositories/RobotsDataRepository.ts index 8dfca54..c20f1bc 100644 --- a/src/data/repositories/RobotsDataRepository.ts +++ b/src/data/repositories/RobotsDataRepository.ts @@ -19,16 +19,27 @@ export class RobotsDataRepository implements IRobotsDataRepository { this.strategyFactory = new CachingStrategyFactory(); } - async getRobot(url: string, userAgent: string = '*'): Promise { + async getRobot( + url: string, + userAgent: string = '*', + options: { incrementUsage?: boolean; ignoreCachePolicy?: boolean } = {} + ): Promise { + const { incrementUsage = true, ignoreCachePolicy = false } = options; const origin = new URL(url).origin; let cached = this.cache.get(origin); - if (cached && this.strategyFactory.getStrategy(this.cachingPolicy).isValid(cached)) { + const strategy = this.strategyFactory.getStrategy(this.cachingPolicy); + const isValid = cached && (ignoreCachePolicy || strategy.isValid(cached)); + + if (isValid && cached) { + if (incrementUsage) { + cached.usageCount = (cached.usageCount || 0) + 1; + } return cached; } const robot = await this.fetchRobotsTxt(origin, userAgent); - cached = { robot, fetchedAt: Date.now() }; + cached = { robot, fetchedAt: Date.now(), usageCount: 1 }; this.cache.set(origin, cached); return cached; diff --git a/src/domain/interfaces/IRobotsDataRepository.ts b/src/domain/interfaces/IRobotsDataRepository.ts index 851d534..8d70f00 100644 --- a/src/domain/interfaces/IRobotsDataRepository.ts +++ b/src/domain/interfaces/IRobotsDataRepository.ts @@ -8,9 +8,10 @@ export interface IRobotsDataRepository { * Retrieves the cached robot instance for a given URL. * @param url The URL to get the robot for (used to extract the domain/origin). * @param userAgent Optional user agent to use for fetching robots.txt if not cached. + * @param options Optional retrieval options. * @returns A promise resolving to the CachedRobot containing the parsed rules. */ - getRobot(url: string, userAgent?: string): Promise; + getRobot(url: string, userAgent?: string, options?: { incrementUsage?: boolean; ignoreCachePolicy?: boolean; }): Promise; /** * Updates the last crawled timestamp for the domain associated with the URL. diff --git a/src/domain/models/CachedRobot.ts b/src/domain/models/CachedRobot.ts index 982d40e..ba5485e 100644 --- a/src/domain/models/CachedRobot.ts +++ b/src/domain/models/CachedRobot.ts @@ -13,4 +13,8 @@ export interface CachedRobot { * Timestamp of when the robots.txt was fetched. */ fetchedAt: number; + /** + * Number of times this cached robot has been accessed. + */ + usageCount?: number; } diff --git a/src/domain/models/CachingPolicy.ts b/src/domain/models/CachingPolicy.ts index ffcf9d7..1fcd309 100644 --- a/src/domain/models/CachingPolicy.ts +++ b/src/domain/models/CachingPolicy.ts @@ -1,6 +1,6 @@ import { CachingPolicyType } from './CachingPolicyType'; -export type CachingPolicy = IndefiniteCachingPolicy | ExpireAfterCachingPolicy; +export type CachingPolicy = IndefiniteCachingPolicy | ExpireAfterCachingPolicy | RequestCountCachingPolicy; export interface IndefiniteCachingPolicy { type: CachingPolicyType.Indefinite; @@ -13,3 +13,12 @@ export interface ExpireAfterCachingPolicy { */ duration: string | number; } + +export interface RequestCountCachingPolicy { + type: CachingPolicyType.RequestCount; + /** + * Maximum number of requests before the cache expires. + */ + maxRequests: number; +} + diff --git a/src/domain/models/CachingPolicyType.ts b/src/domain/models/CachingPolicyType.ts index 0eb5bd5..c252d73 100644 --- a/src/domain/models/CachingPolicyType.ts +++ b/src/domain/models/CachingPolicyType.ts @@ -9,5 +9,9 @@ export enum CachingPolicyType { /** * Cache robots.txt data for a specific duration. */ - ExpireAfter = 'expireAfter' + ExpireAfter = 'expireAfter', + /** + * Cache robots.txt data for a specific number of requests. + */ + RequestCount = 'requestCount' } diff --git a/src/domain/strategies/caching/CachingStrategyFactory.ts b/src/domain/strategies/caching/CachingStrategyFactory.ts index 745c6a3..2c57f4f 100644 --- a/src/domain/strategies/caching/CachingStrategyFactory.ts +++ b/src/domain/strategies/caching/CachingStrategyFactory.ts @@ -3,6 +3,7 @@ import { CachingPolicyType } from '../../models/CachingPolicyType'; import { ICachingStrategy } from './ICachingStrategy'; import { IndefiniteCachingStrategy } from './IndefiniteCachingStrategy'; import { ExpireAfterCachingStrategy } from './ExpireAfterCachingStrategy'; +import { RequestCountCachingStrategy } from './RequestCountCachingStrategy'; export class CachingStrategyFactory { getStrategy(policy: CachingPolicy): ICachingStrategy { @@ -11,6 +12,8 @@ export class CachingStrategyFactory { return new IndefiniteCachingStrategy(); case CachingPolicyType.ExpireAfter: return new ExpireAfterCachingStrategy(policy.duration); + case CachingPolicyType.RequestCount: + return new RequestCountCachingStrategy(policy.maxRequests); default: return new IndefiniteCachingStrategy(); } diff --git a/src/domain/strategies/caching/RequestCountCachingStrategy.ts b/src/domain/strategies/caching/RequestCountCachingStrategy.ts new file mode 100644 index 0000000..1b31b98 --- /dev/null +++ b/src/domain/strategies/caching/RequestCountCachingStrategy.ts @@ -0,0 +1,14 @@ +import { ICachingStrategy } from './ICachingStrategy'; +import { CachedRobot } from '../../models/CachedRobot'; + +export class RequestCountCachingStrategy implements ICachingStrategy { + private maxRequests: number; + + constructor(maxRequests: number) { + this.maxRequests = maxRequests; + } + + isValid(cached: CachedRobot): boolean { + return (cached.usageCount || 0) < this.maxRequests; + } +} diff --git a/src/domain/usecases/CalculateWaitTimeUseCase.ts b/src/domain/usecases/CalculateWaitTimeUseCase.ts index 9d0f640..072ec1e 100644 --- a/src/domain/usecases/CalculateWaitTimeUseCase.ts +++ b/src/domain/usecases/CalculateWaitTimeUseCase.ts @@ -5,7 +5,7 @@ export class CalculateWaitTimeUseCase { constructor(private dataService: IRobotsDataRepository) { } async execute(url: string, userAgent: string): Promise<{ waitTime: number; delay: number; }> { - const cachedRobot = await this.dataService.getRobot(url, userAgent); + const cachedRobot = await this.dataService.getRobot(url, userAgent, { incrementUsage: false, ignoreCachePolicy: true }); if (!cachedRobot || !cachedRobot.robot) { return { waitTime: 0, delay: 0 }; From 07b3217a1e29fbeb2bc1524060b02bfcff105601 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 21 Jan 2026 00:24:48 +0900 Subject: [PATCH 02/10] create unit tests for caching strategies --- .../caching/CachingStrategyFactory.test.ts | 62 ++++++++++++++++ .../ExpireAfterCachingStrategy.test.ts | 70 ++++++++++++++++++ .../caching/IndefiniteCachingStrategy.test.ts | 25 +++++++ .../RequestCountCachingStrategy.test.ts | 74 +++++++++++++++++++ 4 files changed, 231 insertions(+) create mode 100644 tests/unit/domain/strategies/caching/CachingStrategyFactory.test.ts create mode 100644 tests/unit/domain/strategies/caching/ExpireAfterCachingStrategy.test.ts create mode 100644 tests/unit/domain/strategies/caching/IndefiniteCachingStrategy.test.ts create mode 100644 tests/unit/domain/strategies/caching/RequestCountCachingStrategy.test.ts diff --git a/tests/unit/domain/strategies/caching/CachingStrategyFactory.test.ts b/tests/unit/domain/strategies/caching/CachingStrategyFactory.test.ts new file mode 100644 index 0000000..950d043 --- /dev/null +++ b/tests/unit/domain/strategies/caching/CachingStrategyFactory.test.ts @@ -0,0 +1,62 @@ +import { CachingStrategyFactory } from '../../../../../src/domain/strategies/caching/CachingStrategyFactory'; +import { IndefiniteCachingStrategy } from '../../../../../src/domain/strategies/caching/IndefiniteCachingStrategy'; +import { ExpireAfterCachingStrategy } from '../../../../../src/domain/strategies/caching/ExpireAfterCachingStrategy'; +import { RequestCountCachingStrategy } from '../../../../../src/domain/strategies/caching/RequestCountCachingStrategy'; +import { CachingPolicyType } from '../../../../../src/domain/models/CachingPolicyType'; + +describe('CachingStrategyFactory', () => { + let factory: CachingStrategyFactory; + + beforeEach(() => { + factory = new CachingStrategyFactory(); + }); + + test(` + GIVEN an indefinite caching policy + WHEN requesting a strategy + THEN it should return an IndefiniteCachingStrategy instance + `, () => { + const policy = { type: CachingPolicyType.Indefinite } as const; + + const strategy = factory.getStrategy(policy); + + expect(strategy).toBeInstanceOf(IndefiniteCachingStrategy); + }); + + test(` + GIVEN an expireAfter caching policy + WHEN requesting a strategy + THEN it should return an ExpireAfterCachingStrategy instance + `, () => { + const policy = { type: CachingPolicyType.ExpireAfter, duration: '1h' } as const; + + const strategy = factory.getStrategy(policy); + + expect(strategy).toBeInstanceOf(ExpireAfterCachingStrategy); + }); + + test(` + GIVEN a requestCount caching policy + WHEN requesting a strategy + THEN it should return a RequestCountCachingStrategy instance + `, () => { + const policy = { type: CachingPolicyType.RequestCount, maxRequests: 5 } as const; + + const strategy = factory.getStrategy(policy); + + expect(strategy).toBeInstanceOf(RequestCountCachingStrategy); + }); + + test(` + GIVEN an unknown caching policy type + WHEN requesting a strategy + THEN it should return an IndefiniteCachingStrategy instance (default) + `, () => { + // Casting to any to simulate invalid input that might bypass type checking or come from external sources + const policy = { type: 'unknown' as any }; + + const strategy = factory.getStrategy(policy); + + expect(strategy).toBeInstanceOf(IndefiniteCachingStrategy); + }); +}); diff --git a/tests/unit/domain/strategies/caching/ExpireAfterCachingStrategy.test.ts b/tests/unit/domain/strategies/caching/ExpireAfterCachingStrategy.test.ts new file mode 100644 index 0000000..04af5d0 --- /dev/null +++ b/tests/unit/domain/strategies/caching/ExpireAfterCachingStrategy.test.ts @@ -0,0 +1,70 @@ +import { ExpireAfterCachingStrategy } from '../../../../../src/domain/strategies/caching/ExpireAfterCachingStrategy'; +import { CachedRobot } from '../../../../../src/domain/models/CachedRobot'; + +describe('ExpireAfterCachingStrategy', () => { + + test(` + GIVEN cached data is not expired + WHEN validating cached robot + THEN it should return true + `, () => { + const duration = 1000; + const strategy = new ExpireAfterCachingStrategy(duration); + const now = Date.now(); + jest.spyOn(Date, 'now').mockReturnValue(now); + + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: now - 500 + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(true); + }); + + test(` + GIVEN cached data is expired + WHEN validating cached robot + THEN it should return false + `, () => { + const duration = 1000; + const strategy = new ExpireAfterCachingStrategy(duration); + const now = Date.now(); + jest.spyOn(Date, 'now').mockReturnValue(now); + + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: now - 1500 + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(false); + }); + + test(` + GIVEN duration is provided as a string + WHEN validating cached robot + THEN it should parse the string and validate correctly + `, () => { + const duration = '1s'; + const strategy = new ExpireAfterCachingStrategy(duration); + const now = Date.now(); + jest.spyOn(Date, 'now').mockReturnValue(now); + + // Not expired (500ms < 1s) + const cachedValid: CachedRobot = { + robot: {} as any, + fetchedAt: now - 500 + }; + expect(strategy.isValid(cachedValid)).toBe(true); + + // Expired (1500ms > 1s) + const cachedExpired: CachedRobot = { + robot: {} as any, + fetchedAt: now - 1500 + }; + expect(strategy.isValid(cachedExpired)).toBe(false); + }); +}); diff --git a/tests/unit/domain/strategies/caching/IndefiniteCachingStrategy.test.ts b/tests/unit/domain/strategies/caching/IndefiniteCachingStrategy.test.ts new file mode 100644 index 0000000..bf2172f --- /dev/null +++ b/tests/unit/domain/strategies/caching/IndefiniteCachingStrategy.test.ts @@ -0,0 +1,25 @@ +import { IndefiniteCachingStrategy } from '../../../../../src/domain/strategies/caching/IndefiniteCachingStrategy'; +import { CachedRobot } from '../../../../../src/domain/models/CachedRobot'; + +describe('IndefiniteCachingStrategy', () => { + let strategy: IndefiniteCachingStrategy; + + beforeEach(() => { + strategy = new IndefiniteCachingStrategy(); + }); + + test(` + GIVEN any cached data + WHEN validating cached robot + THEN it should always return true + `, () => { + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: Date.now() - 10000000 // Old data + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(true); + }); +}); diff --git a/tests/unit/domain/strategies/caching/RequestCountCachingStrategy.test.ts b/tests/unit/domain/strategies/caching/RequestCountCachingStrategy.test.ts new file mode 100644 index 0000000..54902d2 --- /dev/null +++ b/tests/unit/domain/strategies/caching/RequestCountCachingStrategy.test.ts @@ -0,0 +1,74 @@ +import { RequestCountCachingStrategy } from '../../../../../src/domain/strategies/caching/RequestCountCachingStrategy'; +import { CachedRobot } from '../../../../../src/domain/models/CachedRobot'; + +describe('RequestCountCachingStrategy', () => { + let strategy: RequestCountCachingStrategy; + const maxRequests = 5; + + beforeEach(() => { + strategy = new RequestCountCachingStrategy(maxRequests); + }); + + test(` + GIVEN usage count is less than max requests + WHEN validating cached robot + THEN it should return true + `, () => { + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: Date.now(), + usageCount: 4 + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(true); + }); + + test(` + GIVEN usage count is equal to max requests + WHEN validating cached robot + THEN it should return false + `, () => { + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: Date.now(), + usageCount: 5 + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(false); + }); + + test(` + GIVEN usage count is greater than max requests + WHEN validating cached robot + THEN it should return false + `, () => { + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: Date.now(), + usageCount: 6 + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(false); + }); + + test(` + GIVEN usage count is undefined + WHEN validating cached robot + THEN it should treat it as 0 and return true + `, () => { + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: Date.now() + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(true); + }); +}); From bca356392d3c4567091973570d50fdf74eae35e3 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 21 Jan 2026 00:25:01 +0900 Subject: [PATCH 03/10] add integration test --- tests/integration/caching.test.ts | 63 +++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/tests/integration/caching.test.ts b/tests/integration/caching.test.ts index db22ff5..2df27d0 100644 --- a/tests/integration/caching.test.ts +++ b/tests/integration/caching.test.ts @@ -110,6 +110,69 @@ describe('Caching Policy Integration', () => { await client.get(`${DOMAIN}/second`); + expect(robotsScope.isDone()).toBe(true); + }); + test(` + GIVEN a requestCount caching policy of 2 requests + WHEN a third request is made + THEN robots.txt should be fetched again + `, async () => { + const initialTime = 1672531200000; + jest.spyOn(Date, 'now').mockReturnValue(initialTime); + + client = axios.create(); + applyRobotsInterceptor(client, { + userAgent: USER_AGENT, + cachingPolicy: { + type: CachingPolicyType.RequestCount, + maxRequests: 2 + } + }); + + const robotsScope = nock(DOMAIN) + .get('/robots.txt') + .times(2) + .reply(200, `User-agent: *\nAllow: /`); + + nock(DOMAIN).get('/first').reply(200, 'OK'); + nock(DOMAIN).get('/second').reply(200, 'OK'); + nock(DOMAIN).get('/third').reply(200, 'OK'); + + await client.get(`${DOMAIN}/first`); + await client.get(`${DOMAIN}/second`); + await client.get(`${DOMAIN}/third`); + + expect(robotsScope.isDone()).toBe(true); + }); + + test(` + GIVEN a requestCount caching policy of 2 requests + WHEN a second request is made + THEN robots.txt should NOT be fetched again + `, async () => { + const initialTime = 1672531200000; + jest.spyOn(Date, 'now').mockReturnValue(initialTime); + + client = axios.create(); + applyRobotsInterceptor(client, { + userAgent: USER_AGENT, + cachingPolicy: { + type: CachingPolicyType.RequestCount, + maxRequests: 2 + } + }); + + const robotsScope = nock(DOMAIN) + .get('/robots.txt') + .times(1) + .reply(200, `User-agent: *\nAllow: /`); + + nock(DOMAIN).get('/first').reply(200, 'OK'); + nock(DOMAIN).get('/second').reply(200, 'OK'); + + await client.get(`${DOMAIN}/first`); + await client.get(`${DOMAIN}/second`); + expect(robotsScope.isDone()).toBe(true); }); }); From c892b6a508dc53d87112685697e56f4982868651 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 21 Jan 2026 00:25:28 +0900 Subject: [PATCH 04/10] refactor repository unit test --- .../repositories/RobotsDataRepository.test.ts | 118 ++++++++++++------ 1 file changed, 79 insertions(+), 39 deletions(-) diff --git a/tests/unit/data/repositories/RobotsDataRepository.test.ts b/tests/unit/data/repositories/RobotsDataRepository.test.ts index e47a9db..a7cacf8 100644 --- a/tests/unit/data/repositories/RobotsDataRepository.test.ts +++ b/tests/unit/data/repositories/RobotsDataRepository.test.ts @@ -1,25 +1,30 @@ import { RobotsDataRepository } from '../../../../src/data/repositories/RobotsDataRepository'; import { RobotsPluginOptions } from '../../../../src/domain/models/RobotsPluginOptions'; -import { CachingPolicyType } from '../../../../src/domain/models/CachingPolicyType'; +import { CachingStrategyFactory } from '../../../../src/domain/strategies/caching/CachingStrategyFactory'; import axios from 'axios'; import robotsParser from 'robots-parser'; jest.mock('axios'); jest.mock('robots-parser'); +jest.mock('../../../../src/domain/strategies/caching/CachingStrategyFactory'); const mockAxios = axios as unknown as jest.Mocked; const mockRobotsParser = robotsParser as unknown as jest.MockedFunction; +const mockStrategyFactory = CachingStrategyFactory as unknown as jest.MockedClass; describe('RobotsDataRepository', () => { let repository: RobotsDataRepository; const origin = 'https://example.com'; const userAgent = 'test-bot'; + let mockStrategy: { isValid: jest.Mock; }; beforeEach(() => { jest.clearAllMocks(); + mockAxios.create.mockReturnValue({ get: jest.fn().mockResolvedValue({ data: 'User-agent: *\nDisallow: /' }) } as any); + mockRobotsParser.mockReturnValue({ isAllowed: jest.fn(), isDisallowed: jest.fn(), @@ -28,72 +33,107 @@ describe('RobotsDataRepository', () => { getSitemaps: jest.fn(), getPreferredHost: jest.fn(), }); + + // Setup mock strategy + mockStrategy = { isValid: jest.fn() }; + mockStrategyFactory.prototype.getStrategy = jest.fn().mockReturnValue(mockStrategy); }); describe('Caching Behavior', () => { test(` - GIVEN an indefinite (default) caching policy - WHEN robots.txt is requested twice with a long time gap - THEN it should only fetch from the network once + GIVEN strategy says cache is valid + WHEN robots.txt is requested + THEN it should NOT fetch from network `, async () => { repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); - const future = Date.now() + 100 * 24 * 60 * 60 * 1000; + // First call to populate cache await repository.getRobot(origin, userAgent); - jest.spyOn(Date, 'now').mockReturnValue(future); + + // Setup validation to return true + mockStrategy.isValid.mockReturnValue(true); + + // Second call await repository.getRobot(origin, userAgent); expect(mockAxios.create).toHaveBeenCalledTimes(1); + expect(mockStrategy.isValid).toHaveBeenCalled(); }); test(` - GIVEN an expireAfter caching policy - WHEN robots.txt is requested after the expiration duration - THEN it should fetch from the network again + GIVEN strategy says cache is invalid + WHEN robots.txt is requested + THEN it should fetch from network again `, async () => { - const duration = '5m'; - const durationMs = 5 * 60 * 1000; - repository = new RobotsDataRepository({ - userAgent, - cachingPolicy: { - type: CachingPolicyType.ExpireAfter, - duration - } - } as RobotsPluginOptions); - const initialTime = 1000; - const expiredTime = initialTime + durationMs + 1; - jest.spyOn(Date, 'now').mockReturnValue(initialTime); + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); + // First call to populate cache await repository.getRobot(origin, userAgent); - jest.spyOn(Date, 'now').mockReturnValue(expiredTime); + + // Setup validation to return false + mockStrategy.isValid.mockReturnValue(false); + + // Second call await repository.getRobot(origin, userAgent); expect(mockAxios.create).toHaveBeenCalledTimes(2); + expect(mockStrategy.isValid).toHaveBeenCalled(); }); test(` - GIVEN an expireAfter caching policy - WHEN robots.txt is requested before the expiration duration - THEN it should return the cached data without refetching + GIVEN ignoreCachePolicy option is true + WHEN robots.txt is requested + THEN it should NOT check strategy validity and return cached if available `, async () => { - const duration = 5 * 60 * 1000; - const durationMs = duration; - repository = new RobotsDataRepository({ - userAgent, - cachingPolicy: { - type: CachingPolicyType.ExpireAfter, - duration - } - } as RobotsPluginOptions); - const initialTime = 1000; - const validTime = initialTime + durationMs - 1; - jest.spyOn(Date, 'now').mockReturnValue(initialTime); + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); + // First call to populate cache await repository.getRobot(origin, userAgent); - jest.spyOn(Date, 'now').mockReturnValue(validTime); - await repository.getRobot(origin, userAgent); + + // Clear strategy calls to verify it's NOT called + mockStrategy.isValid.mockClear(); + + // Second call with ignoreCachePolicy + await repository.getRobot(origin, userAgent, { ignoreCachePolicy: true }); expect(mockAxios.create).toHaveBeenCalledTimes(1); + expect(mockStrategy.isValid).not.toHaveBeenCalled(); + }); + + test(` + GIVEN incrementUsage is true (default) + WHEN valid cached robot is returned + THEN usageCount should be incremented + `, async () => { + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); + + // First call (usageCount = 1) + await repository.getRobot(origin, userAgent); + + mockStrategy.isValid.mockReturnValue(true); + + // Second call (usageCount = 2) + const result = await repository.getRobot(origin, userAgent); + + expect(result.usageCount).toBe(2); + }); + + test(` + GIVEN incrementUsage is false + WHEN valid cached robot is returned + THEN usageCount should NOT be incremented + `, async () => { + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); + + // First call (usageCount = 1) + await repository.getRobot(origin, userAgent); + + mockStrategy.isValid.mockReturnValue(true); + + // Second call (usageCount should remain 1) + const result = await repository.getRobot(origin, userAgent, { incrementUsage: false }); + + expect(result.usageCount).toBe(1); }); }); }); From 8f4df6445111b24ea4bc26a43c9932a1606bfdc1 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 21 Jan 2026 00:25:37 +0900 Subject: [PATCH 05/10] add documentation --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 24dd5cd..bfed0e6 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,12 @@ const timeBased = { type: CachingPolicyType.ExpireAfter, duration: '1h' // Supports strings ('5m', '1d', '200ms') or numbers (milliseconds) }; + +// Option 3: Request-based Expiration +const requestBased = { + type: CachingPolicyType.RequestCount, + maxRequests: 10 // Expire after 10 requests +}; ``` ### Error Handling From 1f5a29fbda860e43809f8a74664455755b7320f9 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 21 Jan 2026 00:26:57 +0900 Subject: [PATCH 06/10] remove redundant default case --- src/domain/strategies/caching/CachingStrategyFactory.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/domain/strategies/caching/CachingStrategyFactory.ts b/src/domain/strategies/caching/CachingStrategyFactory.ts index 2c57f4f..1ff3d56 100644 --- a/src/domain/strategies/caching/CachingStrategyFactory.ts +++ b/src/domain/strategies/caching/CachingStrategyFactory.ts @@ -14,8 +14,6 @@ export class CachingStrategyFactory { return new ExpireAfterCachingStrategy(policy.duration); case CachingPolicyType.RequestCount: return new RequestCountCachingStrategy(policy.maxRequests); - default: - return new IndefiniteCachingStrategy(); } } } From e7745ac64d53f5f8d42807962971cbe0d7b47626 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 21 Jan 2026 00:48:37 +0900 Subject: [PATCH 07/10] refactor getRobot --- src/data/repositories/RobotsDataRepository.ts | 46 +++++++++++++------ .../interfaces/IRobotsDataRepository.ts | 15 +++++- src/domain/services/AllowService.ts | 1 + .../caching/CachingStrategyFactory.ts | 2 + .../usecases/CalculateWaitTimeUseCase.ts | 6 ++- 5 files changed, 53 insertions(+), 17 deletions(-) diff --git a/src/data/repositories/RobotsDataRepository.ts b/src/data/repositories/RobotsDataRepository.ts index c20f1bc..a0e6196 100644 --- a/src/data/repositories/RobotsDataRepository.ts +++ b/src/data/repositories/RobotsDataRepository.ts @@ -19,38 +19,56 @@ export class RobotsDataRepository implements IRobotsDataRepository { this.strategyFactory = new CachingStrategyFactory(); } - async getRobot( - url: string, - userAgent: string = '*', - options: { incrementUsage?: boolean; ignoreCachePolicy?: boolean } = {} - ): Promise { - const { incrementUsage = true, ignoreCachePolicy = false } = options; + async getRobot(url: string, userAgent: string = '*'): Promise { const origin = new URL(url).origin; let cached = this.cache.get(origin); const strategy = this.strategyFactory.getStrategy(this.cachingPolicy); - const isValid = cached && (ignoreCachePolicy || strategy.isValid(cached)); + const isValid = cached && strategy.isValid(cached); - if (isValid && cached) { - if (incrementUsage) { - cached.usageCount = (cached.usageCount || 0) + 1; - } + if (cached && isValid) { return cached; } const robot = await this.fetchRobotsTxt(origin, userAgent); - cached = { robot, fetchedAt: Date.now(), usageCount: 1 }; + const previousLastCrawled = cached?.lastCrawled; + + cached = { + robot, + fetchedAt: Date.now(), + usageCount: 0 + }; + + if (previousLastCrawled) { + cached.lastCrawled = previousLastCrawled; + } + this.cache.set(origin, cached); return cached; } + getCachedRobot(url: string): CachedRobot | undefined { + const origin = new URL(url).origin; + return this.cache.get(origin); + } + + incrementUsage(url: string): void { + const origin = new URL(url).origin; + const cached = this.cache.get(origin); + if (!cached) { + return; + } + cached.usageCount = (cached.usageCount || 0) + 1; + } + setLastCrawled(url: string, timestamp: number): void { const origin = new URL(url).origin; const cached = this.cache.get(origin); - if (cached) { - cached.lastCrawled = timestamp; + if (!cached) { + return; } + cached.lastCrawled = timestamp; } private async fetchRobotsTxt(origin: string, userAgent: string): Promise { diff --git a/src/domain/interfaces/IRobotsDataRepository.ts b/src/domain/interfaces/IRobotsDataRepository.ts index 8d70f00..d023c82 100644 --- a/src/domain/interfaces/IRobotsDataRepository.ts +++ b/src/domain/interfaces/IRobotsDataRepository.ts @@ -8,10 +8,21 @@ export interface IRobotsDataRepository { * Retrieves the cached robot instance for a given URL. * @param url The URL to get the robot for (used to extract the domain/origin). * @param userAgent Optional user agent to use for fetching robots.txt if not cached. - * @param options Optional retrieval options. * @returns A promise resolving to the CachedRobot containing the parsed rules. */ - getRobot(url: string, userAgent?: string, options?: { incrementUsage?: boolean; ignoreCachePolicy?: boolean; }): Promise; + getRobot(url: string, userAgent?: string): Promise; + + /** + * Retrieves the robot from cache if available, without fetching or validating against strategy. + * @param url The URL to retrieve the robot for. + */ + getCachedRobot(url: string): CachedRobot | undefined; + + /** + * Increments the usage count for the cached robot associated with the URL. + * @param url The URL identifying the domain. + */ + incrementUsage(url: string): void; /** * Updates the last crawled timestamp for the domain associated with the URL. diff --git a/src/domain/services/AllowService.ts b/src/domain/services/AllowService.ts index 580b40b..cbb8b72 100644 --- a/src/domain/services/AllowService.ts +++ b/src/domain/services/AllowService.ts @@ -6,6 +6,7 @@ export class AllowService implements IAllowService { async isAllowed(url: string, userAgent: string = '*'): Promise { const robot = await this.dataService.getRobot(url, userAgent); + this.dataService.incrementUsage(url); if (!robot || !robot.robot) { return true; diff --git a/src/domain/strategies/caching/CachingStrategyFactory.ts b/src/domain/strategies/caching/CachingStrategyFactory.ts index 1ff3d56..2c57f4f 100644 --- a/src/domain/strategies/caching/CachingStrategyFactory.ts +++ b/src/domain/strategies/caching/CachingStrategyFactory.ts @@ -14,6 +14,8 @@ export class CachingStrategyFactory { return new ExpireAfterCachingStrategy(policy.duration); case CachingPolicyType.RequestCount: return new RequestCountCachingStrategy(policy.maxRequests); + default: + return new IndefiniteCachingStrategy(); } } } diff --git a/src/domain/usecases/CalculateWaitTimeUseCase.ts b/src/domain/usecases/CalculateWaitTimeUseCase.ts index 072ec1e..87a4e48 100644 --- a/src/domain/usecases/CalculateWaitTimeUseCase.ts +++ b/src/domain/usecases/CalculateWaitTimeUseCase.ts @@ -5,7 +5,11 @@ export class CalculateWaitTimeUseCase { constructor(private dataService: IRobotsDataRepository) { } async execute(url: string, userAgent: string): Promise<{ waitTime: number; delay: number; }> { - const cachedRobot = await this.dataService.getRobot(url, userAgent, { incrementUsage: false, ignoreCachePolicy: true }); + let cachedRobot = this.dataService.getCachedRobot(url); + + if (!cachedRobot) { + cachedRobot = await this.dataService.getRobot(url, userAgent); + } if (!cachedRobot || !cachedRobot.robot) { return { waitTime: 0, delay: 0 }; From fad0a98a64699d4ec8303cc41e32d35b16806586 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 21 Jan 2026 00:49:01 +0900 Subject: [PATCH 08/10] refactor tests --- .../repositories/RobotsDataRepository.test.ts | 70 ++++++++-------- .../unit/domain/services/AllowService.test.ts | 37 +++++---- .../usecases/CalculateWaitTimeUseCase.test.ts | 82 +++++++++++++------ 3 files changed, 112 insertions(+), 77 deletions(-) diff --git a/tests/unit/data/repositories/RobotsDataRepository.test.ts b/tests/unit/data/repositories/RobotsDataRepository.test.ts index a7cacf8..db67336 100644 --- a/tests/unit/data/repositories/RobotsDataRepository.test.ts +++ b/tests/unit/data/repositories/RobotsDataRepository.test.ts @@ -34,7 +34,6 @@ describe('RobotsDataRepository', () => { getPreferredHost: jest.fn(), }); - // Setup mock strategy mockStrategy = { isValid: jest.fn() }; mockStrategyFactory.prototype.getStrategy = jest.fn().mockReturnValue(mockStrategy); }); @@ -46,14 +45,10 @@ describe('RobotsDataRepository', () => { THEN it should NOT fetch from network `, async () => { repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); - - // First call to populate cache await repository.getRobot(origin, userAgent); - // Setup validation to return true mockStrategy.isValid.mockReturnValue(true); - // Second call await repository.getRobot(origin, userAgent); expect(mockAxios.create).toHaveBeenCalledTimes(1); @@ -66,14 +61,10 @@ describe('RobotsDataRepository', () => { THEN it should fetch from network again `, async () => { repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); - - // First call to populate cache await repository.getRobot(origin, userAgent); - // Setup validation to return false mockStrategy.isValid.mockReturnValue(false); - // Second call await repository.getRobot(origin, userAgent); expect(mockAxios.create).toHaveBeenCalledTimes(2); @@ -81,59 +72,62 @@ describe('RobotsDataRepository', () => { }); test(` - GIVEN ignoreCachePolicy option is true - WHEN robots.txt is requested - THEN it should NOT check strategy validity and return cached if available + GIVEN valid cached robot + WHEN incrementUsage is called + THEN usageCount should be incremented `, async () => { repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); - // First call to populate cache - await repository.getRobot(origin, userAgent); + const cached1 = await repository.getRobot(origin, userAgent); + expect(cached1.usageCount).toBe(0); - // Clear strategy calls to verify it's NOT called - mockStrategy.isValid.mockClear(); + repository.incrementUsage(origin); + expect(cached1.usageCount).toBe(1); - // Second call with ignoreCachePolicy - await repository.getRobot(origin, userAgent, { ignoreCachePolicy: true }); + mockStrategy.isValid.mockReturnValue(true); - expect(mockAxios.create).toHaveBeenCalledTimes(1); - expect(mockStrategy.isValid).not.toHaveBeenCalled(); + const cached2 = await repository.getRobot(origin, userAgent); + expect(cached2).toBe(cached1); + expect(cached2.usageCount).toBe(1); + + repository.incrementUsage(origin); + expect(cached2.usageCount).toBe(2); }); test(` - GIVEN incrementUsage is true (default) - WHEN valid cached robot is returned - THEN usageCount should be incremented + GIVEN cached robot updates + WHEN cached robot is refreshed + THEN usageCount should reset to 0 `, async () => { repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); - // First call (usageCount = 1) - await repository.getRobot(origin, userAgent); + const cached1 = await repository.getRobot(origin, userAgent); + repository.incrementUsage(origin); + expect(cached1.usageCount).toBe(1); - mockStrategy.isValid.mockReturnValue(true); + mockStrategy.isValid.mockReturnValue(false); - // Second call (usageCount = 2) - const result = await repository.getRobot(origin, userAgent); + const cached2 = await repository.getRobot(origin, userAgent); - expect(result.usageCount).toBe(2); + expect(cached2).not.toBe(cached1); + expect(cached2.usageCount).toBe(0); }); test(` - GIVEN incrementUsage is false - WHEN valid cached robot is returned - THEN usageCount should NOT be incremented + GIVEN cached robot exists + WHEN getCachedRobot is called + THEN it should return the cached robot without validation `, async () => { repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); - // First call (usageCount = 1) - await repository.getRobot(origin, userAgent); + const cached1 = await repository.getRobot(origin, userAgent); - mockStrategy.isValid.mockReturnValue(true); + mockStrategy.isValid.mockReturnValue(false); - // Second call (usageCount should remain 1) - const result = await repository.getRobot(origin, userAgent, { incrementUsage: false }); + const result = repository.getCachedRobot(origin); - expect(result.usageCount).toBe(1); + expect(result).toBe(cached1); + expect(mockStrategy.isValid).not.toHaveBeenCalled(); }); }); }); diff --git a/tests/unit/domain/services/AllowService.test.ts b/tests/unit/domain/services/AllowService.test.ts index 0f63fa9..0fc7fe8 100644 --- a/tests/unit/domain/services/AllowService.test.ts +++ b/tests/unit/domain/services/AllowService.test.ts @@ -9,39 +9,43 @@ describe('AllowService', () => { beforeEach(() => { mockDataRepository = { getRobot: jest.fn(), + getCachedRobot: jest.fn(), + incrementUsage: jest.fn(), setLastCrawled: jest.fn(), }; service = new AllowService(mockDataRepository); }); test(` -GIVEN no robot data is found -WHEN checking if a URL is allowed -THEN it should return true (default allow) + GIVEN no robot data is found + WHEN checking if a URL is allowed + THEN it should return true (default allow) `, async () => { mockDataRepository.getRobot.mockResolvedValue(null as any); const result = await service.isAllowed('https://example.com/foo'); expect(result).toBe(true); + expect(mockDataRepository.incrementUsage).toHaveBeenCalledWith('https://example.com/foo'); }); test(` -GIVEN robot data exists but has no robot object -WHEN checking if a URL is allowed -THEN it should return true + GIVEN robot data exists but has no robot object + WHEN checking if a URL is allowed + THEN it should return true `, async () => { mockDataRepository.getRobot.mockResolvedValue({ robot: null } as unknown as CachedRobot); const result = await service.isAllowed('https://example.com/foo'); expect(result).toBe(true); + expect(mockDataRepository.incrementUsage).toHaveBeenCalledWith('https://example.com/foo'); }); test(` -GIVEN robot rules exist and allow the URL -WHEN checking if a URL is allowed -THEN it should return true + GIVEN robot rules exist and allow the URL + WHEN checking if a URL is allowed + THEN it should return true `, async () => { const mockRobot = { isAllowed: jest.fn().mockReturnValue(true) @@ -52,12 +56,13 @@ THEN it should return true expect(result).toBe(true); expect(mockRobot.isAllowed).toHaveBeenCalledWith('https://example.com/foo', '*'); + expect(mockDataRepository.incrementUsage).toHaveBeenCalledWith('https://example.com/foo'); }); test(` -GIVEN robot rules exist and disallow the URL -WHEN checking if a URL is allowed -THEN it should return false + GIVEN robot rules exist and disallow the URL + WHEN checking if a URL is allowed + THEN it should return false `, async () => { const mockRobot = { isAllowed: jest.fn().mockReturnValue(false) @@ -67,12 +72,13 @@ THEN it should return false const result = await service.isAllowed('https://example.com/private'); expect(result).toBe(false); + expect(mockDataRepository.incrementUsage).toHaveBeenCalledWith('https://example.com/private'); }); test(` -GIVEN robot rules exist but isAllowed returns undefined -WHEN checking if a URL is allowed -THEN it should return true (default to allowed) + GIVEN robot rules exist but isAllowed returns undefined + WHEN checking if a URL is allowed + THEN it should return true (default to allowed) `, async () => { const mockRobot = { isAllowed: jest.fn().mockReturnValue(undefined) @@ -82,5 +88,6 @@ THEN it should return true (default to allowed) const result = await service.isAllowed('https://example.com/foo'); expect(result).toBe(true); + expect(mockDataRepository.incrementUsage).toHaveBeenCalledWith('https://example.com/foo'); }); }); diff --git a/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts b/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts index 215dd1f..e41f004 100644 --- a/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts +++ b/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts @@ -9,27 +9,63 @@ describe('CalculateWaitTimeUseCase', () => { beforeEach(() => { mockDataRepository = { getRobot: jest.fn(), + getCachedRobot: jest.fn(), + incrementUsage: jest.fn(), setLastCrawled: jest.fn(), }; useCase = new CalculateWaitTimeUseCase(mockDataRepository); }); test(` -GIVEN no robot data -WHEN calculating wait time -THEN it should return 0 wait time + GIVEN robot is in cache + WHEN calculating wait time + THEN it should use cached robot without calling getRobot + `, async () => { + const mockRobot = { + getCrawlDelay: jest.fn().mockReturnValue(5) + }; + mockDataRepository.getCachedRobot.mockReturnValue({ + robot: mockRobot, + lastCrawled: Date.now() + } as unknown as CachedRobot); + + await useCase.execute('https://example.com', '*'); + + expect(mockDataRepository.getCachedRobot).toHaveBeenCalledWith('https://example.com'); + expect(mockDataRepository.getRobot).not.toHaveBeenCalled(); + }); + + test(` + GIVEN robot is NOT in cache + WHEN calculating wait time + THEN it should call getRobot + `, async () => { + mockDataRepository.getCachedRobot.mockReturnValue(undefined); + mockDataRepository.getRobot.mockResolvedValue({ robot: null } as any); + + await useCase.execute('https://example.com', '*'); + + expect(mockDataRepository.getCachedRobot).toHaveBeenCalledWith('https://example.com'); + expect(mockDataRepository.getRobot).toHaveBeenCalledWith('https://example.com', '*'); + }); + + test(` + GIVEN no robot data + WHEN calculating wait time + THEN it should return 0 wait time `, async () => { mockDataRepository.getRobot.mockResolvedValue(null as any); const result = await useCase.execute('https://example.com', '*'); expect(result).toEqual({ waitTime: 0, delay: 0 }); + expect(mockDataRepository.incrementUsage).not.toHaveBeenCalled(); }); test(` -GIVEN robot data with no crawl delay -WHEN calculating wait time -THEN it should return 0 wait time + GIVEN robot data with no crawl delay + WHEN calculating wait time + THEN it should return 0 wait time `, async () => { const mockRobot = { getCrawlDelay: jest.fn().mockReturnValue(undefined) @@ -42,14 +78,14 @@ THEN it should return 0 wait time }); test(` -GIVEN robot data with crawl delay but never crawled before -WHEN calculating wait time -THEN it should return 0 wait time + GIVEN robot data with crawl delay but never crawled before + WHEN calculating wait time + THEN it should return 0 wait time `, async () => { const mockRobot = { getCrawlDelay: jest.fn().mockReturnValue(5) }; - mockDataRepository.getRobot.mockResolvedValue({ + mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot, lastCrawled: undefined } as unknown as CachedRobot); @@ -60,22 +96,21 @@ THEN it should return 0 wait time }); test(` -GIVEN robot data with crawl delay and previously crawled recently -WHEN calculating wait time -THEN it should return the remaining wait time + GIVEN robot data with crawl delay and previously crawled recently + WHEN calculating wait time + THEN it should return the remaining wait time `, async () => { const delaySeconds = 2; const mockRobot = { getCrawlDelay: jest.fn().mockReturnValue(delaySeconds) }; - - // Crawled 1000ms ago, delay is 2000ms, should wait 1000ms + const now = Date.now(); const lastCrawled = now - 1000; - + jest.spyOn(Date, 'now').mockReturnValue(now); - mockDataRepository.getRobot.mockResolvedValue({ + mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot, lastCrawled: lastCrawled } as unknown as CachedRobot); @@ -87,22 +122,21 @@ THEN it should return the remaining wait time }); test(` -GIVEN robot data with crawl delay and previously crawled long ago -WHEN calculating wait time -THEN it should return 0 wait time + GIVEN robot data with crawl delay and previously crawled long ago + WHEN calculating wait time + THEN it should return 0 wait time `, async () => { const delaySeconds = 2; const mockRobot = { getCrawlDelay: jest.fn().mockReturnValue(delaySeconds) }; - - // Crawled 3000ms ago, delay is 2000ms, should wait 0ms + const now = Date.now(); const lastCrawled = now - 3000; - + jest.spyOn(Date, 'now').mockReturnValue(now); - mockDataRepository.getRobot.mockResolvedValue({ + mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot, lastCrawled: lastCrawled } as unknown as CachedRobot); From f9943c59e404ce1d2d46c590f7895ff3a052f912 Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 21 Jan 2026 00:49:23 +0900 Subject: [PATCH 09/10] bump version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index c02fb56..4c7d178 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "axios-robots", - "version": "0.3.0", + "version": "0.4.0", "description": "A lightweight Axios interceptor that enforces robots.txt compliance for web scrapers and bots", "main": "dist/index.js", "types": "dist/index.d.ts", From eba18fbfd97d2f94bc74e69befb5592d36af891a Mon Sep 17 00:00:00 2001 From: Gil Nobrega <82336674+gilnobrega@users.noreply.github.com> Date: Wed, 21 Jan 2026 01:03:40 +0900 Subject: [PATCH 10/10] improve test coverage --- src/interceptor.ts | 4 +- tests/integration/interceptor.test.ts | 70 +++++++++++++++++++ .../repositories/RobotsDataRepository.test.ts | 41 +++++++++++ tests/unit/errors/RobotsError.test.ts | 24 +++++++ 4 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 tests/unit/errors/RobotsError.test.ts diff --git a/src/interceptor.ts b/src/interceptor.ts index 630b8e3..f850bf0 100644 --- a/src/interceptor.ts +++ b/src/interceptor.ts @@ -39,7 +39,7 @@ export class RobotsInterceptor { * Intercepts Axios requests to enforce the Robots Exclusion Protocol. */ public async intercept(config: InternalAxiosRequestConfig): Promise { - if (!config.url) { + if (!config.url && !config.baseURL) { return config; } @@ -108,7 +108,7 @@ export class RobotsInterceptor { return new URL(config.url || '', config.baseURL); } - return new URL(config.url || ''); + return new URL(config.url as string); } catch (e: any) { throw new InvalidUrlError(e.message); } diff --git a/tests/integration/interceptor.test.ts b/tests/integration/interceptor.test.ts index 29179af..e4487b9 100644 --- a/tests/integration/interceptor.test.ts +++ b/tests/integration/interceptor.test.ts @@ -182,6 +182,76 @@ THEN it should send the configured User-Agent header expect(response.status).toBe(200); }); + + test(` + GIVEN a baseURL and a relative URL + WHEN a request is made + THEN it should resolve the full URL correctly + `, async () => { + nock(DOMAIN) + .get('/robots.txt') + .reply(200, `User-agent: *\nAllow: /relative`); + + nock(DOMAIN) + .get('/relative') + .reply(200, 'OK'); + + client.defaults.baseURL = DOMAIN; + const response = await client.get('/relative'); + + expect(response.status).toBe(200); + }); + + test(` + GIVEN a request with no URL + WHEN the interceptor runs + THEN it should return the config as-is + `, async () => { + const interceptor = (client.interceptors.request as any).handlers[0].fulfilled; + const config = { headers: {} }; + const result = await interceptor(config); + expect(result).toBe(config); + }); + + test(` + GIVEN a response with no config + WHEN the response interceptor runs + THEN it should return the response as-is + `, () => { + const interceptor = (client.interceptors.response as any).handlers[0].fulfilled; + const response = { data: 'ok' }; + const result = interceptor(response); + expect(result).toBe(response); + }); + + test(` + GIVEN a config with no headers + WHEN the interceptor runs + THEN it should not throw and should proceed + `, async () => { + const interceptor = (client.interceptors.request as any).handlers[0].fulfilled; + const config = { url: 'https://example.com' }; // No headers + const result = await interceptor(config); + // Should not set header, but return config + expect(result).toBe(config); + expect(config).not.toHaveProperty('headers'); + }); + + test(` + GIVEN a baseURL and an empty URL + WHEN a request is made + THEN it should resolve using just the baseURL + `, async () => { + const interceptor = (client.interceptors.request as any).handlers[0].fulfilled; + const config = { baseURL: DOMAIN, url: '' }; + + // Setup mock for the robots.txt request that the interceptor will trigger + const scope = nock(DOMAIN).get('/robots.txt').reply(200, 'User-agent: *\nAllow: /'); + + await interceptor(config); + + expect(scope.isDone()).toBe(true); + }); }); describe('Caching', () => { diff --git a/tests/unit/data/repositories/RobotsDataRepository.test.ts b/tests/unit/data/repositories/RobotsDataRepository.test.ts index db67336..15fa296 100644 --- a/tests/unit/data/repositories/RobotsDataRepository.test.ts +++ b/tests/unit/data/repositories/RobotsDataRepository.test.ts @@ -3,6 +3,7 @@ import { RobotsPluginOptions } from '../../../../src/domain/models/RobotsPluginO import { CachingStrategyFactory } from '../../../../src/domain/strategies/caching/CachingStrategyFactory'; import axios from 'axios'; import robotsParser from 'robots-parser'; +import { HEADER_USER_AGENT } from '../../../../src/constants'; jest.mock('axios'); jest.mock('robots-parser'); @@ -129,5 +130,45 @@ describe('RobotsDataRepository', () => { expect(result).toBe(cached1); expect(mockStrategy.isValid).not.toHaveBeenCalled(); }); + + test(` + GIVEN no cached robot + WHEN incrementUsage is called + THEN it should do nothing + `, async () => { + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); + + repository.incrementUsage('https://unknown.com'); + + expect(repository.getCachedRobot('https://unknown.com')).toBeUndefined(); + }); + + test(` + GIVEN no cached robot + WHEN setLastCrawled is called + THEN it should do nothing + `, async () => { + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); + + repository.setLastCrawled('https://unknown.com', Date.now()); + + expect(repository.getCachedRobot('https://unknown.com')).toBeUndefined(); + }); + + test(` + GIVEN no userAgent provided + WHEN getRobot is called + THEN it should default to wildchar + `, async () => { + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); + + await repository.getRobot(origin); + + expect(mockAxios.create).toHaveBeenCalledWith(expect.objectContaining({ + headers: expect.objectContaining({ + [HEADER_USER_AGENT]: '*' + }) + })); + }); }); }); diff --git a/tests/unit/errors/RobotsError.test.ts b/tests/unit/errors/RobotsError.test.ts new file mode 100644 index 0000000..99d28c6 --- /dev/null +++ b/tests/unit/errors/RobotsError.test.ts @@ -0,0 +1,24 @@ +import { RobotsError, ERROR_MESSAGES } from '../../../src/errors'; + +describe('RobotsError', () => { + test(` + GIVEN no message provided + WHEN initialized + THEN it should use the default error message + `, () => { + const error = new RobotsError(); + expect(error.message).toBe(ERROR_MESSAGES.DEFAULT_BLOCK); + expect(error.name).toBe('RobotsError'); + }); + + test(` + GIVEN a custom message + WHEN initialized + THEN it should use the provided message + `, () => { + const customMessage = 'Custom error'; + const error = new RobotsError(customMessage); + expect(error.message).toBe(customMessage); + expect(error.name).toBe('RobotsError'); + }); +});