diff --git a/README.md b/README.md index 24dd5cd..bfed0e6 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,12 @@ const timeBased = { type: CachingPolicyType.ExpireAfter, duration: '1h' // Supports strings ('5m', '1d', '200ms') or numbers (milliseconds) }; + +// Option 3: Request-based Expiration +const requestBased = { + type: CachingPolicyType.RequestCount, + maxRequests: 10 // Expire after 10 requests +}; ``` ### Error Handling diff --git a/package.json b/package.json index c02fb56..4c7d178 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "axios-robots", - "version": "0.3.0", + "version": "0.4.0", "description": "A lightweight Axios interceptor that enforces robots.txt compliance for web scrapers and bots", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/src/data/repositories/RobotsDataRepository.ts b/src/data/repositories/RobotsDataRepository.ts index 8dfca54..a0e6196 100644 --- a/src/data/repositories/RobotsDataRepository.ts +++ b/src/data/repositories/RobotsDataRepository.ts @@ -23,23 +23,52 @@ export class RobotsDataRepository implements IRobotsDataRepository { const origin = new URL(url).origin; let cached = this.cache.get(origin); - if (cached && this.strategyFactory.getStrategy(this.cachingPolicy).isValid(cached)) { + const strategy = this.strategyFactory.getStrategy(this.cachingPolicy); + const isValid = cached && strategy.isValid(cached); + + if (cached && isValid) { return cached; } const robot = await this.fetchRobotsTxt(origin, userAgent); - cached = { robot, fetchedAt: Date.now() }; + const previousLastCrawled = cached?.lastCrawled; + + cached = { + robot, + fetchedAt: Date.now(), + usageCount: 0 + }; + + if (previousLastCrawled) { + cached.lastCrawled = previousLastCrawled; + } + this.cache.set(origin, cached); return cached; } + getCachedRobot(url: string): CachedRobot | undefined { + const origin = new URL(url).origin; + return this.cache.get(origin); + } + + incrementUsage(url: string): void { + const origin = new URL(url).origin; + const cached = this.cache.get(origin); + if (!cached) { + return; + } + cached.usageCount = (cached.usageCount || 0) + 1; + } + setLastCrawled(url: string, timestamp: number): void { const origin = new URL(url).origin; const cached = this.cache.get(origin); - if (cached) { - cached.lastCrawled = timestamp; + if (!cached) { + return; } + cached.lastCrawled = timestamp; } private async fetchRobotsTxt(origin: string, userAgent: string): Promise { diff --git a/src/domain/interfaces/IRobotsDataRepository.ts b/src/domain/interfaces/IRobotsDataRepository.ts index 851d534..d023c82 100644 --- a/src/domain/interfaces/IRobotsDataRepository.ts +++ b/src/domain/interfaces/IRobotsDataRepository.ts @@ -12,6 +12,18 @@ export interface IRobotsDataRepository { */ getRobot(url: string, userAgent?: string): Promise; + /** + * Retrieves the robot from cache if available, without fetching or validating against strategy. + * @param url The URL to retrieve the robot for. + */ + getCachedRobot(url: string): CachedRobot | undefined; + + /** + * Increments the usage count for the cached robot associated with the URL. + * @param url The URL identifying the domain. + */ + incrementUsage(url: string): void; + /** * Updates the last crawled timestamp for the domain associated with the URL. * @param url The URL identifying the domain. diff --git a/src/domain/models/CachedRobot.ts b/src/domain/models/CachedRobot.ts index 982d40e..ba5485e 100644 --- a/src/domain/models/CachedRobot.ts +++ b/src/domain/models/CachedRobot.ts @@ -13,4 +13,8 @@ export interface CachedRobot { * Timestamp of when the robots.txt was fetched. */ fetchedAt: number; + /** + * Number of times this cached robot has been accessed. + */ + usageCount?: number; } diff --git a/src/domain/models/CachingPolicy.ts b/src/domain/models/CachingPolicy.ts index ffcf9d7..1fcd309 100644 --- a/src/domain/models/CachingPolicy.ts +++ b/src/domain/models/CachingPolicy.ts @@ -1,6 +1,6 @@ import { CachingPolicyType } from './CachingPolicyType'; -export type CachingPolicy = IndefiniteCachingPolicy | ExpireAfterCachingPolicy; +export type CachingPolicy = IndefiniteCachingPolicy | ExpireAfterCachingPolicy | RequestCountCachingPolicy; export interface IndefiniteCachingPolicy { type: CachingPolicyType.Indefinite; @@ -13,3 +13,12 @@ export interface ExpireAfterCachingPolicy { */ duration: string | number; } + +export interface RequestCountCachingPolicy { + type: CachingPolicyType.RequestCount; + /** + * Maximum number of requests before the cache expires. + */ + maxRequests: number; +} + diff --git a/src/domain/models/CachingPolicyType.ts b/src/domain/models/CachingPolicyType.ts index 0eb5bd5..c252d73 100644 --- a/src/domain/models/CachingPolicyType.ts +++ b/src/domain/models/CachingPolicyType.ts @@ -9,5 +9,9 @@ export enum CachingPolicyType { /** * Cache robots.txt data for a specific duration. */ - ExpireAfter = 'expireAfter' + ExpireAfter = 'expireAfter', + /** + * Cache robots.txt data for a specific number of requests. + */ + RequestCount = 'requestCount' } diff --git a/src/domain/services/AllowService.ts b/src/domain/services/AllowService.ts index 580b40b..cbb8b72 100644 --- a/src/domain/services/AllowService.ts +++ b/src/domain/services/AllowService.ts @@ -6,6 +6,7 @@ export class AllowService implements IAllowService { async isAllowed(url: string, userAgent: string = '*'): Promise { const robot = await this.dataService.getRobot(url, userAgent); + this.dataService.incrementUsage(url); if (!robot || !robot.robot) { return true; diff --git a/src/domain/strategies/caching/CachingStrategyFactory.ts b/src/domain/strategies/caching/CachingStrategyFactory.ts index 745c6a3..2c57f4f 100644 --- a/src/domain/strategies/caching/CachingStrategyFactory.ts +++ b/src/domain/strategies/caching/CachingStrategyFactory.ts @@ -3,6 +3,7 @@ import { CachingPolicyType } from '../../models/CachingPolicyType'; import { ICachingStrategy } from './ICachingStrategy'; import { IndefiniteCachingStrategy } from './IndefiniteCachingStrategy'; import { ExpireAfterCachingStrategy } from './ExpireAfterCachingStrategy'; +import { RequestCountCachingStrategy } from './RequestCountCachingStrategy'; export class CachingStrategyFactory { getStrategy(policy: CachingPolicy): ICachingStrategy { @@ -11,6 +12,8 @@ export class CachingStrategyFactory { return new IndefiniteCachingStrategy(); case CachingPolicyType.ExpireAfter: return new ExpireAfterCachingStrategy(policy.duration); + case CachingPolicyType.RequestCount: + return new RequestCountCachingStrategy(policy.maxRequests); default: return new IndefiniteCachingStrategy(); } diff --git a/src/domain/strategies/caching/RequestCountCachingStrategy.ts b/src/domain/strategies/caching/RequestCountCachingStrategy.ts new file mode 100644 index 0000000..1b31b98 --- /dev/null +++ b/src/domain/strategies/caching/RequestCountCachingStrategy.ts @@ -0,0 +1,14 @@ +import { ICachingStrategy } from './ICachingStrategy'; +import { CachedRobot } from '../../models/CachedRobot'; + +export class RequestCountCachingStrategy implements ICachingStrategy { + private maxRequests: number; + + constructor(maxRequests: number) { + this.maxRequests = maxRequests; + } + + isValid(cached: CachedRobot): boolean { + return (cached.usageCount || 0) < this.maxRequests; + } +} diff --git a/src/domain/usecases/CalculateWaitTimeUseCase.ts b/src/domain/usecases/CalculateWaitTimeUseCase.ts index 9d0f640..87a4e48 100644 --- a/src/domain/usecases/CalculateWaitTimeUseCase.ts +++ b/src/domain/usecases/CalculateWaitTimeUseCase.ts @@ -5,7 +5,11 @@ export class CalculateWaitTimeUseCase { constructor(private dataService: IRobotsDataRepository) { } async execute(url: string, userAgent: string): Promise<{ waitTime: number; delay: number; }> { - const cachedRobot = await this.dataService.getRobot(url, userAgent); + let cachedRobot = this.dataService.getCachedRobot(url); + + if (!cachedRobot) { + cachedRobot = await this.dataService.getRobot(url, userAgent); + } if (!cachedRobot || !cachedRobot.robot) { return { waitTime: 0, delay: 0 }; diff --git a/src/interceptor.ts b/src/interceptor.ts index 630b8e3..f850bf0 100644 --- a/src/interceptor.ts +++ b/src/interceptor.ts @@ -39,7 +39,7 @@ export class RobotsInterceptor { * Intercepts Axios requests to enforce the Robots Exclusion Protocol. */ public async intercept(config: InternalAxiosRequestConfig): Promise { - if (!config.url) { + if (!config.url && !config.baseURL) { return config; } @@ -108,7 +108,7 @@ export class RobotsInterceptor { return new URL(config.url || '', config.baseURL); } - return new URL(config.url || ''); + return new URL(config.url as string); } catch (e: any) { throw new InvalidUrlError(e.message); } diff --git a/tests/integration/caching.test.ts b/tests/integration/caching.test.ts index db22ff5..2df27d0 100644 --- a/tests/integration/caching.test.ts +++ b/tests/integration/caching.test.ts @@ -110,6 +110,69 @@ describe('Caching Policy Integration', () => { await client.get(`${DOMAIN}/second`); + expect(robotsScope.isDone()).toBe(true); + }); + test(` + GIVEN a requestCount caching policy of 2 requests + WHEN a third request is made + THEN robots.txt should be fetched again + `, async () => { + const initialTime = 1672531200000; + jest.spyOn(Date, 'now').mockReturnValue(initialTime); + + client = axios.create(); + applyRobotsInterceptor(client, { + userAgent: USER_AGENT, + cachingPolicy: { + type: CachingPolicyType.RequestCount, + maxRequests: 2 + } + }); + + const robotsScope = nock(DOMAIN) + .get('/robots.txt') + .times(2) + .reply(200, `User-agent: *\nAllow: /`); + + nock(DOMAIN).get('/first').reply(200, 'OK'); + nock(DOMAIN).get('/second').reply(200, 'OK'); + nock(DOMAIN).get('/third').reply(200, 'OK'); + + await client.get(`${DOMAIN}/first`); + await client.get(`${DOMAIN}/second`); + await client.get(`${DOMAIN}/third`); + + expect(robotsScope.isDone()).toBe(true); + }); + + test(` + GIVEN a requestCount caching policy of 2 requests + WHEN a second request is made + THEN robots.txt should NOT be fetched again + `, async () => { + const initialTime = 1672531200000; + jest.spyOn(Date, 'now').mockReturnValue(initialTime); + + client = axios.create(); + applyRobotsInterceptor(client, { + userAgent: USER_AGENT, + cachingPolicy: { + type: CachingPolicyType.RequestCount, + maxRequests: 2 + } + }); + + const robotsScope = nock(DOMAIN) + .get('/robots.txt') + .times(1) + .reply(200, `User-agent: *\nAllow: /`); + + nock(DOMAIN).get('/first').reply(200, 'OK'); + nock(DOMAIN).get('/second').reply(200, 'OK'); + + await client.get(`${DOMAIN}/first`); + await client.get(`${DOMAIN}/second`); + expect(robotsScope.isDone()).toBe(true); }); }); diff --git a/tests/integration/interceptor.test.ts b/tests/integration/interceptor.test.ts index 29179af..e4487b9 100644 --- a/tests/integration/interceptor.test.ts +++ b/tests/integration/interceptor.test.ts @@ -182,6 +182,76 @@ THEN it should send the configured User-Agent header expect(response.status).toBe(200); }); + + test(` + GIVEN a baseURL and a relative URL + WHEN a request is made + THEN it should resolve the full URL correctly + `, async () => { + nock(DOMAIN) + .get('/robots.txt') + .reply(200, `User-agent: *\nAllow: /relative`); + + nock(DOMAIN) + .get('/relative') + .reply(200, 'OK'); + + client.defaults.baseURL = DOMAIN; + const response = await client.get('/relative'); + + expect(response.status).toBe(200); + }); + + test(` + GIVEN a request with no URL + WHEN the interceptor runs + THEN it should return the config as-is + `, async () => { + const interceptor = (client.interceptors.request as any).handlers[0].fulfilled; + const config = { headers: {} }; + const result = await interceptor(config); + expect(result).toBe(config); + }); + + test(` + GIVEN a response with no config + WHEN the response interceptor runs + THEN it should return the response as-is + `, () => { + const interceptor = (client.interceptors.response as any).handlers[0].fulfilled; + const response = { data: 'ok' }; + const result = interceptor(response); + expect(result).toBe(response); + }); + + test(` + GIVEN a config with no headers + WHEN the interceptor runs + THEN it should not throw and should proceed + `, async () => { + const interceptor = (client.interceptors.request as any).handlers[0].fulfilled; + const config = { url: 'https://example.com' }; // No headers + const result = await interceptor(config); + // Should not set header, but return config + expect(result).toBe(config); + expect(config).not.toHaveProperty('headers'); + }); + + test(` + GIVEN a baseURL and an empty URL + WHEN a request is made + THEN it should resolve using just the baseURL + `, async () => { + const interceptor = (client.interceptors.request as any).handlers[0].fulfilled; + const config = { baseURL: DOMAIN, url: '' }; + + // Setup mock for the robots.txt request that the interceptor will trigger + const scope = nock(DOMAIN).get('/robots.txt').reply(200, 'User-agent: *\nAllow: /'); + + await interceptor(config); + + expect(scope.isDone()).toBe(true); + }); }); describe('Caching', () => { diff --git a/tests/unit/data/repositories/RobotsDataRepository.test.ts b/tests/unit/data/repositories/RobotsDataRepository.test.ts index e47a9db..15fa296 100644 --- a/tests/unit/data/repositories/RobotsDataRepository.test.ts +++ b/tests/unit/data/repositories/RobotsDataRepository.test.ts @@ -1,25 +1,31 @@ import { RobotsDataRepository } from '../../../../src/data/repositories/RobotsDataRepository'; import { RobotsPluginOptions } from '../../../../src/domain/models/RobotsPluginOptions'; -import { CachingPolicyType } from '../../../../src/domain/models/CachingPolicyType'; +import { CachingStrategyFactory } from '../../../../src/domain/strategies/caching/CachingStrategyFactory'; import axios from 'axios'; import robotsParser from 'robots-parser'; +import { HEADER_USER_AGENT } from '../../../../src/constants'; jest.mock('axios'); jest.mock('robots-parser'); +jest.mock('../../../../src/domain/strategies/caching/CachingStrategyFactory'); const mockAxios = axios as unknown as jest.Mocked; const mockRobotsParser = robotsParser as unknown as jest.MockedFunction; +const mockStrategyFactory = CachingStrategyFactory as unknown as jest.MockedClass; describe('RobotsDataRepository', () => { let repository: RobotsDataRepository; const origin = 'https://example.com'; const userAgent = 'test-bot'; + let mockStrategy: { isValid: jest.Mock; }; beforeEach(() => { jest.clearAllMocks(); + mockAxios.create.mockReturnValue({ get: jest.fn().mockResolvedValue({ data: 'User-agent: *\nDisallow: /' }) } as any); + mockRobotsParser.mockReturnValue({ isAllowed: jest.fn(), isDisallowed: jest.fn(), @@ -28,72 +34,141 @@ describe('RobotsDataRepository', () => { getSitemaps: jest.fn(), getPreferredHost: jest.fn(), }); + + mockStrategy = { isValid: jest.fn() }; + mockStrategyFactory.prototype.getStrategy = jest.fn().mockReturnValue(mockStrategy); }); describe('Caching Behavior', () => { test(` - GIVEN an indefinite (default) caching policy - WHEN robots.txt is requested twice with a long time gap - THEN it should only fetch from the network once + GIVEN strategy says cache is valid + WHEN robots.txt is requested + THEN it should NOT fetch from network `, async () => { repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); - const future = Date.now() + 100 * 24 * 60 * 60 * 1000; - await repository.getRobot(origin, userAgent); - jest.spyOn(Date, 'now').mockReturnValue(future); + + mockStrategy.isValid.mockReturnValue(true); + await repository.getRobot(origin, userAgent); expect(mockAxios.create).toHaveBeenCalledTimes(1); + expect(mockStrategy.isValid).toHaveBeenCalled(); }); test(` - GIVEN an expireAfter caching policy - WHEN robots.txt is requested after the expiration duration - THEN it should fetch from the network again + GIVEN strategy says cache is invalid + WHEN robots.txt is requested + THEN it should fetch from network again `, async () => { - const duration = '5m'; - const durationMs = 5 * 60 * 1000; - repository = new RobotsDataRepository({ - userAgent, - cachingPolicy: { - type: CachingPolicyType.ExpireAfter, - duration - } - } as RobotsPluginOptions); - const initialTime = 1000; - const expiredTime = initialTime + durationMs + 1; - jest.spyOn(Date, 'now').mockReturnValue(initialTime); - + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); await repository.getRobot(origin, userAgent); - jest.spyOn(Date, 'now').mockReturnValue(expiredTime); + + mockStrategy.isValid.mockReturnValue(false); + await repository.getRobot(origin, userAgent); expect(mockAxios.create).toHaveBeenCalledTimes(2); + expect(mockStrategy.isValid).toHaveBeenCalled(); }); test(` - GIVEN an expireAfter caching policy - WHEN robots.txt is requested before the expiration duration - THEN it should return the cached data without refetching + GIVEN valid cached robot + WHEN incrementUsage is called + THEN usageCount should be incremented `, async () => { - const duration = 5 * 60 * 1000; - const durationMs = duration; - repository = new RobotsDataRepository({ - userAgent, - cachingPolicy: { - type: CachingPolicyType.ExpireAfter, - duration - } - } as RobotsPluginOptions); - const initialTime = 1000; - const validTime = initialTime + durationMs - 1; - jest.spyOn(Date, 'now').mockReturnValue(initialTime); + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); - await repository.getRobot(origin, userAgent); - jest.spyOn(Date, 'now').mockReturnValue(validTime); - await repository.getRobot(origin, userAgent); + const cached1 = await repository.getRobot(origin, userAgent); + expect(cached1.usageCount).toBe(0); - expect(mockAxios.create).toHaveBeenCalledTimes(1); + repository.incrementUsage(origin); + expect(cached1.usageCount).toBe(1); + + mockStrategy.isValid.mockReturnValue(true); + + const cached2 = await repository.getRobot(origin, userAgent); + expect(cached2).toBe(cached1); + expect(cached2.usageCount).toBe(1); + + repository.incrementUsage(origin); + expect(cached2.usageCount).toBe(2); + }); + + test(` + GIVEN cached robot updates + WHEN cached robot is refreshed + THEN usageCount should reset to 0 + `, async () => { + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); + + const cached1 = await repository.getRobot(origin, userAgent); + repository.incrementUsage(origin); + expect(cached1.usageCount).toBe(1); + + mockStrategy.isValid.mockReturnValue(false); + + const cached2 = await repository.getRobot(origin, userAgent); + + expect(cached2).not.toBe(cached1); + expect(cached2.usageCount).toBe(0); + }); + + test(` + GIVEN cached robot exists + WHEN getCachedRobot is called + THEN it should return the cached robot without validation + `, async () => { + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); + + const cached1 = await repository.getRobot(origin, userAgent); + + mockStrategy.isValid.mockReturnValue(false); + + const result = repository.getCachedRobot(origin); + + expect(result).toBe(cached1); + expect(mockStrategy.isValid).not.toHaveBeenCalled(); + }); + + test(` + GIVEN no cached robot + WHEN incrementUsage is called + THEN it should do nothing + `, async () => { + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); + + repository.incrementUsage('https://unknown.com'); + + expect(repository.getCachedRobot('https://unknown.com')).toBeUndefined(); + }); + + test(` + GIVEN no cached robot + WHEN setLastCrawled is called + THEN it should do nothing + `, async () => { + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); + + repository.setLastCrawled('https://unknown.com', Date.now()); + + expect(repository.getCachedRobot('https://unknown.com')).toBeUndefined(); + }); + + test(` + GIVEN no userAgent provided + WHEN getRobot is called + THEN it should default to wildchar + `, async () => { + repository = new RobotsDataRepository({ userAgent } as RobotsPluginOptions); + + await repository.getRobot(origin); + + expect(mockAxios.create).toHaveBeenCalledWith(expect.objectContaining({ + headers: expect.objectContaining({ + [HEADER_USER_AGENT]: '*' + }) + })); }); }); }); diff --git a/tests/unit/domain/services/AllowService.test.ts b/tests/unit/domain/services/AllowService.test.ts index 0f63fa9..0fc7fe8 100644 --- a/tests/unit/domain/services/AllowService.test.ts +++ b/tests/unit/domain/services/AllowService.test.ts @@ -9,39 +9,43 @@ describe('AllowService', () => { beforeEach(() => { mockDataRepository = { getRobot: jest.fn(), + getCachedRobot: jest.fn(), + incrementUsage: jest.fn(), setLastCrawled: jest.fn(), }; service = new AllowService(mockDataRepository); }); test(` -GIVEN no robot data is found -WHEN checking if a URL is allowed -THEN it should return true (default allow) + GIVEN no robot data is found + WHEN checking if a URL is allowed + THEN it should return true (default allow) `, async () => { mockDataRepository.getRobot.mockResolvedValue(null as any); const result = await service.isAllowed('https://example.com/foo'); expect(result).toBe(true); + expect(mockDataRepository.incrementUsage).toHaveBeenCalledWith('https://example.com/foo'); }); test(` -GIVEN robot data exists but has no robot object -WHEN checking if a URL is allowed -THEN it should return true + GIVEN robot data exists but has no robot object + WHEN checking if a URL is allowed + THEN it should return true `, async () => { mockDataRepository.getRobot.mockResolvedValue({ robot: null } as unknown as CachedRobot); const result = await service.isAllowed('https://example.com/foo'); expect(result).toBe(true); + expect(mockDataRepository.incrementUsage).toHaveBeenCalledWith('https://example.com/foo'); }); test(` -GIVEN robot rules exist and allow the URL -WHEN checking if a URL is allowed -THEN it should return true + GIVEN robot rules exist and allow the URL + WHEN checking if a URL is allowed + THEN it should return true `, async () => { const mockRobot = { isAllowed: jest.fn().mockReturnValue(true) @@ -52,12 +56,13 @@ THEN it should return true expect(result).toBe(true); expect(mockRobot.isAllowed).toHaveBeenCalledWith('https://example.com/foo', '*'); + expect(mockDataRepository.incrementUsage).toHaveBeenCalledWith('https://example.com/foo'); }); test(` -GIVEN robot rules exist and disallow the URL -WHEN checking if a URL is allowed -THEN it should return false + GIVEN robot rules exist and disallow the URL + WHEN checking if a URL is allowed + THEN it should return false `, async () => { const mockRobot = { isAllowed: jest.fn().mockReturnValue(false) @@ -67,12 +72,13 @@ THEN it should return false const result = await service.isAllowed('https://example.com/private'); expect(result).toBe(false); + expect(mockDataRepository.incrementUsage).toHaveBeenCalledWith('https://example.com/private'); }); test(` -GIVEN robot rules exist but isAllowed returns undefined -WHEN checking if a URL is allowed -THEN it should return true (default to allowed) + GIVEN robot rules exist but isAllowed returns undefined + WHEN checking if a URL is allowed + THEN it should return true (default to allowed) `, async () => { const mockRobot = { isAllowed: jest.fn().mockReturnValue(undefined) @@ -82,5 +88,6 @@ THEN it should return true (default to allowed) const result = await service.isAllowed('https://example.com/foo'); expect(result).toBe(true); + expect(mockDataRepository.incrementUsage).toHaveBeenCalledWith('https://example.com/foo'); }); }); diff --git a/tests/unit/domain/strategies/caching/CachingStrategyFactory.test.ts b/tests/unit/domain/strategies/caching/CachingStrategyFactory.test.ts new file mode 100644 index 0000000..950d043 --- /dev/null +++ b/tests/unit/domain/strategies/caching/CachingStrategyFactory.test.ts @@ -0,0 +1,62 @@ +import { CachingStrategyFactory } from '../../../../../src/domain/strategies/caching/CachingStrategyFactory'; +import { IndefiniteCachingStrategy } from '../../../../../src/domain/strategies/caching/IndefiniteCachingStrategy'; +import { ExpireAfterCachingStrategy } from '../../../../../src/domain/strategies/caching/ExpireAfterCachingStrategy'; +import { RequestCountCachingStrategy } from '../../../../../src/domain/strategies/caching/RequestCountCachingStrategy'; +import { CachingPolicyType } from '../../../../../src/domain/models/CachingPolicyType'; + +describe('CachingStrategyFactory', () => { + let factory: CachingStrategyFactory; + + beforeEach(() => { + factory = new CachingStrategyFactory(); + }); + + test(` + GIVEN an indefinite caching policy + WHEN requesting a strategy + THEN it should return an IndefiniteCachingStrategy instance + `, () => { + const policy = { type: CachingPolicyType.Indefinite } as const; + + const strategy = factory.getStrategy(policy); + + expect(strategy).toBeInstanceOf(IndefiniteCachingStrategy); + }); + + test(` + GIVEN an expireAfter caching policy + WHEN requesting a strategy + THEN it should return an ExpireAfterCachingStrategy instance + `, () => { + const policy = { type: CachingPolicyType.ExpireAfter, duration: '1h' } as const; + + const strategy = factory.getStrategy(policy); + + expect(strategy).toBeInstanceOf(ExpireAfterCachingStrategy); + }); + + test(` + GIVEN a requestCount caching policy + WHEN requesting a strategy + THEN it should return a RequestCountCachingStrategy instance + `, () => { + const policy = { type: CachingPolicyType.RequestCount, maxRequests: 5 } as const; + + const strategy = factory.getStrategy(policy); + + expect(strategy).toBeInstanceOf(RequestCountCachingStrategy); + }); + + test(` + GIVEN an unknown caching policy type + WHEN requesting a strategy + THEN it should return an IndefiniteCachingStrategy instance (default) + `, () => { + // Casting to any to simulate invalid input that might bypass type checking or come from external sources + const policy = { type: 'unknown' as any }; + + const strategy = factory.getStrategy(policy); + + expect(strategy).toBeInstanceOf(IndefiniteCachingStrategy); + }); +}); diff --git a/tests/unit/domain/strategies/caching/ExpireAfterCachingStrategy.test.ts b/tests/unit/domain/strategies/caching/ExpireAfterCachingStrategy.test.ts new file mode 100644 index 0000000..04af5d0 --- /dev/null +++ b/tests/unit/domain/strategies/caching/ExpireAfterCachingStrategy.test.ts @@ -0,0 +1,70 @@ +import { ExpireAfterCachingStrategy } from '../../../../../src/domain/strategies/caching/ExpireAfterCachingStrategy'; +import { CachedRobot } from '../../../../../src/domain/models/CachedRobot'; + +describe('ExpireAfterCachingStrategy', () => { + + test(` + GIVEN cached data is not expired + WHEN validating cached robot + THEN it should return true + `, () => { + const duration = 1000; + const strategy = new ExpireAfterCachingStrategy(duration); + const now = Date.now(); + jest.spyOn(Date, 'now').mockReturnValue(now); + + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: now - 500 + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(true); + }); + + test(` + GIVEN cached data is expired + WHEN validating cached robot + THEN it should return false + `, () => { + const duration = 1000; + const strategy = new ExpireAfterCachingStrategy(duration); + const now = Date.now(); + jest.spyOn(Date, 'now').mockReturnValue(now); + + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: now - 1500 + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(false); + }); + + test(` + GIVEN duration is provided as a string + WHEN validating cached robot + THEN it should parse the string and validate correctly + `, () => { + const duration = '1s'; + const strategy = new ExpireAfterCachingStrategy(duration); + const now = Date.now(); + jest.spyOn(Date, 'now').mockReturnValue(now); + + // Not expired (500ms < 1s) + const cachedValid: CachedRobot = { + robot: {} as any, + fetchedAt: now - 500 + }; + expect(strategy.isValid(cachedValid)).toBe(true); + + // Expired (1500ms > 1s) + const cachedExpired: CachedRobot = { + robot: {} as any, + fetchedAt: now - 1500 + }; + expect(strategy.isValid(cachedExpired)).toBe(false); + }); +}); diff --git a/tests/unit/domain/strategies/caching/IndefiniteCachingStrategy.test.ts b/tests/unit/domain/strategies/caching/IndefiniteCachingStrategy.test.ts new file mode 100644 index 0000000..bf2172f --- /dev/null +++ b/tests/unit/domain/strategies/caching/IndefiniteCachingStrategy.test.ts @@ -0,0 +1,25 @@ +import { IndefiniteCachingStrategy } from '../../../../../src/domain/strategies/caching/IndefiniteCachingStrategy'; +import { CachedRobot } from '../../../../../src/domain/models/CachedRobot'; + +describe('IndefiniteCachingStrategy', () => { + let strategy: IndefiniteCachingStrategy; + + beforeEach(() => { + strategy = new IndefiniteCachingStrategy(); + }); + + test(` + GIVEN any cached data + WHEN validating cached robot + THEN it should always return true + `, () => { + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: Date.now() - 10000000 // Old data + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(true); + }); +}); diff --git a/tests/unit/domain/strategies/caching/RequestCountCachingStrategy.test.ts b/tests/unit/domain/strategies/caching/RequestCountCachingStrategy.test.ts new file mode 100644 index 0000000..54902d2 --- /dev/null +++ b/tests/unit/domain/strategies/caching/RequestCountCachingStrategy.test.ts @@ -0,0 +1,74 @@ +import { RequestCountCachingStrategy } from '../../../../../src/domain/strategies/caching/RequestCountCachingStrategy'; +import { CachedRobot } from '../../../../../src/domain/models/CachedRobot'; + +describe('RequestCountCachingStrategy', () => { + let strategy: RequestCountCachingStrategy; + const maxRequests = 5; + + beforeEach(() => { + strategy = new RequestCountCachingStrategy(maxRequests); + }); + + test(` + GIVEN usage count is less than max requests + WHEN validating cached robot + THEN it should return true + `, () => { + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: Date.now(), + usageCount: 4 + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(true); + }); + + test(` + GIVEN usage count is equal to max requests + WHEN validating cached robot + THEN it should return false + `, () => { + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: Date.now(), + usageCount: 5 + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(false); + }); + + test(` + GIVEN usage count is greater than max requests + WHEN validating cached robot + THEN it should return false + `, () => { + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: Date.now(), + usageCount: 6 + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(false); + }); + + test(` + GIVEN usage count is undefined + WHEN validating cached robot + THEN it should treat it as 0 and return true + `, () => { + const cached: CachedRobot = { + robot: {} as any, + fetchedAt: Date.now() + }; + + const result = strategy.isValid(cached); + + expect(result).toBe(true); + }); +}); diff --git a/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts b/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts index 215dd1f..e41f004 100644 --- a/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts +++ b/tests/unit/domain/usecases/CalculateWaitTimeUseCase.test.ts @@ -9,27 +9,63 @@ describe('CalculateWaitTimeUseCase', () => { beforeEach(() => { mockDataRepository = { getRobot: jest.fn(), + getCachedRobot: jest.fn(), + incrementUsage: jest.fn(), setLastCrawled: jest.fn(), }; useCase = new CalculateWaitTimeUseCase(mockDataRepository); }); test(` -GIVEN no robot data -WHEN calculating wait time -THEN it should return 0 wait time + GIVEN robot is in cache + WHEN calculating wait time + THEN it should use cached robot without calling getRobot + `, async () => { + const mockRobot = { + getCrawlDelay: jest.fn().mockReturnValue(5) + }; + mockDataRepository.getCachedRobot.mockReturnValue({ + robot: mockRobot, + lastCrawled: Date.now() + } as unknown as CachedRobot); + + await useCase.execute('https://example.com', '*'); + + expect(mockDataRepository.getCachedRobot).toHaveBeenCalledWith('https://example.com'); + expect(mockDataRepository.getRobot).not.toHaveBeenCalled(); + }); + + test(` + GIVEN robot is NOT in cache + WHEN calculating wait time + THEN it should call getRobot + `, async () => { + mockDataRepository.getCachedRobot.mockReturnValue(undefined); + mockDataRepository.getRobot.mockResolvedValue({ robot: null } as any); + + await useCase.execute('https://example.com', '*'); + + expect(mockDataRepository.getCachedRobot).toHaveBeenCalledWith('https://example.com'); + expect(mockDataRepository.getRobot).toHaveBeenCalledWith('https://example.com', '*'); + }); + + test(` + GIVEN no robot data + WHEN calculating wait time + THEN it should return 0 wait time `, async () => { mockDataRepository.getRobot.mockResolvedValue(null as any); const result = await useCase.execute('https://example.com', '*'); expect(result).toEqual({ waitTime: 0, delay: 0 }); + expect(mockDataRepository.incrementUsage).not.toHaveBeenCalled(); }); test(` -GIVEN robot data with no crawl delay -WHEN calculating wait time -THEN it should return 0 wait time + GIVEN robot data with no crawl delay + WHEN calculating wait time + THEN it should return 0 wait time `, async () => { const mockRobot = { getCrawlDelay: jest.fn().mockReturnValue(undefined) @@ -42,14 +78,14 @@ THEN it should return 0 wait time }); test(` -GIVEN robot data with crawl delay but never crawled before -WHEN calculating wait time -THEN it should return 0 wait time + GIVEN robot data with crawl delay but never crawled before + WHEN calculating wait time + THEN it should return 0 wait time `, async () => { const mockRobot = { getCrawlDelay: jest.fn().mockReturnValue(5) }; - mockDataRepository.getRobot.mockResolvedValue({ + mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot, lastCrawled: undefined } as unknown as CachedRobot); @@ -60,22 +96,21 @@ THEN it should return 0 wait time }); test(` -GIVEN robot data with crawl delay and previously crawled recently -WHEN calculating wait time -THEN it should return the remaining wait time + GIVEN robot data with crawl delay and previously crawled recently + WHEN calculating wait time + THEN it should return the remaining wait time `, async () => { const delaySeconds = 2; const mockRobot = { getCrawlDelay: jest.fn().mockReturnValue(delaySeconds) }; - - // Crawled 1000ms ago, delay is 2000ms, should wait 1000ms + const now = Date.now(); const lastCrawled = now - 1000; - + jest.spyOn(Date, 'now').mockReturnValue(now); - mockDataRepository.getRobot.mockResolvedValue({ + mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot, lastCrawled: lastCrawled } as unknown as CachedRobot); @@ -87,22 +122,21 @@ THEN it should return the remaining wait time }); test(` -GIVEN robot data with crawl delay and previously crawled long ago -WHEN calculating wait time -THEN it should return 0 wait time + GIVEN robot data with crawl delay and previously crawled long ago + WHEN calculating wait time + THEN it should return 0 wait time `, async () => { const delaySeconds = 2; const mockRobot = { getCrawlDelay: jest.fn().mockReturnValue(delaySeconds) }; - - // Crawled 3000ms ago, delay is 2000ms, should wait 0ms + const now = Date.now(); const lastCrawled = now - 3000; - + jest.spyOn(Date, 'now').mockReturnValue(now); - mockDataRepository.getRobot.mockResolvedValue({ + mockDataRepository.getRobot.mockResolvedValue({ robot: mockRobot, lastCrawled: lastCrawled } as unknown as CachedRobot); diff --git a/tests/unit/errors/RobotsError.test.ts b/tests/unit/errors/RobotsError.test.ts new file mode 100644 index 0000000..99d28c6 --- /dev/null +++ b/tests/unit/errors/RobotsError.test.ts @@ -0,0 +1,24 @@ +import { RobotsError, ERROR_MESSAGES } from '../../../src/errors'; + +describe('RobotsError', () => { + test(` + GIVEN no message provided + WHEN initialized + THEN it should use the default error message + `, () => { + const error = new RobotsError(); + expect(error.message).toBe(ERROR_MESSAGES.DEFAULT_BLOCK); + expect(error.name).toBe('RobotsError'); + }); + + test(` + GIVEN a custom message + WHEN initialized + THEN it should use the provided message + `, () => { + const customMessage = 'Custom error'; + const error = new RobotsError(customMessage); + expect(error.message).toBe(customMessage); + expect(error.name).toBe('RobotsError'); + }); +});