
Commit c060fca

Crawl-delay compliance (#1)
* Add support for crawl delay
* Add crawl delay mode
* Add new compliance mode; separate errors
* Split RobotsService into smaller services
* Move services
* Add crawl delay strategies
* Move files around
* Remove redundant comments
* Rename service to repository
* Bump version
* Refactor tests
* Add unit tests
1 parent 9d75276 commit c060fca

36 files changed

Lines changed: 1001 additions & 103 deletions

README.md

Lines changed: 10 additions & 2 deletions
@@ -12,6 +12,7 @@ Ensures your bot plays by the rules defined by website owners, preventing unauth
 ## Features
 
 - **🚀 Automated Compliance**: Validates every request against `robots.txt` rules (cached per origin).
+- **⏱️ Crawl-Delay**: Option to automatically wait before requests if `Crawl-delay` is specified.
 - **🛡️ Strict Mode**: invalid URLs, non-HTTP/S protocols, or unreachable `robots.txt` files (non-4xx error) block requests by default.
 - **✨ Clean Architecture**: built with maintainability and separation of concerns in mind.
 - **🔌 Plug-and-Play**: easily attaches to any Axios instance.
@@ -43,7 +44,7 @@ const client = axios.create();
 
 // Apply the interceptor
 applyRobotsInterceptor(client, {
-  userAgent: 'MyCoolBot/1.0'
+  userAgent: 'MyCoolBot/1.0',
 });
 
 async function crawl() {
@@ -81,6 +82,13 @@ Attaches the interceptor to the provided Axios instance.
 ```typescript
 interface RobotsPluginOptions {
   userAgent: string;
+  crawlDelayCompliance?: CrawlDelayComplianceMode; // default: CrawlDelayComplianceMode.Await
+}
+
+enum CrawlDelayComplianceMode {
+  Await = 'await',     // Respects delay by waiting
+  Ignore = 'ignore',   // Ignores delay
+  Failure = 'failure'  // Throws Error if delay is not met
 }
 ```
 
@@ -111,9 +119,9 @@ The interceptor throws a `RobotsError` in the following cases:
 - [x] **[RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html) Compliance**: Full support for the standard Robots Exclusion Protocol.
 - [x] **Standard Directives**: Supports `User-agent`, `Allow`, and `Disallow`.
 - [x] **Wildcards**: Supports standard path matching including `*` and `$`.
+- [x] **Crawl-delay**: The interceptor enforces `Crawl-delay` directives (automatic throttling) if configured.
 
 ### 🚧 Missing / TODO
-- [ ] **Crawl-delay**: The interceptor currently does **not** enforce `Crawl-delay` directives (automatic throttling).
 - [ ] **Sitemap**: Does not currently expose or parse `Sitemap` directives for the consumer.
 - [ ] **Cache TTL**: Caching is currently indefinite for the lifecycle of the Axios instance.
 
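Putting the two README snippets above together, opting into the new crawl-delay handling from consumer code might look like the following sketch. Whether `CrawlDelayComplianceMode` is re-exported from the package root is an assumption not confirmed by the hunks shown here.

```typescript
import axios from 'axios';
// Assumption: applyRobotsInterceptor and CrawlDelayComplianceMode are exported from the package entry point.
import { applyRobotsInterceptor, CrawlDelayComplianceMode } from 'axios-robots';

const client = axios.create();

applyRobotsInterceptor(client, {
  userAgent: 'MyCoolBot/1.0',
  // Failure mode throws instead of waiting when a request would arrive before Crawl-delay has elapsed.
  crawlDelayCompliance: CrawlDelayComplianceMode.Failure,
});
```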

package.json

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 {
   "name": "axios-robots",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "description": "A lightweight Axios interceptor that enforces robots.txt compliance for web scrapers and bots",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
import robotsParser, { Robot } from 'robots-parser';
import axios from 'axios';
import { HEADER_USER_AGENT, ROBOTS_TXT_FILENAME, ALLOW_ALL_ROBOTS_TXT_CONTENT } from '../../constants';
import { RobotsUnreachableError } from '../../errors/RobotsUnreachableError';
import { IRobotsDataRepository } from '../../domain/interfaces/IRobotsDataRepository';
import { CachedRobot } from '../../domain/models/CachedRobot';

export class RobotsDataRepository implements IRobotsDataRepository {
  private cache: Map<string, CachedRobot> = new Map();

  async getRobot(url: string, userAgent: string = '*'): Promise<CachedRobot> {
    const origin = new URL(url).origin;
    let cached = this.cache.get(origin);

    if (cached)
      return cached;

    const robot = await this.fetchRobotsTxt(origin, userAgent);
    cached = { robot };
    this.cache.set(origin, cached);

    return cached;
  }

  setLastCrawled(url: string, timestamp: number): void {
    const origin = new URL(url).origin;
    const cached = this.cache.get(origin);
    if (cached) {
      cached.lastCrawled = timestamp;
    }
  }

  private async fetchRobotsTxt(origin: string, userAgent: string): Promise<Robot> {
    const robotsUrl = `${origin}/${ROBOTS_TXT_FILENAME}`;

    const internalClient = axios.create({
      headers: {
        [HEADER_USER_AGENT]: userAgent,
      }
    });

    try {
      const response = await internalClient.get(robotsUrl);
      return robotsParser(robotsUrl, response.data);
    } catch (error: any) {
      if (this.isUnavailable(error)) {
        return robotsParser(robotsUrl, ALLOW_ALL_ROBOTS_TXT_CONTENT);
      }

      throw new RobotsUnreachableError(error.message);
    }
  }

  private isUnavailable(error: any): boolean {
    return error.response && error.response.status >= 400 && error.response.status < 500;
  }
}
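A brief, hypothetical usage sketch of the repository above (not part of the commit): entries are keyed by origin, so repeated lookups for the same host reuse the first robots.txt fetch, and `setLastCrawled` records the timestamp that crawl-delay checks compare against. The URL and user agent are placeholders.

```typescript
// Hypothetical usage; pageUrl and the user agent are placeholders.
async function example(): Promise<void> {
  const repo = new RobotsDataRepository();
  const pageUrl = 'https://example.com/some/page';

  // First call fetches the origin's robots.txt; later calls for the same origin hit the cache.
  const { robot } = await repo.getRobot(pageUrl, 'MyCoolBot/1.0');

  if (robot.isAllowed(pageUrl, 'MyCoolBot/1.0')) {
    // Record when we actually crawled, so Crawl-delay checks have a reference point.
    repo.setLastCrawled(pageUrl, Date.now());
  }
}
```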

src/domain/RobotsService.ts

Lines changed: 0 additions & 52 deletions
This file was deleted.
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
/**
 * Service for checking if a URL is allowed to be crawled according to robots.txt rules.
 */
export interface IAllowService {
  /**
   * Checks if the given URL is allowed for the specified user agent.
   * @param url The URL to check.
   * @param userAgent The user agent to check against.
   * @returns A promise resolving to true if allowed, false otherwise.
   */
  isAllowed(url: string, userAgent?: string): Promise<boolean>;
}
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode';

/**
 * Service for handling Crawl-delay directives from robots.txt.
 */
export interface ICrawlDelayService {
  /**
   * Enforces the crawl delay for a given URL based on the compliance mode.
   * @param url The URL about to be requested.
   * @param userAgent The user agent to check rules for.
   * @param complianceMode The mode determining how to handle the delay (Await, Ignore, Failure).
   * @returns A promise that resolves when it is safe to proceed (or throws if in Failure mode).
   */
  handleCrawlDelay(url: string, userAgent: string, complianceMode: CrawlDelayComplianceMode): Promise<void>;
}
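For orientation, here is a minimal sketch of how one strategy implementing this interface could behave, assuming it is backed by the `IRobotsDataRepository` shown below and robots-parser's `getCrawlDelay()`. The class name, file locations, and wiring are illustrative assumptions, not the commit's actual strategy classes.

```typescript
import { CrawlDelayComplianceMode } from '../models/CrawlDelayComplianceMode';
import { IRobotsDataRepository } from './IRobotsDataRepository';
import { ICrawlDelayService } from './ICrawlDelayService';

// Illustrative sketch only; names and import paths are assumptions.
export class SimpleCrawlDelayService implements ICrawlDelayService {
  constructor(private readonly repository: IRobotsDataRepository) {}

  async handleCrawlDelay(url: string, userAgent: string, complianceMode: CrawlDelayComplianceMode): Promise<void> {
    if (complianceMode === CrawlDelayComplianceMode.Ignore) return;

    const cached = await this.repository.getRobot(url, userAgent);
    const delaySeconds = cached.robot.getCrawlDelay(userAgent) ?? 0;
    if (delaySeconds <= 0 || cached.lastCrawled === undefined) return;

    const waitMs = cached.lastCrawled + delaySeconds * 1000 - Date.now();
    if (waitMs <= 0) return;

    if (complianceMode === CrawlDelayComplianceMode.Failure) {
      // Failure mode: the delay window has not elapsed, so refuse the request.
      throw new Error(`Crawl-delay of ${delaySeconds}s not yet elapsed for ${new URL(url).origin}`);
    }

    // Await mode: sleep until the delay window has passed.
    await new Promise<void>((resolve) => setTimeout(resolve, waitMs));
  }
}
```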
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
import { CachedRobot } from '../models/CachedRobot';

/**
 * Repository for managing robots.txt data and crawl timestamps independently of the protocol logic.
 */
export interface IRobotsDataRepository {
  /**
   * Retrieves the cached robot instance for a given URL.
   * @param url The URL to get the robot for (used to extract the domain/origin).
   * @param userAgent Optional user agent to use for fetching robots.txt if not cached.
   * @returns A promise resolving to the CachedRobot containing the parsed rules.
   */
  getRobot(url: string, userAgent?: string): Promise<CachedRobot>;

  /**
   * Updates the last crawled timestamp for the domain associated with the URL.
   * @param url The URL identifying the domain.
   * @param timestamp The timestamp to set.
   */
  setLastCrawled(url: string, timestamp: number): void;
}

src/domain/models/CachedRobot.ts

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
import { Robot } from 'robots-parser';

export interface CachedRobot {
  /**
   * The parsed robots.txt object.
   */
  robot: Robot;
  /**
   * Timestamp of the last crawl for this domain.
   */
  lastCrawled?: number;
}
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
/**
 * Modes for handling the Crawl-delay directive from robots.txt.
 */
export enum CrawlDelayComplianceMode {
  /**
   * Respects the Crawl-delay directive by waiting before making the request.
   */
  Await = 'await',
  /**
   * Ignores the Crawl-delay directive.
   */
  Ignore = 'ignore',
  /**
   * Throws an error if the request violates the Crawl-delay.
   */
  Failure = 'failure'
}
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
import { CrawlDelayComplianceMode } from './CrawlDelayComplianceMode';

export interface RobotsPluginOptions {
  /**
   * The User-Agent string to use when checking robots.txt rules.
   */
  userAgent: string;
  /**
   * How to handle Crawl-delay directives.
   * Defaults to CrawlDelayComplianceMode.Await.
   */
  crawlDelayCompliance?: CrawlDelayComplianceMode;
}
