import { describe, it, expect } from "vitest";
import { safeFetchText } from "../utils/ragUtilities.js";

/**
 * Build a minimal stand-in for a fetch `Response` that exposes only what
 * `safeFetchText` touches: `headers.get()` and `body.getReader()`.
 *
 * @param {object} options
 * @param {string} [options.contentType] - Value returned for `content-type`.
 * @param {string} [options.contentLength] - Value returned for `content-length`.
 * @param {Array<string|Uint8Array>} [options.bodyChunks] - Chunks yielded by the
 *   reader, in order. Strings are UTF-8 encoded; byte arrays pass through as-is.
 * @returns {{ headers: { get(name: string): (string|null) }, body: object }}
 */
function createMockResponse({ contentType, contentLength, bodyChunks = [] }) {
  const encoder = new TextEncoder();
  const chunks = bodyChunks.map((chunk) =>
    typeof chunk === "string" ? encoder.encode(chunk) : chunk,
  );
  let index = 0;

  return {
    headers: {
      get(name) {
        const key = String(name).toLowerCase();
        // Real `Headers.get` yields null (never undefined) for absent headers.
        if (key === "content-type") return contentType ?? null;
        if (key === "content-length") return contentLength ?? null;
        return null;
      },
    },
    body: {
      getReader() {
        return {
          async read() {
            if (index >= chunks.length) {
              return { done: true, value: undefined };
            }
            return { done: false, value: chunks[index++] };
          },
          async cancel() {
            // Cancelling drops whatever has not been read yet.
            index = chunks.length;
          },
        };
      },
    },
  };
}

describe("safeFetchText", () => {
  it("rejects payloads larger than the configured limit", async () => {
    const response = createMockResponse({
      contentType: "text/html; charset=utf-8",
      contentLength: String(5 * 1024 * 1024 + 1),
      bodyChunks: ["too big"],
    });

    await expect(safeFetchText(response)).rejects.toThrow("Payload too large");
  });

  it("rejects bodies that exceed the limit while streaming", async () => {
    // No content-length header: only the byte counter in the read loop can
    // catch this. Two 3 MB chunks cross the 5 MB limit on the second read.
    const threeMegabytes = 3 * 1024 * 1024;
    const response = createMockResponse({
      contentType: "text/html",
      bodyChunks: [
        new Uint8Array(threeMegabytes),
        new Uint8Array(threeMegabytes),
      ],
    });

    await expect(safeFetchText(response)).rejects.toThrow("Payload too large");
  });

  it("rejects unsupported content types", async () => {
    const response = createMockResponse({
      contentType: "application/octet-stream",
      contentLength: "12",
      bodyChunks: [new Uint8Array([1, 2, 3])],
    });

    await expect(safeFetchText(response)).rejects.toThrow("Unsupported content type");
  });

  it("rejects responses without a readable body", async () => {
    const response = {
      ...createMockResponse({ contentType: "text/plain" }),
      body: null,
    };

    await expect(safeFetchText(response)).rejects.toThrow(
      "Unable to read response body",
    );
  });

  it("accepts valid html responses", async () => {
    const response = createMockResponse({
      contentType: "text/html; charset=utf-8",
      contentLength: "5",
      bodyChunks: ["Hello"],
    });

    await expect(safeFetchText(response)).resolves.toBe("Hello");
  });

  it("reassembles multi-byte characters split across chunks", async () => {
    // "é" is 0xC3 0xA9 in UTF-8; the two bytes arrive in separate chunks.
    const response = createMockResponse({
      contentType: "text/plain",
      bodyChunks: [
        new Uint8Array([0x63, 0x61, 0x66, 0xc3]),
        new Uint8Array([0xa9]),
      ],
    });

    await expect(safeFetchText(response)).resolves.toBe("café");
  });
});
/** Upper bound on the number of body bytes `safeFetchText` will buffer (5 MB). */
const MAX_FETCH_SIZE_BYTES = 5 * 1024 * 1024;

/** Media types (lower-case, parameters stripped) that `safeFetchText` accepts. */
const ALLOWED_CONTENT_TYPES = [
  "text/html",
  "application/xhtml+xml",
  "text/plain",
];

/**
 * Read a fetch-style response body as UTF-8 text, refusing anything that is
 * not an allowed textual media type or that exceeds MAX_FETCH_SIZE_BYTES.
 *
 * The size limit is enforced twice: once against the declared
 * `content-length` (cheap early exit) and again against the bytes actually
 * received, because the header may be absent or wrong.
 *
 * @param {{ headers: { get(name: string): (string|null) }, body?: object|null }} response
 *   Object exposing `headers.get()` and a `body` with `getReader()`.
 * @returns {Promise<string>} The body decoded as UTF-8.
 * @throws {Error} "Unsupported content type: …" when the media type is not allowed.
 * @throws {Error} "Payload too large" when the declared or received size exceeds the limit.
 * @throws {Error} "Unable to read response body" when no reader can be obtained.
 */
async function safeFetchText(response) {
  const contentType = response.headers.get("content-type") || "";

  // Best-effort cancellation of a body we are not going to read, so an early
  // rejection does not leave the unread stream (and its connection) dangling.
  // Failures here are deliberately ignored: the caller must see the rejection
  // reason, not a secondary cancellation error.
  const discardBody = async () => {
    try {
      await response.body?.cancel?.();
    } catch {
      // Nothing useful to do; the original rejection is thrown by the caller.
    }
  };

  // Compare the bare media type exactly. A substring test would also accept
  // values such as `application/json; profile="text/html"` or `text/html-x`.
  // Splitting on "," as well tolerates duplicated headers joined by fetch.
  const mediaType = contentType.split(/[;,]/, 1)[0].trim().toLowerCase();

  if (!ALLOWED_CONTENT_TYPES.includes(mediaType)) {
    await discardBody();
    throw new Error(`Unsupported content type: ${contentType}`);
  }

  // A missing header becomes 0 and a malformed one becomes NaN; neither trips
  // this check, so the streaming counter below remains the real guard.
  const declaredLength = Number(response.headers.get("content-length"));

  if (Number.isFinite(declaredLength) && declaredLength > MAX_FETCH_SIZE_BYTES) {
    await discardBody();
    throw new Error("Payload too large");
  }

  const reader = response.body?.getReader();

  if (!reader) {
    throw new Error("Unable to read response body");
  }

  const chunks = [];
  let totalBytes = 0;

  for (;;) {
    const { done, value } = await reader.read();

    if (done) break;

    totalBytes += value.length;

    if (totalBytes > MAX_FETCH_SIZE_BYTES) {
      try {
        await reader.cancel();
      } catch {
        // A failing cancel must not mask the size-limit error below.
      }
      throw new Error("Payload too large");
    }

    chunks.push(Buffer.from(value));
  }

  // Decode once over the concatenated bytes so multi-byte characters that
  // straddle a chunk boundary are reassembled correctly.
  return Buffer.concat(chunks).toString("utf8");
}
getRobotsCrawlDelayMs(parser, config.userAgent), @@ -298,7 +354,7 @@ async function fetchCrawlText(urlString) { throw new Error(`Failed to fetch ${urlString}: HTTP ${response.status}`); } - return response.text(); + return safeFetchText(response); }); } @@ -556,6 +612,7 @@ export { isValidDocUrl, scrapeWebpage, scrapeTitle, + safeFetchText, generateVectorEmbeddings, getCrawlConfig, parseRobotsTxt,