Skip to content
This repository was archived by the owner on Feb 14, 2026. It is now read-only.

Commit 31a8b3c

Browse files
Chris Arter authored and claude committed
Add website crawling and single-file URL support for refdocs add
Adds two new ingestion modes to `refdocs add`:

- `--crawl` flag to spider a docs website, extract content with Readability, and convert to markdown with Turndown
- Direct file URL support for llms.txt-style single files

Includes 150ms request throttle, path-prefix scoping, 200 page default limit, and discriminated Source union (github/file/crawl) for tracking. Also indexes .txt files alongside .md. Bumps to v0.5.0.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e96c1e9 commit 31a8b3c

11 files changed

Lines changed: 1500 additions & 79 deletions

File tree

package-lock.json

Lines changed: 870 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@dynamik-dev/refdocs",
3-
"version": "0.4.0",
3+
"version": "0.5.0",
44
"type": "module",
55
"description": "Local CLI tool that indexes markdown documentation and exposes fast fuzzy search with intelligent chunking",
66
"main": "dist/src/index.js",
@@ -24,16 +24,22 @@
2424
"url": "https://github.com/dynamik-dev/refdoc-cli.git"
2525
},
2626
"dependencies": {
27+
"@mozilla/readability": "^0.6.0",
28+
"cheerio": "^1.2.0",
2729
"commander": "^13.1.0",
30+
"jsdom": "^28.0.0",
2831
"mdast-util-from-markdown": "^2.0.2",
2932
"minisearch": "^7.1.1",
3033
"picomatch": "^4.0.2",
31-
"tar-stream": "^3.1.7"
34+
"tar-stream": "^3.1.7",
35+
"turndown": "^7.2.2"
3236
},
3337
"devDependencies": {
38+
"@types/jsdom": "^27.0.0",
3439
"@types/node": "^22.0.0",
3540
"@types/picomatch": "^3.0.1",
3641
"@types/tar-stream": "^3.1.4",
42+
"@types/turndown": "^5.0.6",
3743
"tsx": "^4.19.0",
3844
"typescript": "^5.7.3",
3945
"vitest": "^3.0.5"

src/add.ts

Lines changed: 141 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,15 @@ import { createGunzip } from "node:zlib";
66
import tar from "tar-stream";
77
import { parseGitHubUrl, downloadTarball } from "./github.js";
88
import { saveConfig } from "./config.js";
9-
import type { RefdocsConfig, Source } from "./types.js";
9+
import {
10+
isGitHubUrl,
11+
isTextFileUrl,
12+
deriveLocalPath,
13+
deriveCrawlDir,
14+
fetchSingleFile,
15+
crawlSite,
16+
} from "./crawl.js";
17+
import type { RefdocsConfig, Source, GitHubSource, FileSource, CrawlSource } from "./types.js";
1018

1119
export interface AddOptions {
1220
path?: string;
@@ -25,7 +33,13 @@ export interface UpdateResult {
2533
filesWritten: number;
2634
}
2735

28-
export async function addFromUrl(
36+
/** Options accepted by `addFromCrawl`. */
export interface CrawlAddOptions {
  /** Local directory (relative to the config dir) to store crawled pages; derived from the URL when omitted. */
  path?: string;
  /** Maximum number of pages to crawl — presumably the crawler applies a default (commit notes say 200); confirm in crawl.ts. */
  maxPages?: number;
  /** Maximum link depth from the start URL; crawler default applies when omitted — confirm in crawl.ts. */
  depth?: number;
}
41+
42+
export async function addFromGitHub(
2943
url: string,
3044
options: AddOptions,
3145
configDir: string,
@@ -47,7 +61,8 @@ export async function addFromUrl(
4761
join(configDir, localPath),
4862
);
4963

50-
const source: Source = {
64+
const source: GitHubSource = {
65+
type: "github",
5166
url,
5267
owner: parsed.owner,
5368
repo: parsed.repo,
@@ -68,6 +83,74 @@ export async function addFromUrl(
6883
return { filesWritten, localPath, source };
6984
}
7085

86+
export async function addFromFileUrl(
87+
url: string,
88+
options: { path?: string },
89+
configDir: string,
90+
config: RefdocsConfig,
91+
): Promise<AddResult> {
92+
const localPath = options.path ?? deriveLocalPath(url);
93+
const fullPath = join(configDir, localPath);
94+
95+
const { content } = await fetchSingleFile(url);
96+
97+
mkdirSync(dirname(fullPath), { recursive: true });
98+
writeFileSync(fullPath, content, "utf-8");
99+
100+
const source: FileSource = {
101+
type: "file",
102+
url,
103+
localPath,
104+
addedAt: new Date().toISOString(),
105+
};
106+
107+
// Add the parent directory to paths (not the file itself)
108+
const pathDir = dirname(localPath);
109+
const paths = isPathCovered(config.paths, pathDir)
110+
? config.paths
111+
: [...config.paths, pathDir];
112+
113+
const sources = upsertSource(config.sources ?? [], source);
114+
115+
saveConfig({ paths, sources }, configDir);
116+
117+
return { filesWritten: 1, localPath, source };
118+
}
119+
120+
export async function addFromCrawl(
121+
url: string,
122+
options: CrawlAddOptions,
123+
configDir: string,
124+
config: RefdocsConfig,
125+
): Promise<AddResult> {
126+
const localPath = options.path ?? deriveCrawlDir(url);
127+
const outputDir = join(configDir, localPath);
128+
129+
const result = await crawlSite(url, outputDir, {
130+
maxPages: options.maxPages,
131+
depth: options.depth,
132+
});
133+
134+
const source: CrawlSource = {
135+
type: "crawl",
136+
url,
137+
scope: url,
138+
localPath,
139+
pagesCrawled: result.filesWritten,
140+
addedAt: new Date().toISOString(),
141+
};
142+
143+
const paths = isPathCovered(config.paths, localPath)
144+
? config.paths
145+
: [...config.paths, localPath];
146+
147+
const sources = upsertSource(config.sources ?? [], source);
148+
149+
saveConfig({ paths, sources }, configDir);
150+
151+
return { filesWritten: result.filesWritten, localPath, source };
152+
}
153+
71154
export async function updateSources(
72155
config: RefdocsConfig,
73156
configDir: string,
@@ -80,14 +163,48 @@ export async function updateSources(
80163

81164
const results: UpdateResult[] = [];
82165
for (const source of sources) {
83-
const ref = source.branch === "HEAD" ? undefined : source.branch;
84-
const tarball = await downloadTarball(source.owner, source.repo, ref, token);
85-
const filesWritten = await extractMarkdownFiles(
86-
Buffer.from(tarball),
87-
source.subpath,
88-
join(configDir, source.localPath),
89-
);
90-
results.push({ source, filesWritten });
166+
switch (source.type) {
167+
case "github": {
168+
const ref = source.branch === "HEAD" ? undefined : source.branch;
169+
const tarball = await downloadTarball(source.owner, source.repo, ref, token);
170+
const filesWritten = await extractMarkdownFiles(
171+
Buffer.from(tarball),
172+
source.subpath,
173+
join(configDir, source.localPath),
174+
);
175+
results.push({ source, filesWritten });
176+
break;
177+
}
178+
case "file": {
179+
const { content } = await fetchSingleFile(source.url);
180+
const fullPath = join(configDir, source.localPath);
181+
mkdirSync(dirname(fullPath), { recursive: true });
182+
writeFileSync(fullPath, content, "utf-8");
183+
results.push({ source, filesWritten: 1 });
184+
break;
185+
}
186+
case "crawl": {
187+
const outputDir = join(configDir, source.localPath);
188+
const crawlResult = await crawlSite(source.url, outputDir);
189+
results.push({ source, filesWritten: crawlResult.filesWritten });
190+
break;
191+
}
192+
default: {
193+
// Handle legacy sources without a type field
194+
const legacy = source as Record<string, unknown>;
195+
if (typeof legacy.owner === "string" && typeof legacy.repo === "string") {
196+
const ref = (legacy.branch as string) === "HEAD" ? undefined : legacy.branch as string;
197+
const tarball = await downloadTarball(legacy.owner as string, legacy.repo as string, ref, token);
198+
const filesWritten = await extractMarkdownFiles(
199+
Buffer.from(tarball),
200+
(legacy.subpath as string) ?? "",
201+
join(configDir, (legacy.localPath as string) ?? ""),
202+
);
203+
results.push({ source, filesWritten });
204+
}
205+
break;
206+
}
207+
}
91208
}
92209

93210
return results;
@@ -227,9 +344,18 @@ export function isPathCovered(existingPaths: string[], newPath: string): boolean
227344
}
228345

229346
function upsertSource(sources: Source[], newSource: Source): Source[] {
230-
const key = `${newSource.owner}/${newSource.repo}/${newSource.subpath}`;
231-
const filtered = sources.filter(
232-
(s) => `${s.owner}/${s.repo}/${s.subpath}` !== key,
233-
);
347+
const key = sourceKey(newSource);
348+
const filtered = sources.filter((s) => sourceKey(s) !== key);
234349
return [...filtered, newSource];
235350
}
351+
352+
function sourceKey(source: Source): string {
353+
switch (source.type) {
354+
case "github":
355+
return `github:${source.owner}/${source.repo}/${source.subpath}`;
356+
case "file":
357+
return `file:${source.url}`;
358+
case "crawl":
359+
return `crawl:${source.url}`;
360+
}
361+
}

src/chunker.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ function extractText(node: Content): string {
250250
function fileTitle(filePath?: string): string {
251251
if (!filePath) return "Untitled";
252252
const name = filePath.split("/").pop() || filePath;
253-
return name.replace(/\.md$/i, "");
253+
return name.replace(/\.(md|txt)$/i, "");
254254
}
255255

256256
function makeChunk(

0 commit comments

Comments
 (0)