diff --git a/components/ui/Autocomplete.tsx b/components/ui/Autocomplete.tsx index 36ea233c4..0b75cb1e5 100644 --- a/components/ui/Autocomplete.tsx +++ b/components/ui/Autocomplete.tsx @@ -29,7 +29,11 @@ import { MenuItem } from "@telegraph/menu"; import { Tag } from "@telegraph/tag"; import { Code, Text } from "@telegraph/typography"; -import { DocsSearchItem, EndpointSearchItem } from "@/types"; +import { + DocsSearchItem, + EndpointSearchItem, + EnhancedDocsSearchItem, +} from "@/types"; import { useInkeepModal } from "../AiChatButton"; import { useAskAi } from "../AskAiContext"; @@ -62,7 +66,9 @@ function createAskAiPrompt(query: string): string { return `Can you tell me about ${query}`; } -type ResultItem = (DocsSearchItem & BaseItem) | (EndpointSearchItem & BaseItem); +type ResultItem = + | (EnhancedDocsSearchItem & BaseItem) + | (EndpointSearchItem & BaseItem); const algoliaAppId = process.env.NEXT_PUBLIC_ALGOLIA_APP_ID || ""; const algoliaSearchApiKey = @@ -175,6 +181,9 @@ const DocsSearchResult = ({ const href = `/${item.path}`; const isApiRef = isApiReferencePath(item.path); + const enhancedItem = item as EnhancedDocsSearchItem; + const showPageTitle = !enhancedItem.isPageLevel && enhancedItem.pageTitle; + const content = ( @@ -193,6 +202,7 @@ const DocsSearchResult = ({ )} + {showPageTitle ? `${enhancedItem.pageTitle} ยท ` : ""} {item.section} @@ -823,7 +833,7 @@ const Autocomplete = () => { /> ) : ( autocomplete.setQuery("")} /> )} diff --git a/lib/content.server.ts b/lib/content.server.ts index ad0d45e03..0d1ae2ad6 100644 --- a/lib/content.server.ts +++ b/lib/content.server.ts @@ -1,16 +1,9 @@ import fs from "fs"; import path from "path"; -import algoliasearch from "algoliasearch"; -import type { FrontMatter, DocsSearchItem } from "../types"; export const CONTENT_DIR = "content/"; export const DOCS_FILE_EXTENSIONS = [".mdx", ".md"]; -/** - * This is to index our .md and .mdx file content. - * API/mAPI reference content is indexed at script/indexApisForSearch.ts. - */ - export const getAllFilesInDir = ( directory: string, files: string[] = [], @@ -31,48 +24,5 @@ export const getAllFilesInDir = ( }; export function makeIdFromPath(resourcePath) { - return resourcePath.replace(/\.mdx?$/, "").replace("/index", ""); -} - -export async function generateAlgoliaIndex(frontmatter: FrontMatter) { - const algoliaAppId = process.env.NEXT_PUBLIC_ALGOLIA_APP_ID ?? ""; - const algoliaAdminApiKey = process.env.ALGOLIA_ADMIN_API_KEY ?? ""; - const algoliaIndexName = process.env.NEXT_PUBLIC_ALGOLIA_INDEX_NAME ?? ""; - - if (algoliaAppId && algoliaAdminApiKey && algoliaIndexName) { - const client = algoliasearch(algoliaAppId, algoliaAdminApiKey); - const index = client.initIndex(algoliaIndexName); - - try { - // Notes: - // Algolia recommends saving objects in batches because of efficiency. - // Our markdown processor doesn't provide a callback to subscribe to that - // gets called after finishing with all elements. - // - // Given we only have ~40 items to be indexed right now, we are just saving - // entries one by one. - const object: DocsSearchItem = { - // The path to the page will be the identifier in Algolia. - objectID: frontmatter.id, - path: frontmatter.id, - title: frontmatter.title, - section: frontmatter.section, - // Once we add tags are added to pages, Algolia records - // will be updated with them, so we can enhance the search experience - tags: frontmatter.tags || [], - // Saving a content page, not an API endpoint - contentType: "document", - // Saving to the pages index - index: "pages", - }; - - await index.saveObject(object); - } catch (e) { - console.error(e); - } - } else { - console.info( - "Algolia configuration variables not present. Skipping indexing.", - ); - } + return resourcePath.replace(/\.mdx?$/, "").replace(/\/index$/, ""); } diff --git a/package.json b/package.json index 3da0302d4..a271ae75f 100644 --- a/package.json +++ b/package.json @@ -16,10 +16,11 @@ "generate-llms": "yarn run open-api-to-md && tsx scripts/generateApiMarkdown.ts && tsx scripts/generateLlmsTxt.ts", "generate-reference-md": "tsx scripts/generateApiMarkdown.ts", "index-apis": "tsx scripts/indexApisForSearch.ts", + "index-docs": "tsx scripts/indexDocsForSearch.ts", "open-api-to-md": "bash scripts/openApiToMd.sh", "split-specs": "tsx scripts/splitOpenApiSpec.ts", "predev": "yarn split-specs && yarn generate-llms", - "prebuild": "yarn split-specs && yarn generate-llms && yarn index-apis" + "prebuild": "yarn split-specs && yarn generate-llms && yarn index-docs && yarn index-apis" }, "dependencies": { "@algolia/autocomplete-js": "^1.6.3", diff --git a/pages/[...slug].tsx b/pages/[...slug].tsx index 00d485c78..c66cd4c52 100644 --- a/pages/[...slug].tsx +++ b/pages/[...slug].tsx @@ -14,7 +14,6 @@ import { CONTENT_DIR, DOCS_FILE_EXTENSIONS, makeIdFromPath, - generateAlgoliaIndex, } from "../lib/content.server"; import eventPayload from "../data/code/sources/eventPayload"; import datadogDashboardJson from "../content/integrations/extensions/datadog_dashboard.json"; @@ -96,9 +95,6 @@ export async function getStaticProps({ params: { slug } }) { // Extend frontmatter mdxSource.frontmatter.id = makeIdFromPath(slug.join(sep)); - // Index page in algolia - await generateAlgoliaIndex(mdxSource.frontmatter); - return { props: { source: mdxSource, sourcePath, typedocs } }; } diff --git a/scripts/indexDocsForSearch.ts b/scripts/indexDocsForSearch.ts new file mode 100644 index 000000000..e87856c26 --- /dev/null +++ b/scripts/indexDocsForSearch.ts @@ -0,0 +1,407 @@ +import fs from "fs"; +import path from "path"; +import { unified } from "unified"; +import remarkParse from "remark-parse"; +import remarkFrontmatter from "remark-frontmatter"; +import yaml from "yaml"; +import algoliasearch from "algoliasearch"; +import { loadEnvConfig } from "@next/env"; +import type { EnhancedDocsSearchItem } from "@/types"; + +// Load Next.js environment variables +const projectDir = process.cwd(); +loadEnvConfig(projectDir); + +const algoliaAppId = process.env.NEXT_PUBLIC_ALGOLIA_APP_ID ?? ""; +const algoliaAdminApiKey = process.env.ALGOLIA_ADMIN_API_KEY ?? ""; +const algoliaPagesIndexName = process.env.NEXT_PUBLIC_ALGOLIA_INDEX_NAME ?? ""; + +const CONTENT_DIR = path.join(projectDir, "content"); +const DOCS_FILE_EXTENSIONS = [".mdx", ".md"]; + +// Maximum content length per record (in characters) +// Algolia recommends keeping records small for better performance +const MAX_CONTENT_LENGTH = 2000; + +// Keep count of indexed items +let pageCount = 0; +let headingCount = 0; + +interface Heading { + level: number; + title: string; + slug: string; + content: string; +} + +interface Frontmatter { + title: string; + description?: string; + tags?: string[]; + section: string; +} + +/** + * Recursively get all files in a directory with specific extensions + */ +function getAllFilesInDir( + directory: string, + files: string[] = [], + extensions?: string[], +): string[] { + fs.readdirSync(directory).forEach((file) => { + const subpath = path.join(directory, file); + if (fs.lstatSync(subpath).isDirectory()) { + getAllFilesInDir(subpath, files, extensions); + } else { + if (!extensions || extensions.includes(path.extname(subpath))) { + files.push(subpath); + } + } + }); + + return files; +} + +/** + * Parse frontmatter from markdown content using remark + */ +async function parseFrontmatter( + markdownContent: string, +): Promise { + const file = await unified() + .use(remarkParse) + .use(remarkFrontmatter, ["yaml"]) + .parse(markdownContent); + + const yamlNode = file.children.find( + (node): node is { type: "yaml"; value: string } => node.type === "yaml", + ); + if (!yamlNode) return null; + return yaml.parse(yamlNode.value); +} + +/** + * Create a URL-friendly slug from a heading title + */ +function slugify(text: string): string { + return text + .toLowerCase() + .replace(/[^\w\s-]/g, "") // Remove non-word characters except spaces and hyphens + .replace(/\s+/g, "-") // Replace spaces with hyphens + .replace(/-+/g, "-") // Replace multiple hyphens with single + .trim(); +} + +/** + * Remove frontmatter from markdown content + */ +function removeFrontmatter(content: string): string { + // Match YAML frontmatter at the start of the file + const frontmatterRegex = /^---[\s\S]*?---\n*/; + return content.replace(frontmatterRegex, ""); +} + +/** + * Extract plain text from markdown content + * Removes JSX components, imports, code blocks, and other non-text elements + */ +function extractTextContent(mdxContent: string): string { + let content = mdxContent; + + // Remove import statements + content = content.replace(/^import\s+.*$/gm, ""); + + // Remove export statements + content = content.replace(/^export\s+.*$/gm, ""); + + // Remove code blocks (fenced) + content = content.replace(/```[\s\S]*?```/g, ""); + + // Remove inline code + content = content.replace(/`[^`]+`/g, ""); + + // Remove JSX components (self-closing and with children) + content = content.replace(/<[A-Z][^>]*\/>/g, ""); // Self-closing like + content = content.replace(/<[A-Z][^>]*>[\s\S]*?<\/[A-Z][^>]*>/g, ""); // With children + + // Remove HTML-style components + content = content.replace(/<[a-z][^>]*>[\s\S]*?<\/[a-z][^>]*>/g, ""); + + // Remove remaining HTML/JSX tags + content = content.replace(/<[^>]+>/g, ""); + + // Remove markdown images (must come before links since images contain link syntax) + content = content.replace(/!\[([^\]]*)\]\([^)]+\)/g, ""); + + // Remove markdown links but keep the text + content = content.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1"); + + // Remove markdown emphasis markers + content = content.replace(/\*\*([^*]+)\*\*/g, "$1"); // Bold + content = content.replace(/\*([^*]+)\*/g, "$1"); // Italic + content = content.replace(/__([^_]+)__/g, "$1"); // Bold + content = content.replace(/_([^_]+)_/g, "$1"); // Italic + + // Remove heading markers + content = content.replace(/^#{1,6}\s+/gm, ""); + + // Remove horizontal rules + content = content.replace(/^[-*_]{3,}$/gm, ""); + + // Remove list markers + content = content.replace(/^\s*[-*+]\s+/gm, ""); + content = content.replace(/^\s*\d+\.\s+/gm, ""); + + // Remove blockquote markers + content = content.replace(/^\s*>\s*/gm, ""); + + // Normalize whitespace + content = content.replace(/\n{3,}/g, "\n\n"); // Multiple newlines to double + content = content.replace(/[ \t]+/g, " "); // Multiple spaces to single + + return content.trim(); +} + +/** + * Extract headings with their content from markdown + */ +function extractHeadings(mdxContent: string): Heading[] { + // Remove frontmatter first + const contentWithoutFrontmatter = removeFrontmatter(mdxContent); + + const headingRegex = /^(#{2,3})\s+(.+)$/gm; + const headings: Heading[] = []; + let match; + + const matches: Array<{ index: number; level: number; title: string }> = []; + + while ((match = headingRegex.exec(contentWithoutFrontmatter)) !== null) { + matches.push({ + index: match.index, + level: match[1].length, + title: match[2].trim(), + }); + } + + // Extract content for each heading + for (let i = 0; i < matches.length; i++) { + const current = matches[i]; + const next = matches[i + 1]; + + const contentStart = + current.index + `${"#".repeat(current.level)} ${current.title}`.length; + const contentEnd = next ? next.index : contentWithoutFrontmatter.length; + const rawContent = contentWithoutFrontmatter.slice( + contentStart, + contentEnd, + ); + + const cleanContent = extractTextContent(rawContent); + + // Only include headings with meaningful content + if (cleanContent.length > 20) { + headings.push({ + level: current.level, + title: current.title, + slug: slugify(current.title), + content: cleanContent.slice(0, MAX_CONTENT_LENGTH), + }); + } + } + + return headings; +} + +/** + * Get the intro content (content before the first heading) + */ +function getIntroContent(mdxContent: string): string { + const contentWithoutFrontmatter = removeFrontmatter(mdxContent); + + // Find the first H2 or H3 heading + const firstHeadingMatch = contentWithoutFrontmatter.match(/^#{2,3}\s+/m); + + if (firstHeadingMatch && firstHeadingMatch.index !== undefined) { + const introRaw = contentWithoutFrontmatter.slice( + 0, + firstHeadingMatch.index, + ); + return extractTextContent(introRaw).slice(0, MAX_CONTENT_LENGTH); + } + + // No headings found, use all content + return extractTextContent(contentWithoutFrontmatter).slice( + 0, + MAX_CONTENT_LENGTH, + ); +} + +/** + * Convert file path to URL path + */ +function filePathToUrlPath(filePath: string): string { + return filePath + .replace(CONTENT_DIR, "") + .replace(/\.mdx?$/, "") + .replace(/\/index$/, "") // Only remove /index at end of path + .replace(/^\//, ""); // Remove leading slash for objectID +} + +/** + * Queue of items to save to Algolia + */ +const itemsToSave: EnhancedDocsSearchItem[] = []; + +async function queueItem(item: EnhancedDocsSearchItem) { + // Validate path doesn't start with / + if (item.path.startsWith("/")) { + console.error(`Path may not start with "/". Violating path: ${item.path}`); + return; + } + + console.log( + `Indexing ${item.isPageLevel ? "page" : "heading"}: ${item.title} -> ${ + item.path + }`, + ); + itemsToSave.push(item); +} + +/** + * Process a single MDX file and create search records + */ +async function processFile(filePath: string): Promise { + // Skip special directories + if ( + filePath.includes("/__mapi-reference/") || + filePath.includes("/__api-reference/") || + filePath.includes("/__cli/") + ) { + return; + } + + const content = fs.readFileSync(filePath, "utf-8"); + const frontmatter = await parseFrontmatter(content); + + if (!frontmatter || !frontmatter.title || !frontmatter.section) { + console.warn(`Skipping ${filePath}: missing required frontmatter`); + return; + } + + const urlPath = filePathToUrlPath(filePath); + + // Create page-level record + const introContent = getIntroContent(content); + const pageRecord: EnhancedDocsSearchItem = { + objectID: `page-${urlPath}`, + path: urlPath, + title: frontmatter.title, + pageTitle: frontmatter.title, + description: frontmatter.description, + content: introContent, + section: frontmatter.section, + tags: frontmatter.tags || [], + headingLevel: 0, + contentType: "document", + index: "pages", + isPageLevel: true, + }; + await queueItem(pageRecord); + pageCount++; + + // Extract and create heading-level records + const headings = extractHeadings(content); + for (const heading of headings) { + const headingPath = `${urlPath}#${heading.slug}`; + const headingRecord: EnhancedDocsSearchItem = { + objectID: `heading-${headingPath}`, + path: headingPath, + title: heading.title, + pageTitle: frontmatter.title, + content: heading.content, + section: frontmatter.section, + tags: frontmatter.tags || [], + headingLevel: heading.level, + contentType: "document", + index: "pages", + isPageLevel: false, + }; + await queueItem(headingRecord); + headingCount++; + } +} + +/** + * Main entry point + */ +async function main() { + console.log("๐Ÿ” Starting docs search indexing...\n"); + + let skipIndexing = false; + + // Check for required environment variables + if (!algoliaAppId || !algoliaAdminApiKey || !algoliaPagesIndexName) { + const missing: string[] = []; + if (!algoliaAppId) missing.push("NEXT_PUBLIC_ALGOLIA_APP_ID"); + if (!algoliaAdminApiKey) missing.push("ALGOLIA_ADMIN_API_KEY"); + if (!algoliaPagesIndexName) missing.push("NEXT_PUBLIC_ALGOLIA_INDEX_NAME"); + + console.warn( + "Missing Algolia environment variables. Continuing with script but skipping actual indexing.\n\nMissing: " + + missing.join(", "), + ); + skipIndexing = true; + } + + // Get all MDX/MD files + const files = getAllFilesInDir(CONTENT_DIR, [], DOCS_FILE_EXTENSIONS); + console.log(`Found ${files.length} content files to process\n`); + + // Process each file + for (const file of files) { + try { + await processFile(file); + } catch (error) { + console.error(`Error processing ${file}:`, error); + } + } + + console.log("\n๐Ÿ“Š Indexing summary:"); + console.log(` Pages indexed: ${pageCount}`); + console.log(` Headings indexed: ${headingCount}`); + console.log(` Total records: ${itemsToSave.length}`); + + // Save to Algolia + if (!skipIndexing && itemsToSave.length > 0) { + console.log("\n๐Ÿ“ค Uploading to Algolia..."); + + const client = algoliasearch(algoliaAppId, algoliaAdminApiKey); + const index = client.initIndex(algoliaPagesIndexName); + + // Save objects in batches (Algolia recommends batches of 1000) + const BATCH_SIZE = 1000; + for (let i = 0; i < itemsToSave.length; i += BATCH_SIZE) { + const batch = itemsToSave.slice(i, i + BATCH_SIZE); + await index.saveObjects(batch); + console.log( + ` Saved batch ${Math.floor(i / BATCH_SIZE) + 1}/${Math.ceil( + itemsToSave.length / BATCH_SIZE, + )}`, + ); + } + + console.log("\nโœ… Successfully indexed docs for search!"); + } else if (skipIndexing) { + console.log( + "\nโš ๏ธ Completed processing, but skipped Algolia upload due to missing environment variables.", + ); + } + + process.exit(0); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/types.ts b/types.ts index f3cb89ee6..71bc5ed0f 100644 --- a/types.ts +++ b/types.ts @@ -47,6 +47,24 @@ export type DocsSearchItem = { index: "pages" | "endpoints"; }; +// Enhanced search item type for improved Algolia indexing +// This extends the basic DocsSearchItem with content and heading information +export type EnhancedDocsSearchItem = { + objectID: string; // Unique ID (page-path or page-path#heading-slug) + path: string; // URL path (with optional anchor) + title: string; // Page title OR heading title + pageTitle: string; // Always the parent page title + description?: string; // From frontmatter (page-level only) + content: string; // Text content (truncated ~300-500 words) + section: string; // Top-level section (Concepts, Getting Started, etc.) + tags: string[]; // Tags from frontmatter + headingLevel: number; // 0 for page, 2 for H2, 3 for H3 + contentType: "document" | "api-reference"; + index: "pages" | "endpoints"; + // Ranking fields + isPageLevel: boolean; // True if this is a page-level record (not a heading) +}; + export type EndpointSearchItem = DocsSearchItem & { method: string; endpoint: string;