diff --git a/.prettierrc b/.prettierrc new file mode 100644 index 0000000..5a7704c --- /dev/null +++ b/.prettierrc @@ -0,0 +1,6 @@ +{ + "singleQuote": false, + "trailingComma": "es5", + "tabWidth": 2, + "semi": true +} diff --git a/package-lock.json b/package-lock.json index b2fdc11..228fee6 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,15 +1,17 @@ { "name": "@notionpresso/api-sdk", - "version": "0.0.1", + "version": "0.0.2", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@notionpresso/api-sdk", - "version": "0.0.1", + "version": "0.0.2", "license": "ISC", "dependencies": { - "@notionhq/client": "^2.2.3" + "@notionhq/client": "^2.2.3", + "@types/cheerio": "^0.22.35", + "cheerio": "1.0.0-rc.12" }, "devDependencies": { "cp": "^0.2.0", @@ -29,6 +31,14 @@ "node": ">=12" } }, + "node_modules/@types/cheerio": { + "version": "0.22.35", + "resolved": "https://registry.npmjs.org/@types/cheerio/-/cheerio-0.22.35.tgz", + "integrity": "sha512-yD57BchKRvTV+JD53UZ6PD8KWY5g5rvvMLRnZR3EQBCZXiDT/HR+pKpMzFGlWNhFrXlo7VPZXtKvIEwZkAWOIA==", + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/node": { "version": "22.7.7", "resolved": "https://registry.npmjs.org/@types/node/-/node-22.7.7.tgz", @@ -54,6 +64,47 @@ "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", "license": "MIT" }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==" + }, + "node_modules/cheerio": { + "version": "1.0.0-rc.12", + "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.12.tgz", + "integrity": "sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==", + "dependencies": { + "cheerio-select": "^2.1.0", + "dom-serializer": "^2.0.0", + "domhandler": "^5.0.3", + 
"domutils": "^3.0.1", + "htmlparser2": "^8.0.1", + "parse5": "^7.0.0", + "parse5-htmlparser2-tree-adapter": "^7.0.0" + }, + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/cheeriojs/cheerio?sponsor=1" + } + }, + "node_modules/cheerio-select": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz", + "integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==", + "dependencies": { + "boolbase": "^1.0.0", + "css-select": "^5.1.0", + "css-what": "^6.1.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, "node_modules/combined-stream": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", @@ -73,6 +124,32 @@ "dev": true, "license": "MIT" }, + "node_modules/css-select": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz", + "integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-what": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz", + "integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, "node_modules/delayed-stream": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", @@ -82,6 +159,68 @@ "node": ">=0.4.0" } }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": 
"https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.2", + "entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ] + }, + "node_modules/domhandler": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz", + "integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==", + "dependencies": { + "dom-serializer": "^2.0.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/form-data": { "version": "4.0.1", "resolved": 
"https://registry.npmjs.org/form-data/-/form-data-4.0.1.tgz", @@ -96,6 +235,24 @@ "node": ">= 6" } }, + "node_modules/htmlparser2": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz", + "integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1", + "entities": "^4.4.0" + } + }, "node_modules/mime-db": { "version": "1.52.0", "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", @@ -137,6 +294,40 @@ } } }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } + }, + "node_modules/parse5": { + "version": "7.2.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.2.1.tgz", + "integrity": "sha512-BuBYQYlv1ckiPdQi/ohiivi9Sagc9JG+Ozs0r7b/0iK3sKmrb0b9FdWdBbOdx6hBCM/F9Ir82ofnBhtZOjCRPQ==", + "dependencies": { + "entities": "^4.5.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-htmlparser2-tree-adapter": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz", + "integrity": "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==", + "dependencies": { + "domhandler": "^5.0.3", + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/tr46": { "version": 
/**
 * Downloads the document at `url` and returns its body as text.
 *
 * The request is sent with a fixed `NotionDump` User-Agent header.
 *
 * @param url - URL to request.
 * @returns The response body as text.
 * @throws Error whose message starts with "Error fetching URL: " when the
 *   request rejects, the response is not OK, or the body cannot be read. The
 *   underlying failure is attached to the thrown error as `cause`.
 */
export async function fetchUrl(url: string): Promise<string> {
  try {
    const response = await fetch(url, {
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; NotionDump/1.0)",
      },
    });

    if (!response.ok) {
      throw new Error(`Failed to fetch URL: ${response.status}`);
    }

    return await response.text();
  } catch (error) {
    // Anything can be thrown, so narrow before reading `.message`; a blind
    // cast would report "undefined" for non-Error values.
    const reason = error instanceof Error ? error.message : String(error);
    const wrapped: Error & { cause?: unknown } = new Error(
      `Error fetching URL: ${reason}`
    );
    wrapped.cause = error;
    throw wrapped;
  }
}
/**
 * Returns the trimmed text of the document's `<title>`, or `undefined` when
 * that text is empty.
 */
function extractTitleFromHtml(cheerioHTML: cheerio.Root): string | undefined {
  const title = cheerioHTML("title").text().trim();
  return title || undefined;
}

/**
 * Collects the document's `<meta>` tags into a flat key/value map.
 *
 * The key is the tag's `name` attribute (or `property` when `name` is absent)
 * with a leading `og:` or `twitter:` prefix removed, so `og:title` and
 * `twitter:title` both land on `title`. Tags lacking a key or `content` are
 * skipped; a later tag overwrites an earlier one with the same key.
 */
function extractMetaTagsFromHtml(
  cheerioHTML: cheerio.Root
): Record<string, string> {
  const metaTags: Record<string, string> = {};

  cheerioHTML("meta").each((_, element) => {
    const meta = cheerioHTML(element);
    const name = meta.attr("name") || meta.attr("property");
    const content = meta.attr("content");

    if (name && content) {
      const key = name.replace(/^(og:|twitter:)/, "");
      metaTags[key] = content;
    }
  });

  return metaTags;
}

/**
 * Resolves the page's favicon to an absolute URL.
 *
 * Uses the `href` of the first `icon`, `shortcut icon` or `apple-touch-icon`
 * link, resolved against the page URL. Falls back to `<origin>/favicon.ico`
 * when no such link exists, its `href` is blank, or it cannot be resolved.
 *
 * @param cheerioHTML - Loaded document to search.
 * @param url - URL of the page; used as the base for resolution.
 * @throws Error prefixed with "Error extracting favicon: " when `url` itself
 *   is not a valid URL (or querying the document fails).
 */
function extractFaviconFromHtml(
  cheerioHTML: cheerio.Root,
  url: string
): string {
  try {
    const pageUrl = new URL(url);
    const fallback = `${pageUrl.origin}/favicon.ico`;

    const iconLinks = cheerioHTML(
      'link[rel="icon"], link[rel="shortcut icon"], link[rel="apple-touch-icon"]'
    );
    if (iconLinks.length === 0) {
      return fallback;
    }

    const href = cheerioHTML(iconLinks[0]).attr("href")?.trim();
    if (!href) {
      return fallback;
    }

    try {
      // Standard URL resolution handles absolute, root-relative,
      // path-relative, protocol-relative ("//cdn...") and data: hrefs alike;
      // prefix checks on the raw string get the last three wrong.
      return new URL(href, pageUrl).href;
    } catch {
      // A malformed href should not sink the whole metadata extraction.
      return fallback;
    }
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Error extracting favicon: ${reason}`);
  }
}

/**
 * Returns the `src` of the first `<img>` whose `width` and `height`
 * attributes both parse to more than 200, or `undefined` when there is none.
 * Images without explicit dimensions are treated as 0x0. The `src` value is
 * returned exactly as written in the document.
 */
function findLargeImageInHtml(cheerioHTML: cheerio.Root): string | undefined {
  const largeImages = cheerioHTML("img").filter((_, img) => {
    const width = parseInt(cheerioHTML(img).attr("width") || "0", 10);
    const height = parseInt(cheerioHTML(img).attr("height") || "0", 10);
    return width > 200 && height > 200;
  });

  if (largeImages.length > 0) {
    return cheerioHTML(largeImages[0]).attr("src") || undefined;
  }

  return undefined;
}
/**
 * Builds a flat metadata record for an HTML document.
 *
 * Values are gathered in this order, later steps overwriting earlier keys:
 * 1. When `url` is given: `url`, `domain` and `favicon`.
 * 2. `title` from the `<title>` element.
 * 3. Every collected `<meta>` tag (these may therefore override the keys set
 *    in steps 1 and 2).
 * 4. `image` from the first large `<img>`, only if still unset.
 * 5. `description` from the first paragraph, only if still unset.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL the HTML was fetched from, if known.
 * @returns The collected metadata; keys with no value are simply absent.
 * @throws Whatever the domain/favicon helpers throw for an invalid `url`.
 */
export async function extractMetadata(
  html: string,
  url?: string
): Promise<Record<string, string>> {
  const cheerioHTML = cheerio.load(html);
  const metadata: Record<string, string> = {};

  if (url) {
    metadata.url = url;
    metadata.domain = extractDomainFromUrl(url);
    metadata.favicon = extractFaviconFromHtml(cheerioHTML, url);
  }

  const title = extractTitleFromHtml(cheerioHTML);
  if (title) {
    metadata.title = title;
  }

  // Meta tags are merged last among the primary sources so that e.g.
  // `og:title` takes precedence over the plain <title> text.
  const metaTags = extractMetaTagsFromHtml(cheerioHTML);
  Object.assign(metadata, metaTags);

  if (!metadata.image) {
    const largeImage = findLargeImageInHtml(cheerioHTML);
    if (largeImage) {
      metadata.image = largeImage;
    }
  }

  if (!metadata.description) {
    const firstParagraph = extractFirstParagraphFromHtml(cheerioHTML);
    if (firstParagraph) {
      metadata.description = firstParagraph;
    }
  }

  return metadata;
}
/**
 * Runs `transformBookmarkBlock` over a block tree, descending into each
 * result's `blocks` children.
 *
 * Sibling blocks are transformed concurrently; the output keeps the input
 * order. The input tree is not modified: a block whose children are processed
 * is returned as a shallow copy carrying the new `blocks` array.
 *
 * @param blocks - Top-level blocks to process.
 * @param options - `meta` and `fields` are forwarded unchanged to
 *   `transformBookmarkBlock` for every block.
 * @returns The processed blocks, in the same order and nesting as the input.
 */
export async function processBlocks(
  blocks: Block[],
  options: {
    meta?: boolean;
    fields?: string[];
  } = {}
): Promise<Block[]> {
  // One shallow copy shared by every call below, detached from the
  // module-level default table.
  const preprocessors = { ...defaultPreprocessors };

  const processLevel = (level: Block[]): Promise<Block[]> =>
    Promise.all(
      level.map(async (block): Promise<Block> => {
        const transformed = await transformBookmarkBlock(block, {
          meta: options.meta,
          preprocessors,
          fields: options.fields,
        });

        if (!transformed.blocks) {
          return transformed as Block;
        }

        // Non-bookmark blocks come back as the caller's own object, so attach
        // the processed children to a copy rather than assigning onto it.
        return {
          ...transformed,
          blocks: await processLevel(transformed.blocks),
        } as Block;
      })
    );

  return processLevel(blocks);
}
/**
 * Picks the default bookmark fields that are present on the original
 * bookmark payload.
 *
 * Only string values are copied; `undefined`, `null` and other non-string
 * values are left out so every entry in the result really is a string.
 *
 * @param originalBookmark - The `bookmark` payload of a bookmark block.
 * @returns A record containing the subset of `DEFAULT_BOOKMARK_FIELDS` found.
 */
export function setBasicMetadata(
  originalBookmark: OriginalBookmark
): Record<string, string> {
  const basic: Record<string, string> = {};

  for (const field of DEFAULT_BOOKMARK_FIELDS) {
    const value = originalBookmark[field];
    if (typeof value === "string") {
      basic[field] = value;
    }
  }

  return basic;
}

/**
 * Converts a `bookmark` block into a `notionpresso_bookmark` block.
 *
 * Blocks of any other type are returned as-is (same object). For bookmark
 * blocks:
 * - without `options.meta`, or when the bookmark has no URL, the result
 *   carries only the basic metadata taken from the block itself;
 * - otherwise the URL is fetched, metadata is extracted and run through
 *   `applyPreprocessors`, and that processed metadata is used;
 * - if fetching or extraction fails, the result falls back to the basic
 *   metadata and records the failure message in `error`.
 *
 * @param block - Block to transform.
 * @param options - `meta` enables fetching; `preprocessors` and `fields` are
 *   passed through to `applyPreprocessors`.
 * @returns The original block, or a new bookmark block built from it.
 */
export async function transformBookmarkBlock(
  block: Block,
  options: {
    meta?: boolean;
    preprocessors?: Record<string, PreprocessorFn>;
    fields?: string[];
  } = {}
): Promise<Block | NotionpressoBookmarkBlock> {
  if (block.type !== "bookmark") {
    return block;
  }

  const originalBookmark = { ...block.bookmark } as OriginalBookmark;
  const basicMetadata = setBasicMetadata(originalBookmark);

  if (!options.meta || !originalBookmark.url) {
    return createBookmarkBlock(block, basicMetadata, originalBookmark.caption);
  }

  try {
    const html = await fetchUrl(originalBookmark.url);
    const metadata = await extractMetadata(html, originalBookmark.url);
    const processed = applyPreprocessors(
      metadata,
      options.preprocessors,
      options.fields
    );

    return createBookmarkBlock(block, processed, originalBookmark.caption);
  } catch (error) {
    // A failed fetch must not lose the block: keep the basic metadata and
    // surface the reason. Narrow first, since non-Error values can be thrown.
    const reason = error instanceof Error ? error.message : String(error);
    return createBookmarkBlock(
      block,
      basicMetadata,
      originalBookmark.caption,
      reason
    );
  }
}
/**
 * Removes everything that looks like an HTML tag (`<` up to the next `>`).
 * Entities are not decoded and text between tags is left untouched.
 */
export const stripHtml = (html: string): string => html.replace(/<[^>]*>/g, "");

/**
 * Returns the hostname of `url` without a leading `www.`.
 *
 * Accepts both full URLs ("https://www.example.com/path") and bare hostnames
 * ("www.example.com"). The latter matters because the `domain` metadata value
 * this is applied to is already a hostname, which `new URL` rejects on its
 * own. When the input cannot be parsed either way it is returned unchanged.
 *
 * @param url - URL or hostname to reduce to a domain.
 * @returns The hostname minus any `www.` prefix, or the input itself when it
 *   is not parseable.
 */
export const extractDomain = (url: string): string => {
  const stripWww = (hostname: string): string => hostname.replace(/^www\./, "");
  const candidate = url.trim();

  try {
    return stripWww(new URL(candidate).hostname);
  } catch {
    // Not an absolute URL; fall through and retry as a scheme-less host.
  }

  try {
    return stripWww(new URL(`https://${candidate}`).hostname);
  } catch {
    return url;
  }
};
+83,11 @@ export class Client extends _Client { } } -export type Block = BlockObjectResponse & { blocks: Block[] }; +export type Block = (BlockObjectResponse | NotionpressoBookmarkBlock) & { + blocks: Block[]; +}; export type ContentfulPage = PageObjectResponse & { blocks: Block[] }; -export type QueryDatabaseResults = QueryDatabaseResponse['results']; +export type QueryDatabaseResults = QueryDatabaseResponse["results"]; export { ClientOptions }; +export * as bookmarkPreprocessors from "./bookmark/preprocessors.js"; export default Client;