-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmetadata.ts
More file actions
64 lines (55 loc) · 1.95 KB
/
metadata.ts
File metadata and controls
64 lines (55 loc) · 1.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import * as cheerio from "cheerio";
import { execFile } from "node:child_process";
import { promisify } from "node:util";
// Promise-returning wrapper around child_process.execFile, so it can be awaited.
const execFileAsync = promisify(execFile);
// Optional proxy URL taken from the PROXY_URL environment variable (undefined
// when unset). When set, getDescription passes it to curl via `-x`.
const PROXY_URL = process.env["PROXY_URL"];
/** Result returned by getDescription for a single URL. */
export interface DescriptionResult {
/**
 * Description text extracted from the fetched page, or null when no usable
 * value was found or the fetch/parse failed.
 */
description: string | null;
/**
 * Content of the page's `description` meta tag (null when absent or empty).
 * Only populated for Instagram URLs, where that tag holds an AI-generated
 * summary rather than the actual caption.
 */
aiDescription?: string | null;
}
/**
 * Fetches `url` with curl and extracts a description from the returned HTML.
 *
 * Site-specific handling (selected by substring match on `url`):
 * - Instagram: the caption is parsed out of the `og:title` meta tag and the
 *   `description` meta tag is returned separately as `aiDescription`.
 * - TikTok: the `og:description` meta tag is used, and the request is sent
 *   with the `facebookexternalhit/1.1` user agent.
 * - Anything else: the `description` meta tag is used.
 *
 * The request goes through `PROXY_URL` when that is set, and curl's output is
 * capped at 10 MiB.
 *
 * @param url - Address of the page to fetch.
 * @returns The extracted description(s). Never rejects: any failure is logged
 *   with `console.warn` and reported as `{ description: null }`.
 */
export async function getDescription(url: string): Promise<DescriptionResult> {
  try {
    const isInstagram = url.includes("instagram.com");
    const isTikTok = url.includes("tiktok.com");

    const proxyArgs = PROXY_URL ? ["-x", PROXY_URL] : [];
    // https://perishablepress.com/list-all-user-agents-top-search-engines/
    const userAgent = isTikTok
      ? "facebookexternalhit/1.1"
      : "Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)";
    const userAgentArgs = ["-A", userAgent];

    const html = await execFileAsync(
      "curl",
      // "--" ends curl's option parsing, so a url that starts with "-" is
      // always treated as the URL and can never be read as a curl flag.
      ["-s", ...proxyArgs, ...userAgentArgs, "--", url],
      {
        maxBuffer: 1024 * 1024 * 10,
      },
    );
    const $ = cheerio.load(html.stdout);

    // For Instagram, extract the caption from og:title since the description
    // meta tag contains AI-generated summaries instead of the actual caption.
    if (isInstagram) {
      const ogTitle = $("meta[property='og:title']").attr("content");
      // og:title format: `Username on Instagram: “caption”`. The caption is
      // wrapped in curly quotes (U+201C / U+201D); straight quotes are
      // accepted as well. The `s` flag lets `.` span multi-line captions.
      const match = ogTitle?.match(
        /on Instagram: ["\u201C](.*)["\u201D]$/s,
      );
      const aiDescription =
        $("meta[name='description']").attr("content") || null;
      return { description: match?.[1] || null, aiDescription };
    }

    // For TikTok, extract from og:description since there's no standard
    // description meta tag.
    if (isTikTok) {
      return {
        description:
          $("meta[property='og:description']").attr("content") || null,
      };
    }

    return {
      description: $("meta[name='description']").attr("content") || null,
    };
  } catch (e) {
    // Best-effort lookup: report the failure but never throw to the caller.
    console.warn(`Failed to get description for ${url}:`, e);
    return { description: null };
  }
}