diff --git a/lib/__tests__/youtube-transcript-provider.test.ts b/lib/__tests__/youtube-transcript-provider.test.ts index f256c58..66ded7d 100644 --- a/lib/__tests__/youtube-transcript-provider.test.ts +++ b/lib/__tests__/youtube-transcript-provider.test.ts @@ -2,11 +2,8 @@ import test from 'node:test'; import assert from 'node:assert/strict'; import { - buildCaptionTrackCandidates, - extractCaptionTracksFromWatchHtml, fetchYouTubeTranscript, - transformCaptionJsonToSegments, - transformCaptionXmlToSegments, + TranscriptProviderError, } from '../youtube-transcript-provider'; function withMockFetch( @@ -21,124 +18,177 @@ function withMockFetch( }); } -test('extractCaptionTracksFromWatchHtml returns caption tracks from player response', () => { - const html = ` - - - - - - `; - - const tracks = extractCaptionTracksFromWatchHtml(html); - - assert.deepEqual(tracks, [ - { - baseUrl: 'https://example.com/en', - languageCode: 'en', - kind: undefined, - name: 'English', - }, - { - baseUrl: 'https://example.com/fr-auto', - languageCode: 'fr', - kind: 'asr', - name: 'Francais', +// Minimal YouTube watch page HTML with INNERTUBE_API_KEY embedded +// (our provider scrapes this before calling InnerTube) +const FAKE_WATCH_PAGE = ` + +`; + +test('fetchYouTubeTranscript returns transcript when Android client succeeds', async () => { + await withMockFetch( + async (input, init) => { + const url = typeof input === 'string' ? input : input.toString(); + + // Page scrape request + if (url.includes('youtube.com/watch')) { + return new Response(FAKE_WATCH_PAGE); + } + + // InnerTube player request — return caption tracks + if (url.includes('/youtubei/v1/player')) { + return new Response(JSON.stringify({ + playabilityStatus: { status: 'OK' }, + captions: { + playerCaptionsTracklistRenderer: { + captionTracks: [ + { + baseUrl: 'https://captions.test/en', + languageCode: 'en', + name: { simpleText: 'English' }, + }, + { + baseUrl: 'https://captions.test/fr', + languageCode: 'fr', + name: { simpleText: 'Francais' }, + kind: 'asr', + }, + ], + }, + }, + })); + } + + // Caption track fetch — return XML with

format (milliseconds) + if (url.startsWith('https://captions.test/en')) { + return new Response(` +

hello & welcome

+

'quoted'

+ `); + } + + throw new Error(`Unexpected fetch URL: ${url}`); }, - ]); -}); + async () => { + const result = await fetchYouTubeTranscript('video123'); -test('buildCaptionTrackCandidates prioritizes requested language and manual tracks', () => { - const tracks = [ - { baseUrl: 'https://example.com/en-auto', languageCode: 'en', kind: 'asr', name: 'English auto' }, - { baseUrl: 'https://example.com/fr-auto', languageCode: 'fr', kind: 'asr', name: 'Francais auto' }, - { baseUrl: 'https://example.com/fr', languageCode: 'fr', kind: undefined, name: 'Francais' }, - { baseUrl: 'https://example.com/de', languageCode: 'de', kind: undefined, name: 'Deutsch' }, - ]; - - const candidates = buildCaptionTrackCandidates(tracks, 'fr'); - - assert.deepEqual( - candidates.map((track) => track.baseUrl), - [ - 'https://example.com/fr', - 'https://example.com/fr-auto', - 'https://example.com/en-auto', - 'https://example.com/de', - ] + assert.ok(result, 'Should return a result'); + assert.equal(result.language, 'en'); + assert.deepEqual(result.availableLanguages, ['en', 'fr']); + assert.equal(result.segments.length, 2); + assert.equal(result.segments[0].text, 'hello & welcome'); + assert.equal(result.segments[0].start, 0.42); + assert.equal(result.segments[0].duration, 4.2); + assert.equal(result.segments[1].text, "'quoted'"); + } ); }); -test('transformCaptionJsonToSegments decodes entities and ignores empty events', () => { - const segments = transformCaptionJsonToSegments({ - events: [ - { - tStartMs: 1500, - dDurationMs: 2500, - segs: [{ utf8: 'Hello & ' }, { utf8: 'welcome' }], - }, - { - tStartMs: 4000, - dDurationMs: 1000, - }, - { - tStartMs: 5000, - dDurationMs: 1250, - segs: [{ utf8: ''quoted'' }], - }, - ], - }); +test('fetchYouTubeTranscript prefers requested language', async () => { + await withMockFetch( + async (input) => { + const url = typeof input === 'string' ? input : input.toString(); - assert.deepEqual(segments, [ - { - text: 'Hello & welcome', - start: 1.5, - duration: 2.5, - }, - { - text: "'quoted'", - start: 5, - duration: 1.25, + if (url.includes('youtube.com/watch')) { + return new Response(FAKE_WATCH_PAGE); + } + + if (url.includes('/youtubei/v1/player')) { + return new Response(JSON.stringify({ + playabilityStatus: { status: 'OK' }, + captions: { + playerCaptionsTracklistRenderer: { + captionTracks: [ + { + baseUrl: 'https://captions.test/en', + languageCode: 'en', + name: { simpleText: 'English' }, + }, + { + baseUrl: 'https://captions.test/fr', + languageCode: 'fr', + name: { simpleText: 'Francais' }, + }, + ], + }, + }, + })); + } + + // Should request French since we asked for it + if (url.startsWith('https://captions.test/fr')) { + return new Response(` +

bonjour

+
`); + } + + if (url.startsWith('https://captions.test/en')) { + return new Response(` +

hello

+
`); + } + + throw new Error(`Unexpected fetch URL: ${url}`); }, - ]); + async () => { + const result = await fetchYouTubeTranscript('video123', 'fr'); + + assert.ok(result); + assert.equal(result.language, 'fr'); + assert.equal(result.segments[0].text, 'bonjour'); + } + ); }); -test('transformCaptionXmlToSegments parses youtube timedtext xml', () => { - const xml = `hello & welcome'quoted'`; +test('fetchYouTubeTranscript returns null when video has no captions', async () => { + await withMockFetch( + async (input) => { + const url = typeof input === 'string' ? input : input.toString(); - const segments = transformCaptionXmlToSegments(xml); + if (url.includes('youtube.com/watch')) { + return new Response(FAKE_WATCH_PAGE); + } - assert.deepEqual(segments, [ - { - text: 'hello & welcome', - start: 0.42, - duration: 4.2, - }, - { - text: "'quoted'", - start: 5.1, - duration: 1.5, + if (url.includes('/youtubei/v1/player')) { + // No captions object at all + return new Response(JSON.stringify({ + playabilityStatus: { status: 'OK' }, + })); + } + + throw new Error(`Unexpected fetch URL: ${url}`); }, - ]); + async () => { + const result = await fetchYouTubeTranscript('video123'); + assert.equal(result, null); + } + ); }); -test('fetchYouTubeTranscript preserves an explicitly requested language', async () => { +test('fetchYouTubeTranscript tries next client when one is rate-limited', async () => { + let innerTubeCallCount = 0; + await withMockFetch( async (input) => { const url = typeof input === 'string' ? input : input.toString(); + if (url.includes('youtube.com/watch')) { + return new Response(FAKE_WATCH_PAGE); + } + if (url.includes('/youtubei/v1/player')) { + innerTubeCallCount++; + // First call (Android) returns 429 rate limit + if (innerTubeCallCount === 1) { + return new Response('Too Many Requests', { status: 429 }); + } + // Second call (Web) succeeds return new Response(JSON.stringify({ + playabilityStatus: { status: 'OK' }, captions: { playerCaptionsTracklistRenderer: { captionTracks: [ - { - baseUrl: 'https://captions.test/fr', - languageCode: 'fr', - name: { simpleText: 'Francais' }, - }, { baseUrl: 'https://captions.test/en', languageCode: 'en', @@ -150,39 +200,37 @@ test('fetchYouTubeTranscript preserves an explicitly requested language', async })); } - if (url === 'https://captions.test/fr') { - return new Response('bonjour'); - } - - if (url === 'https://captions.test/en') { - return new Response('hello'); + if (url.startsWith('https://captions.test/en')) { + return new Response(` +

hello from fallback

+
`); } throw new Error(`Unexpected fetch URL: ${url}`); }, async () => { - const result = await fetchYouTubeTranscript('video123', 'fr', 1200); - - assert.equal(result?.language, 'fr'); - assert.deepEqual(result?.availableLanguages, ['fr', 'en']); - assert.deepEqual(result?.segments, [ - { - text: 'bonjour', - start: 0, - duration: 1, - }, - ]); + const result = await fetchYouTubeTranscript('video123'); + + assert.ok(result, 'Should succeed via fallback client'); + assert.equal(result.segments[0].text, 'hello from fallback'); + // Should have tried at least 2 InnerTube calls (Android failed, Web succeeded) + assert.ok(innerTubeCallCount >= 2, `Expected >= 2 InnerTube calls, got ${innerTubeCallCount}`); } ); }); -test('fetchYouTubeTranscript throws when caption tracks exist but all fetches fail', async () => { +test('fetchYouTubeTranscript parses legacy XML format', async () => { await withMockFetch( async (input) => { const url = typeof input === 'string' ? input : input.toString(); + if (url.includes('youtube.com/watch')) { + return new Response(FAKE_WATCH_PAGE); + } + if (url.includes('/youtubei/v1/player')) { return new Response(JSON.stringify({ + playabilityStatus: { status: 'OK' }, captions: { playerCaptionsTracklistRenderer: { captionTracks: [ @@ -197,14 +245,25 @@ test('fetchYouTubeTranscript throws when caption tracks exist but all fetches fa })); } - if (url === 'https://captions.test/en') { - return new Response('', { status: 500 }); + // Return legacy XML format (seconds, tags) + if (url.startsWith('https://captions.test/en')) { + return new Response(` + + hello & welcome + goodbye + `); } throw new Error(`Unexpected fetch URL: ${url}`); }, async () => { - await assert.rejects(() => fetchYouTubeTranscript('video123')); + const result = await fetchYouTubeTranscript('video123'); + + assert.ok(result); + assert.equal(result.segments.length, 2); + assert.equal(result.segments[0].text, 'hello & welcome'); + assert.equal(result.segments[0].start, 0.42); + assert.equal(result.segments[0].duration, 4.2); } ); }); diff --git a/lib/youtube-transcript-provider.ts b/lib/youtube-transcript-provider.ts index 2a75142..018d140 100644 --- a/lib/youtube-transcript-provider.ts +++ b/lib/youtube-transcript-provider.ts @@ -1,72 +1,101 @@ -interface CaptionTrackName { - simpleText?: string; - runs?: Array<{ text?: string }>; -} - -interface CaptionTrackRenderer { - baseUrl?: string; - languageCode?: string; - kind?: string; - name?: CaptionTrackName; -} - -interface CaptionTrackListRenderer { - captionTracks?: CaptionTrackRenderer[]; -} - -interface PlayerResponse { - captions?: { - playerCaptionsTracklistRenderer?: CaptionTrackListRenderer; - }; -} - -interface CaptionEvent { - tStartMs?: number; - dDurationMs?: number; - segs?: Array<{ utf8?: string }>; -} - -export interface CaptionTrack { - baseUrl: string; - languageCode: string; - kind?: string; - name: string; -} - -export interface CaptionJsonResponse { - events?: CaptionEvent[]; -} - +/** + * YouTube Transcript Provider — fetches captions directly from YouTube + * without needing a paid API key. + * + * HOW IT WORKS: + * 1. Scrapes the YouTube watch page to extract YouTube's own internal API key + * (this is a public key embedded in every YouTube page, not a personal key) + * 2. Uses that key to call YouTube's InnerTube Player API, pretending to be + * a legitimate YouTube client (Android app, web browser, or iOS app) + * 3. The API response includes URLs to download the actual caption tracks + * 4. Downloads and parses the caption XML into transcript segments + * + * WHY MULTIPLE CLIENTS: + * YouTube actively blocks automated requests from server IPs. Different client + * identities have different bot-detection thresholds. If one gets blocked, + * we try the next. Think of it as having three disguises. + * + * FALLBACK CHAIN: Android → Web → iOS + * + * Based on the approach from github.com/JimLiu/baoyu-skills + */ + +// ─── Types ────────────────────────────────────────────────────────────────── + +/** Result returned to the transcript route — matches the existing interface */ export interface TranscriptFetchResult { segments: { text: string; start: number; duration: number }[]; language?: string; availableLanguages: string[]; } +/** What went wrong — helps us decide whether to retry with a different client */ +export type TranscriptErrorCode = + | 'BOT_DETECTED' // YouTube thinks we're a bot + | 'AGE_RESTRICTED' // Video needs login for age verification + | 'VIDEO_UNAVAILABLE' // Deleted, private, or region-locked + | 'TRANSCRIPTS_DISABLED' // Video has no captions at all + | 'NO_TRANSCRIPT' // Requested language not available + | 'IP_BLOCKED' // Rate limited (429) or reCAPTCHA + | 'PAGE_FETCH_FAILED' // Couldn't extract API key from page HTML + | 'INNERTUBE_REJECTED' // InnerTube API returned an error + | 'CAPTION_FETCH_FAILED' // Got caption URL but couldn't download it + | 'UNKNOWN'; + export class TranscriptProviderError extends Error { - constructor(message: string) { + code: TranscriptErrorCode; + constructor(code: TranscriptErrorCode, message: string) { super(message); this.name = 'TranscriptProviderError'; + this.code = code; } } -const INNERTUBE_PLAYER_URL = 'https://www.youtube.com/youtubei/v1/player?prettyPrint=false'; -const ANDROID_CLIENT_VERSION = '20.10.38'; -const ANDROID_USER_AGENT = `com.google.android.youtube/${ANDROID_CLIENT_VERSION} (Linux; U; Android 14)`; +// ─── Client Identities ───────────────────────────────────────────────────── +// Each identity mimics a different YouTube client. YouTube's bot detection +// treats them differently, so if one is blocked, another might work. -const PLAYER_RESPONSE_MARKERS = [ - 'var ytInitialPlayerResponse =', - 'ytInitialPlayerResponse =', - 'window["ytInitialPlayerResponse"] =', - 'window[\'ytInitialPlayerResponse\'] =', +interface ClientIdentity { + name: string; + clientName: string; + clientVersion: string; + userAgent: string; + // Some clients need an API key from the page, others use a hardcoded one + apiKey?: string; +} + +const CLIENTS: ClientIdentity[] = [ + { + name: 'Android', + clientName: 'ANDROID', + clientVersion: '20.10.38', + userAgent: 'com.google.android.youtube/20.10.38 (Linux; U; Android 14; en_US; Pixel 8 Pro Build/UD1A.231105.004) gzip', + apiKey: 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', + }, + { + name: 'Web', + clientName: 'WEB', + clientVersion: '2.20250326.00.00', + userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36', + }, + { + name: 'iOS', + clientName: 'IOS', + clientVersion: '20.10.4', + userAgent: 'com.google.ios.youtube/20.10.4 (iPhone16,2; U; CPU iOS 18_3_2 like Mac OS X)', + apiKey: 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc', + }, ]; +// ─── HTML Entity Decoding ─────────────────────────────────────────────────── + const NAMED_HTML_ENTITIES: Record = { '&': '&', '<': '<', '>': '>', '"': '"', ''': "'", + ''': "'", ' ': ' ', }; @@ -74,443 +103,479 @@ function decodeHtmlEntities(text: string): string { return text .replace(/&#x([0-9a-f]+);/gi, (_, hex: string) => String.fromCodePoint(parseInt(hex, 16))) .replace(/&#(\d+);/g, (_, decimal: string) => String.fromCodePoint(parseInt(decimal, 10))) - .replace(/&(amp|lt|gt|quot|nbsp);|'/g, (entity) => NAMED_HTML_ENTITIES[entity] ?? entity); + .replace(/&(amp|lt|gt|quot|apos|nbsp);|'/g, (entity) => NAMED_HTML_ENTITIES[entity] ?? entity); } -function normalizeLanguageCode(code: string): string { - return code.trim().toLowerCase(); -} +// ─── Page Scraping ────────────────────────────────────────────────────────── +// We need to scrape the YouTube watch page to get the InnerTube API key and +// client version. These are embedded in the page's JavaScript. -function getLanguageRoot(code: string): string { - return normalizeLanguageCode(code).split(/[-_]/)[0] ?? normalizeLanguageCode(code); +interface PageData { + apiKey: string; + clientVersion: string; + visitorData: string; } -function isManualTrack(track: CaptionTrack): boolean { - return track.kind !== 'asr'; -} +/** + * Fetches the YouTube watch page and extracts the internal API credentials. + * Also handles the EU cookie consent page (YouTube shows a consent form + * instead of the real page if you don't have cookies). + */ +async function scrapeWatchPage(videoId: string): Promise { + const url = `https://www.youtube.com/watch?v=${videoId}`; + const headers: Record = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36', + 'Accept-Language': 'en-US,en;q=0.9', + }; -function extractTrackName(name?: CaptionTrackName): string { - if (!name) return 'Unknown'; - if (typeof name.simpleText === 'string' && name.simpleText.trim()) { - return name.simpleText.trim(); + let html: string; + try { + const resp = await fetch(url, { headers, redirect: 'follow' }); + html = await resp.text(); + } catch (err) { + throw new TranscriptProviderError('PAGE_FETCH_FAILED', `Failed to fetch YouTube page: ${err}`); } - const combined = name.runs - ?.map((run) => run.text?.trim() ?? '') - .join('') - .trim(); - - return combined || 'Unknown'; -} - -function extractJsonObjectAfterMarker(html: string, marker: string): string | null { - const markerIndex = html.indexOf(marker); - if (markerIndex === -1) return null; - - const objectStart = html.indexOf('{', markerIndex + marker.length); - if (objectStart === -1) return null; - - let depth = 0; - let inString = false; - let isEscaped = false; - - for (let index = objectStart; index < html.length; index++) { - const character = html[index]; - - if (inString) { - if (isEscaped) { - isEscaped = false; - continue; - } - - if (character === '\\') { - isEscaped = true; - continue; - } - - if (character === '"') { - inString = false; - } - - continue; - } - - if (character === '"') { - inString = true; - continue; - } - - if (character === '{') { - depth += 1; - continue; - } - - if (character === '}') { - depth -= 1; - - if (depth === 0) { - return html.slice(objectStart, index + 1); + // Handle EU cookie consent — YouTube redirects to a consent page + // If we detect it, we extract the consent token and re-fetch with a cookie + if (html.includes('action="https://consent.youtube.com/s"')) { + console.log('[YT-TRANSCRIPT] Handling EU cookie consent redirect'); + const consentMatch = html.match(/name="v" value="(.*?)"/); + if (consentMatch) { + const consentValue = consentMatch[1]; + try { + const resp2 = await fetch(url, { + headers: { + ...headers, + 'Cookie': `CONSENT=YES+${consentValue}`, + }, + redirect: 'follow', + }); + html = await resp2.text(); + } catch { + // If consent retry fails, continue with the original HTML } } } - return null; -} + // Extract the three values we need from the page's JavaScript + const apiKeyMatch = html.match(/"INNERTUBE_API_KEY"\s*:\s*"([^"]+)"/); + const clientVersionMatch = html.match(/"INNERTUBE_CLIENT_VERSION"\s*:\s*"([^"]+)"/); + const visitorDataMatch = html.match(/"VISITOR_DATA"\s*:\s*"([^"]+)"/); -function extractPlayerResponse(html: string): PlayerResponse | null { - for (const marker of PLAYER_RESPONSE_MARKERS) { - const jsonText = extractJsonObjectAfterMarker(html, marker); - if (!jsonText) continue; - - try { - return JSON.parse(jsonText) as PlayerResponse; - } catch { - continue; + if (!apiKeyMatch) { + // Check if video exists at all + if (html.includes('"playabilityStatus":{"status":"ERROR"')) { + throw new TranscriptProviderError('VIDEO_UNAVAILABLE', 'Video is unavailable'); } - } - - return null; -} - -function dedupeTracks(tracks: CaptionTrack[]): CaptionTrack[] { - const seen = new Set(); - - return tracks.filter((track) => { - const key = `${track.languageCode}:${track.kind ?? 'manual'}:${track.baseUrl}`; - if (seen.has(key)) { - return false; + if (html.includes('Sign in to confirm your age') || html.includes('"LOGIN_REQUIRED"')) { + throw new TranscriptProviderError('AGE_RESTRICTED', 'Video is age-restricted'); } - - seen.add(key); - return true; - }); -} - -function mapCaptionTracks(rawTracks: CaptionTrackRenderer[] | undefined): CaptionTrack[] { - if (!Array.isArray(rawTracks) || rawTracks.length === 0) { - return []; + throw new TranscriptProviderError('PAGE_FETCH_FAILED', 'Could not extract INNERTUBE_API_KEY from page'); } - return dedupeTracks( - rawTracks.flatMap((track) => { - if (typeof track.baseUrl !== 'string' || typeof track.languageCode !== 'string') { - return []; - } - - return [{ - baseUrl: track.baseUrl, - languageCode: track.languageCode, - kind: typeof track.kind === 'string' ? track.kind : undefined, - name: extractTrackName(track.name), - }]; - }) - ); + return { + apiKey: apiKeyMatch[1], + clientVersion: clientVersionMatch?.[1] || '2.20250326.00.00', + visitorData: visitorDataMatch?.[1] || '', + }; } -function extractCaptionTracksFromPlayerResponse(playerResponse: PlayerResponse | null): CaptionTrack[] { - return mapCaptionTracks(playerResponse?.captions?.playerCaptionsTracklistRenderer?.captionTracks); -} +// ─── InnerTube API ────────────────────────────────────────────────────────── -export function extractCaptionTracksFromWatchHtml(html: string): CaptionTrack[] { - const playerResponse = extractPlayerResponse(html); - return extractCaptionTracksFromPlayerResponse(playerResponse); +interface CaptionTrack { + baseUrl: string; + languageCode: string; + name: string; + kind?: string; // "asr" = auto-generated } -function getTrackPriority(track: CaptionTrack, preferredLanguage?: string): number { - let score = isManualTrack(track) ? 10 : 0; - - if (!preferredLanguage) { - return getLanguageRoot(track.languageCode) === 'en' ? score + 100 : score; +/** + * Calls YouTube's InnerTube Player API to get video metadata including caption tracks. + * This is the same API that YouTube's own apps use internally. + */ +async function fetchInnerTubePlayer( + videoId: string, + client: ClientIdentity, + pageData: PageData | null +): Promise { + // Use the client's hardcoded API key, or the one scraped from the page + const apiKey = client.apiKey || pageData?.apiKey; + if (!apiKey) { + throw new TranscriptProviderError('PAGE_FETCH_FAILED', `No API key available for ${client.name} client`); } - const normalizedPreferredLanguage = normalizeLanguageCode(preferredLanguage); - const normalizedTrackLanguage = normalizeLanguageCode(track.languageCode); + const endpoint = `https://www.youtube.com/youtubei/v1/player?key=${apiKey}`; + + // Build the request body — mimics what YouTube's own clients send + const body: Record = { + videoId, + context: { + client: { + clientName: client.clientName, + clientVersion: client.clientVersion, + userAgent: client.userAgent, + hl: 'en', + gl: 'US', + ...(pageData?.visitorData ? { visitorData: pageData.visitorData } : {}), + }, + }, + }; - if (normalizedTrackLanguage === normalizedPreferredLanguage) { - score += 200; - } else if (getLanguageRoot(track.languageCode) === getLanguageRoot(preferredLanguage)) { - score += 150; - } else if (getLanguageRoot(track.languageCode) === 'en') { - score += 100; + // Android and iOS clients need a "content check OK" flag + if (client.clientName === 'ANDROID' || client.clientName === 'IOS') { + body.contentCheckOk = true; + body.racyCheckOk = true; } - return score; -} - -export function buildCaptionTrackCandidates( - tracks: CaptionTrack[], - preferredLanguage?: string -): CaptionTrack[] { - return [...tracks] - .map((track, index) => ({ track, index })) - .sort((left, right) => { - const priorityDifference = getTrackPriority(right.track, preferredLanguage) - getTrackPriority(left.track, preferredLanguage); - if (priorityDifference !== 0) { - return priorityDifference; - } + const headers: Record = { + 'Content-Type': 'application/json', + 'User-Agent': client.userAgent, + }; - return left.index - right.index; - }) - .map(({ track }) => track); -} + let response: Response; + try { + response = await fetch(endpoint, { + method: 'POST', + headers, + body: JSON.stringify(body), + }); + } catch (err) { + throw new TranscriptProviderError('INNERTUBE_REJECTED', `InnerTube request failed for ${client.name}: ${err}`); + } -export function transformCaptionJsonToSegments(payload: CaptionJsonResponse): { text: string; start: number; duration: number }[] { - if (!Array.isArray(payload.events) || payload.events.length === 0) { - return []; + if (response.status === 429) { + throw new TranscriptProviderError('IP_BLOCKED', `Rate limited (429) with ${client.name} client`); } - return payload.events.flatMap((event) => { - if (!Array.isArray(event.segs) || event.segs.length === 0) { - return []; - } + if (!response.ok) { + throw new TranscriptProviderError('INNERTUBE_REJECTED', `InnerTube returned ${response.status} for ${client.name}`); + } - const text = decodeHtmlEntities( - event.segs - .map((segment) => segment.utf8 ?? '') - .join('') - .replace(/\n/g, ' ') - .trim() - ).trim(); + let data: Record; + try { + data = await response.json() as Record; + } catch { + throw new TranscriptProviderError('INNERTUBE_REJECTED', `Invalid JSON from InnerTube for ${client.name}`); + } - if (!text) { - return []; + // Check for playability errors + const playabilityStatus = data.playabilityStatus as Record | undefined; + if (playabilityStatus) { + const status = playabilityStatus.status as string; + if (status === 'ERROR' || status === 'UNPLAYABLE') { + throw new TranscriptProviderError('VIDEO_UNAVAILABLE', `Video is ${status.toLowerCase()}`); } - - return [{ - text, - start: (event.tStartMs ?? 0) / 1000, - duration: Math.max((event.dDurationMs ?? 0) / 1000, 0), - }]; - }); -} - -export function transformCaptionXmlToSegments(xmlText: string): { text: string; start: number; duration: number }[] { - const srvSegments: { text: string; start: number; duration: number }[] = []; - const srvParagraphRegex = /]*>([\s\S]*?)<\/p>/g; - let match: RegExpExecArray | null; - - while ((match = srvParagraphRegex.exec(xmlText)) !== null) { - const paragraphBody = match[3].replace(//gi, ' '); - const segmentMatches = [...paragraphBody.matchAll(/]*>([\s\S]*?)<\/s>/g)]; - const rawText = segmentMatches.length > 0 - ? segmentMatches.map((segment) => segment[1]).join('') - : paragraphBody.replace(/<[^>]+>/g, ''); - const text = decodeHtmlEntities(rawText).trim(); - - if (!text) { - continue; + if (status === 'LOGIN_REQUIRED') { + const reason = (playabilityStatus.reason as string) || ''; + if (reason.includes('age') || reason.includes('Sign in')) { + throw new TranscriptProviderError('AGE_RESTRICTED', 'Video is age-restricted'); + } + throw new TranscriptProviderError('BOT_DETECTED', `Login required: ${reason}`); } + } - srvSegments.push({ - text, - start: parseInt(match[1], 10) / 1000, - duration: parseInt(match[2], 10) / 1000, - }); + // Extract caption tracks from the response + const captions = data.captions as Record | undefined; + if (!captions) { + throw new TranscriptProviderError('TRANSCRIPTS_DISABLED', 'No captions object in InnerTube response'); } - if (srvSegments.length > 0) { - return srvSegments; + const tracklistRenderer = captions.playerCaptionsTracklistRenderer as Record | undefined; + if (!tracklistRenderer) { + throw new TranscriptProviderError('TRANSCRIPTS_DISABLED', 'No caption tracklist renderer'); } - const legacySegments: { text: string; start: number; duration: number }[] = []; - const legacyTextRegex = /([\s\S]*?)<\/text>/g; + const captionTracks = tracklistRenderer.captionTracks as Array> | undefined; + if (!captionTracks || captionTracks.length === 0) { + throw new TranscriptProviderError('TRANSCRIPTS_DISABLED', 'No caption tracks available'); + } - while ((match = legacyTextRegex.exec(xmlText)) !== null) { - const text = decodeHtmlEntities(match[3]).trim(); - if (!text) { - continue; - } + // Map raw tracks to our CaptionTrack interface + return captionTracks + .filter(t => typeof t.baseUrl === 'string' && typeof t.languageCode === 'string') + .map(t => { + // Extract track name from either simpleText or runs format + const nameObj = t.name as Record | undefined; + let name = 'Unknown'; + if (nameObj) { + if (typeof nameObj.simpleText === 'string') { + name = nameObj.simpleText; + } else if (Array.isArray(nameObj.runs)) { + name = (nameObj.runs as Array<{ text?: string }>) + .map(r => r.text || '') + .join(''); + } + } - legacySegments.push({ - text, - start: parseFloat(match[1]), - duration: parseFloat(match[2]), + return { + baseUrl: t.baseUrl as string, + languageCode: t.languageCode as string, + kind: typeof t.kind === 'string' ? t.kind : undefined, + name, + }; }); - } - - return legacySegments; } -function calculateTranscriptDuration(segments: { start: number; duration: number }[]): number { - if (segments.length === 0) { - return 0; +// ─── Caption XML Parsing ──────────────────────────────────────────────────── + +/** + * Parses YouTube's caption XML format into transcript segments. + * + * YouTube uses TWO different XML formats depending on the client/track: + * + * Format 1 (newer / fmt=3, used by InnerTube): + *

Hello world

+ * (timestamps in MILLISECONDS) + * + * Format 2 (older / fmt=1): + * Hello world + * (timestamps in SECONDS) + */ +function parseCaptionXml(xml: string): { text: string; start: number; duration: number }[] { + const segments: { text: string; start: number; duration: number }[] = []; + + // Helper to clean caption text + function cleanText(raw: string): string { + return decodeHtmlEntities( + raw + .replace(/<[^>]*>/g, '') // Strip nested HTML tags (e.g. , ) + .replace(/\n/g, ' ') + ).trim(); } - const lastSegment = segments[segments.length - 1]; - return lastSegment.start + lastSegment.duration; -} + // Try Format 1 first:

text

(milliseconds) + const pRegex = /]*>([\s\S]*?)<\/p>/g; + let match; + let foundP = false; -function dedupeLanguages(tracks: CaptionTrack[]): string[] { - const seen = new Set(); + while ((match = pRegex.exec(xml)) !== null) { + foundP = true; + // t and d are in milliseconds — divide by 1000 to get seconds + const start = (parseFloat(match[1]) || 0) / 1000; + const duration = (parseFloat(match[2]) || 0) / 1000; + const text = cleanText(match[3] || ''); - return tracks.flatMap((track) => { - const normalized = normalizeLanguageCode(track.languageCode); - if (seen.has(normalized)) { - return []; + if (text) { + segments.push({ text, start, duration }); } + } - seen.add(normalized); - return [track.languageCode]; - }); -} + if (foundP) return segments; -async function fetchWatchHtml(videoId: string): Promise { - const response = await fetch(`https://www.youtube.com/watch?v=${videoId}&hl=en&persist_hl=1`, { - headers: { - 'Accept-Language': 'en-US,en;q=0.9', - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36', - }, - cache: 'no-store', - }); + // Fallback to Format 2: text (seconds) + const textRegex = /]*>([\s\S]*?)<\/text>/g; - if (!response.ok) { - throw new Error(`Failed to load YouTube watch page (${response.status})`); + while ((match = textRegex.exec(xml)) !== null) { + const start = parseFloat(match[1]) || 0; + const duration = parseFloat(match[2]) || 0; + const text = cleanText(match[3] || ''); + + if (text) { + segments.push({ text, start, duration }); + } } - return response.text(); + return segments; } -async function fetchCaptionTracksFromInnerTube(videoId: string): Promise { - const response = await fetch(INNERTUBE_PLAYER_URL, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'User-Agent': ANDROID_USER_AGENT, - }, - body: JSON.stringify({ - context: { - client: { - clientName: 'ANDROID', - clientVersion: ANDROID_CLIENT_VERSION, - }, +/** + * Downloads a caption track from the given URL and parses it. + */ +async function fetchCaptionTrack(baseUrl: string): Promise<{ text: string; start: number; duration: number }[]> { + // Ensure we get XML format + const url = baseUrl.includes('fmt=') ? baseUrl : `${baseUrl}&fmt=3`; + + let response: Response; + try { + response = await fetch(url, { + headers: { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36', + 'Accept-Language': 'en-US,en;q=0.9', }, - videoId, - }), - }); + }); + } catch (err) { + throw new TranscriptProviderError('CAPTION_FETCH_FAILED', `Failed to download caption track: ${err}`); + } if (!response.ok) { - throw new Error(`Failed to load YouTube player data (${response.status})`); + throw new TranscriptProviderError('CAPTION_FETCH_FAILED', `Caption track returned ${response.status}`); } - const playerResponse = (await response.json()) as PlayerResponse; - return extractCaptionTracksFromPlayerResponse(playerResponse); + const xml = await response.text(); + return parseCaptionXml(xml); } -async function fetchTrackSegments(track: CaptionTrack): Promise<{ text: string; start: number; duration: number }[]> { - const response = await fetch(track.baseUrl, { - headers: { - 'Accept-Language': 'en-US,en;q=0.9', - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36,gzip(gfe)', - }, - cache: 'no-store', - }); +// ─── Language Selection ───────────────────────────────────────────────────── + +/** + * Picks the best caption track based on the user's language preference. + * + * Priority: + * 1. Manual (human-created) captions in the requested language + * 2. Auto-generated captions in the requested language + * 3. Manual captions in any language (prefer English) + * 4. Auto-generated captions in any language (prefer English) + */ +function selectBestTrack( + tracks: CaptionTrack[], + preferredLang?: string +): CaptionTrack | null { + if (tracks.length === 0) return null; + + // Separate manual vs auto-generated tracks + const manual = tracks.filter(t => t.kind !== 'asr'); + const auto = tracks.filter(t => t.kind === 'asr'); + + // Helper: find a track matching a language code + const findByLang = (list: CaptionTrack[], lang: string) => + list.find(t => t.languageCode === lang) || + list.find(t => t.languageCode.startsWith(lang.split('-')[0])); + + // If user requested a specific language, try that first + if (preferredLang) { + const manualMatch = findByLang(manual, preferredLang); + if (manualMatch) return manualMatch; + const autoMatch = findByLang(auto, preferredLang); + if (autoMatch) return autoMatch; + } - if (!response.ok) { - throw new Error(`Failed to load caption track (${response.status})`); + // No preference or preferred not found — pick the best available + if (manual.length > 0) { + const englishManual = findByLang(manual, 'en'); + return englishManual || manual[0]; } - const xmlText = await response.text(); - if (!xmlText.trim()) { - throw new Error('Caption track response was empty'); + if (auto.length > 0) { + const englishAuto = findByLang(auto, 'en'); + return englishAuto || auto[0]; } - return transformCaptionXmlToSegments(xmlText); + return tracks[0]; } +// ─── Error Classification ─────────────────────────────────────────────────── + +/** + * Determines whether we should try the next client identity after an error. + * Some errors are about THIS client being blocked (try another), + * while others are about the VIDEO itself (no point retrying). + */ +function shouldTryNextClient(error: TranscriptProviderError): boolean { + switch (error.code) { + // Client-specific — a different client might work + case 'BOT_DETECTED': + case 'IP_BLOCKED': + case 'INNERTUBE_REJECTED': + case 'PAGE_FETCH_FAILED': + return true; + // Video-level — no point retrying + case 'VIDEO_UNAVAILABLE': + case 'AGE_RESTRICTED': + case 'TRANSCRIPTS_DISABLED': + case 'NO_TRANSCRIPT': + return false; + default: + return true; + } +} + +// ─── Main Entry Point ─────────────────────────────────────────────────────── + +/** + * Fetches a YouTube video's transcript using the InnerTube API. + * + * Tries three client identities in sequence (Android → Web → iOS). + * Each client mimics a different YouTube app, and YouTube's bot detection + * treats them differently. If one gets blocked, the next might work. + * + * @param videoId - The 11-character YouTube video ID + * @param preferredLanguage - Optional language code (e.g., 'en', 'zh', 'ja') + * @param expectedDuration - Optional expected video duration in seconds + * @returns The transcript segments, language info, and available languages + */ export async function fetchYouTubeTranscript( videoId: string, preferredLanguage?: string, expectedDuration?: number ): Promise { - let tracks: CaptionTrack[] = []; - + // Step 1: Scrape the watch page for InnerTube credentials + // (needed by the Web client; Android/iOS have hardcoded keys) + let pageData: PageData | null = null; try { - tracks = await fetchCaptionTracksFromInnerTube(videoId); - } catch (error) { - console.warn('[TRANSCRIPT] Failed to fetch caption tracks via InnerTube', { - videoId, - error: error instanceof Error ? error.message : String(error), - }); + pageData = await scrapeWatchPage(videoId); + } catch (err) { + // Page scraping failed — we can still try Android/iOS with hardcoded keys + console.warn('[YT-TRANSCRIPT] Page scraping failed, will try with hardcoded keys:', err); } - if (tracks.length === 0) { - const watchHtml = await fetchWatchHtml(videoId); - tracks = extractCaptionTracksFromWatchHtml(watchHtml); - } - - if (tracks.length === 0) { - return null; - } + // Step 2: Try each client identity until one works + let lastError: TranscriptProviderError | null = null; - const candidates = buildCaptionTrackCandidates(tracks, preferredLanguage); - const availableLanguages = dedupeLanguages(tracks); - const isPreferredMatch = (languageCode: string) => ( - !!preferredLanguage && getLanguageRoot(languageCode) === getLanguageRoot(preferredLanguage) - ); - let bestMatch: { - track: CaptionTrack; - segments: { text: string; start: number; duration: number }[]; - duration: number; - } | null = null; - let hadTrackFetchError = false; - - for (const track of candidates) { - if (preferredLanguage && !isPreferredMatch(track.languageCode)) { + for (const client of CLIENTS) { + // Web client needs the scraped page data for its API key + if (client.clientName === 'WEB' && !pageData?.apiKey) { + console.log(`[YT-TRANSCRIPT] Skipping ${client.name} client — no page data available`); continue; } + console.log(`[YT-TRANSCRIPT] Trying ${client.name} client for video ${videoId}`); + try { - const segments = await fetchTrackSegments(track); - if (segments.length === 0) { + // Step 2a: Get caption tracks from InnerTube + const captionTracks = await fetchInnerTubePlayer(videoId, client, pageData); + + console.log(`[YT-TRANSCRIPT] ${client.name} returned ${captionTracks.length} caption tracks:`, + captionTracks.map(t => `${t.languageCode}${t.kind === 'asr' ? ' (auto)' : ''}`).join(', ') + ); + + // Step 2b: Pick the best track for the requested language + const selectedTrack = selectBestTrack(captionTracks, preferredLanguage); + if (!selectedTrack) { + lastError = new TranscriptProviderError('NO_TRANSCRIPT', 'No suitable caption track found'); continue; } - const duration = calculateTranscriptDuration(segments); + console.log(`[YT-TRANSCRIPT] Selected track: ${selectedTrack.languageCode}${selectedTrack.kind === 'asr' ? ' (auto-generated)' : ' (manual)'}`); - if (preferredLanguage) { - return { - segments, - language: track.languageCode, - availableLanguages, - }; - } + // Step 2c: Download and parse the caption track + const segments = await fetchCaptionTrack(selectedTrack.baseUrl); - if (!bestMatch || duration > bestMatch.duration) { - bestMatch = { track, segments, duration }; + if (segments.length === 0) { + lastError = new TranscriptProviderError('CAPTION_FETCH_FAILED', 'Caption track returned empty'); + continue; } - const meetsCoverageThreshold = expectedDuration - ? duration >= expectedDuration * 0.5 - : duration >= 300 || candidates.length === 1; - - if (meetsCoverageThreshold) { - break; + console.log(`[YT-TRANSCRIPT] Successfully fetched ${segments.length} segments via ${client.name} client`); + + // Build the list of available languages from the caption tracks + const availableLanguages = [...new Set(captionTracks.map(t => t.languageCode))]; + + return { + segments, + language: selectedTrack.languageCode, + availableLanguages, + }; + + } catch (err) { + if (err instanceof TranscriptProviderError) { + lastError = err; + console.warn(`[YT-TRANSCRIPT] ${client.name} client failed:`, err.code, err.message); + + // If this error means the video itself is the problem, don't try other clients + if (!shouldTryNextClient(err)) { + return null; + } + } else { + lastError = new TranscriptProviderError('UNKNOWN', `${client.name} client threw: ${err}`); + console.warn(`[YT-TRANSCRIPT] ${client.name} client threw unexpected error:`, err); } - } catch (error) { - hadTrackFetchError = true; - console.warn('[TRANSCRIPT] Failed to fetch caption track', { - videoId, - languageCode: track.languageCode, - error: error instanceof Error ? error.message : String(error), - }); } } - if (preferredLanguage && hadTrackFetchError) { - throw new TranscriptProviderError(`Failed to fetch transcript for requested language: ${preferredLanguage}`); - } - - if (!bestMatch) { - if (hadTrackFetchError) { - throw new TranscriptProviderError('All caption track fetches failed'); - } - - return null; + // All clients failed + if (lastError) { + console.error(`[YT-TRANSCRIPT] All clients failed for ${videoId}. Last error:`, lastError.code, lastError.message); } - - return { - segments: bestMatch.segments, - language: bestMatch.track.languageCode, - availableLanguages, - }; + return null; }