diff --git a/lib/__tests__/youtube-transcript-provider.test.ts b/lib/__tests__/youtube-transcript-provider.test.ts
index f256c58..66ded7d 100644
--- a/lib/__tests__/youtube-transcript-provider.test.ts
+++ b/lib/__tests__/youtube-transcript-provider.test.ts
@@ -2,11 +2,8 @@ import test from 'node:test';
import assert from 'node:assert/strict';
import {
- buildCaptionTrackCandidates,
- extractCaptionTracksFromWatchHtml,
fetchYouTubeTranscript,
- transformCaptionJsonToSegments,
- transformCaptionXmlToSegments,
+ TranscriptProviderError,
} from '../youtube-transcript-provider';
function withMockFetch(
@@ -21,124 +18,177 @@ function withMockFetch(
});
}
-test('extractCaptionTracksFromWatchHtml returns caption tracks from player response', () => {
- const html = `
-
-
-
-
-
- `;
-
- const tracks = extractCaptionTracksFromWatchHtml(html);
-
- assert.deepEqual(tracks, [
- {
- baseUrl: 'https://example.com/en',
- languageCode: 'en',
- kind: undefined,
- name: 'English',
- },
- {
- baseUrl: 'https://example.com/fr-auto',
- languageCode: 'fr',
- kind: 'asr',
- name: 'Francais',
+// Minimal YouTube watch page HTML with INNERTUBE_API_KEY embedded
+// (our provider scrapes this before calling InnerTube)
+const FAKE_WATCH_PAGE = `
+
+`;
+
+test('fetchYouTubeTranscript returns transcript when Android client succeeds', async () => {
+ await withMockFetch(
+ async (input, init) => {
+ const url = typeof input === 'string' ? input : input.toString();
+
+ // Page scrape request
+ if (url.includes('youtube.com/watch')) {
+ return new Response(FAKE_WATCH_PAGE);
+ }
+
+ // InnerTube player request — return caption tracks
+ if (url.includes('/youtubei/v1/player')) {
+ return new Response(JSON.stringify({
+ playabilityStatus: { status: 'OK' },
+ captions: {
+ playerCaptionsTracklistRenderer: {
+ captionTracks: [
+ {
+ baseUrl: 'https://captions.test/en',
+ languageCode: 'en',
+ name: { simpleText: 'English' },
+ },
+ {
+ baseUrl: 'https://captions.test/fr',
+ languageCode: 'fr',
+ name: { simpleText: 'Francais' },
+ kind: 'asr',
+ },
+ ],
+ },
+ },
+ }));
+ }
+
+ // Caption track fetch — return XML in the <p t="…" d="…"> format (milliseconds)
+ if (url.startsWith('https://captions.test/en')) {
+ return new Response(`
+ hello & welcome
+ 'quoted'
+ `);
+ }
+
+ throw new Error(`Unexpected fetch URL: ${url}`);
},
- ]);
-});
+ async () => {
+ const result = await fetchYouTubeTranscript('video123');
-test('buildCaptionTrackCandidates prioritizes requested language and manual tracks', () => {
- const tracks = [
- { baseUrl: 'https://example.com/en-auto', languageCode: 'en', kind: 'asr', name: 'English auto' },
- { baseUrl: 'https://example.com/fr-auto', languageCode: 'fr', kind: 'asr', name: 'Francais auto' },
- { baseUrl: 'https://example.com/fr', languageCode: 'fr', kind: undefined, name: 'Francais' },
- { baseUrl: 'https://example.com/de', languageCode: 'de', kind: undefined, name: 'Deutsch' },
- ];
-
- const candidates = buildCaptionTrackCandidates(tracks, 'fr');
-
- assert.deepEqual(
- candidates.map((track) => track.baseUrl),
- [
- 'https://example.com/fr',
- 'https://example.com/fr-auto',
- 'https://example.com/en-auto',
- 'https://example.com/de',
- ]
+ assert.ok(result, 'Should return a result');
+ assert.equal(result.language, 'en');
+ assert.deepEqual(result.availableLanguages, ['en', 'fr']);
+ assert.equal(result.segments.length, 2);
+ assert.equal(result.segments[0].text, 'hello & welcome');
+ assert.equal(result.segments[0].start, 0.42);
+ assert.equal(result.segments[0].duration, 4.2);
+ assert.equal(result.segments[1].text, "'quoted'");
+ }
);
});
-test('transformCaptionJsonToSegments decodes entities and ignores empty events', () => {
- const segments = transformCaptionJsonToSegments({
- events: [
- {
- tStartMs: 1500,
- dDurationMs: 2500,
- segs: [{ utf8: 'Hello & ' }, { utf8: 'welcome' }],
- },
- {
- tStartMs: 4000,
- dDurationMs: 1000,
- },
- {
- tStartMs: 5000,
- dDurationMs: 1250,
- segs: [{ utf8: ''quoted'' }],
- },
- ],
- });
+test('fetchYouTubeTranscript prefers requested language', async () => {
+ await withMockFetch(
+ async (input) => {
+ const url = typeof input === 'string' ? input : input.toString();
- assert.deepEqual(segments, [
- {
- text: 'Hello & welcome',
- start: 1.5,
- duration: 2.5,
- },
- {
- text: "'quoted'",
- start: 5,
- duration: 1.25,
+ if (url.includes('youtube.com/watch')) {
+ return new Response(FAKE_WATCH_PAGE);
+ }
+
+ if (url.includes('/youtubei/v1/player')) {
+ return new Response(JSON.stringify({
+ playabilityStatus: { status: 'OK' },
+ captions: {
+ playerCaptionsTracklistRenderer: {
+ captionTracks: [
+ {
+ baseUrl: 'https://captions.test/en',
+ languageCode: 'en',
+ name: { simpleText: 'English' },
+ },
+ {
+ baseUrl: 'https://captions.test/fr',
+ languageCode: 'fr',
+ name: { simpleText: 'Francais' },
+ },
+ ],
+ },
+ },
+ }));
+ }
+
+ // Should request French since we asked for it
+ if (url.startsWith('https://captions.test/fr')) {
+ return new Response(`
+ bonjour
+ `);
+ }
+
+ if (url.startsWith('https://captions.test/en')) {
+ return new Response(`
+ hello
+ `);
+ }
+
+ throw new Error(`Unexpected fetch URL: ${url}`);
},
- ]);
+ async () => {
+ const result = await fetchYouTubeTranscript('video123', 'fr');
+
+ assert.ok(result);
+ assert.equal(result.language, 'fr');
+ assert.equal(result.segments[0].text, 'bonjour');
+ }
+ );
});
-test('transformCaptionXmlToSegments parses youtube timedtext xml', () => {
- const xml = `hello & welcome'quoted'`;
+test('fetchYouTubeTranscript returns null when video has no captions', async () => {
+ await withMockFetch(
+ async (input) => {
+ const url = typeof input === 'string' ? input : input.toString();
- const segments = transformCaptionXmlToSegments(xml);
+ if (url.includes('youtube.com/watch')) {
+ return new Response(FAKE_WATCH_PAGE);
+ }
- assert.deepEqual(segments, [
- {
- text: 'hello & welcome',
- start: 0.42,
- duration: 4.2,
- },
- {
- text: "'quoted'",
- start: 5.1,
- duration: 1.5,
+ if (url.includes('/youtubei/v1/player')) {
+ // No captions object at all
+ return new Response(JSON.stringify({
+ playabilityStatus: { status: 'OK' },
+ }));
+ }
+
+ throw new Error(`Unexpected fetch URL: ${url}`);
},
- ]);
+ async () => {
+ const result = await fetchYouTubeTranscript('video123');
+ assert.equal(result, null);
+ }
+ );
});
-test('fetchYouTubeTranscript preserves an explicitly requested language', async () => {
+test('fetchYouTubeTranscript tries next client when one is rate-limited', async () => {
+ let innerTubeCallCount = 0;
+
await withMockFetch(
async (input) => {
const url = typeof input === 'string' ? input : input.toString();
+ if (url.includes('youtube.com/watch')) {
+ return new Response(FAKE_WATCH_PAGE);
+ }
+
if (url.includes('/youtubei/v1/player')) {
+ innerTubeCallCount++;
+ // First call (Android) returns 429 rate limit
+ if (innerTubeCallCount === 1) {
+ return new Response('Too Many Requests', { status: 429 });
+ }
+ // Second call (Web) succeeds
return new Response(JSON.stringify({
+ playabilityStatus: { status: 'OK' },
captions: {
playerCaptionsTracklistRenderer: {
captionTracks: [
- {
- baseUrl: 'https://captions.test/fr',
- languageCode: 'fr',
- name: { simpleText: 'Francais' },
- },
{
baseUrl: 'https://captions.test/en',
languageCode: 'en',
@@ -150,39 +200,37 @@ test('fetchYouTubeTranscript preserves an explicitly requested language', async
}));
}
- if (url === 'https://captions.test/fr') {
- return new Response('bonjour');
- }
-
- if (url === 'https://captions.test/en') {
- return new Response('hello');
+ if (url.startsWith('https://captions.test/en')) {
+ return new Response(`
+ hello from fallback
+ `);
}
throw new Error(`Unexpected fetch URL: ${url}`);
},
async () => {
- const result = await fetchYouTubeTranscript('video123', 'fr', 1200);
-
- assert.equal(result?.language, 'fr');
- assert.deepEqual(result?.availableLanguages, ['fr', 'en']);
- assert.deepEqual(result?.segments, [
- {
- text: 'bonjour',
- start: 0,
- duration: 1,
- },
- ]);
+ const result = await fetchYouTubeTranscript('video123');
+
+ assert.ok(result, 'Should succeed via fallback client');
+ assert.equal(result.segments[0].text, 'hello from fallback');
+ // Should have tried at least 2 InnerTube calls (Android failed, Web succeeded)
+ assert.ok(innerTubeCallCount >= 2, `Expected >= 2 InnerTube calls, got ${innerTubeCallCount}`);
}
);
});
-test('fetchYouTubeTranscript throws when caption tracks exist but all fetches fail', async () => {
+test('fetchYouTubeTranscript parses legacy XML format', async () => {
await withMockFetch(
async (input) => {
const url = typeof input === 'string' ? input : input.toString();
+ if (url.includes('youtube.com/watch')) {
+ return new Response(FAKE_WATCH_PAGE);
+ }
+
if (url.includes('/youtubei/v1/player')) {
return new Response(JSON.stringify({
+ playabilityStatus: { status: 'OK' },
captions: {
playerCaptionsTracklistRenderer: {
captionTracks: [
@@ -197,14 +245,25 @@ test('fetchYouTubeTranscript throws when caption tracks exist but all fetches fa
}));
}
- if (url === 'https://captions.test/en') {
- return new Response('', { status: 500 });
+ // Return legacy XML format (seconds, <text> tags)
+ if (url.startsWith('https://captions.test/en')) {
+ return new Response(`
+
+ hello & welcome
+ goodbye
+ `);
}
throw new Error(`Unexpected fetch URL: ${url}`);
},
async () => {
- await assert.rejects(() => fetchYouTubeTranscript('video123'));
+ const result = await fetchYouTubeTranscript('video123');
+
+ assert.ok(result);
+ assert.equal(result.segments.length, 2);
+ assert.equal(result.segments[0].text, 'hello & welcome');
+ assert.equal(result.segments[0].start, 0.42);
+ assert.equal(result.segments[0].duration, 4.2);
}
);
});
diff --git a/lib/youtube-transcript-provider.ts b/lib/youtube-transcript-provider.ts
index 2a75142..018d140 100644
--- a/lib/youtube-transcript-provider.ts
+++ b/lib/youtube-transcript-provider.ts
@@ -1,72 +1,101 @@
-interface CaptionTrackName {
- simpleText?: string;
- runs?: Array<{ text?: string }>;
-}
-
-interface CaptionTrackRenderer {
- baseUrl?: string;
- languageCode?: string;
- kind?: string;
- name?: CaptionTrackName;
-}
-
-interface CaptionTrackListRenderer {
- captionTracks?: CaptionTrackRenderer[];
-}
-
-interface PlayerResponse {
- captions?: {
- playerCaptionsTracklistRenderer?: CaptionTrackListRenderer;
- };
-}
-
-interface CaptionEvent {
- tStartMs?: number;
- dDurationMs?: number;
- segs?: Array<{ utf8?: string }>;
-}
-
-export interface CaptionTrack {
- baseUrl: string;
- languageCode: string;
- kind?: string;
- name: string;
-}
-
-export interface CaptionJsonResponse {
- events?: CaptionEvent[];
-}
-
+/**
+ * YouTube Transcript Provider — fetches captions directly from YouTube
+ * without needing a paid API key.
+ *
+ * HOW IT WORKS:
+ * 1. Scrapes the YouTube watch page to extract YouTube's own internal API key
+ * (this is a public key embedded in every YouTube page, not a personal key)
+ * 2. Uses that key to call YouTube's InnerTube Player API, pretending to be
+ * a legitimate YouTube client (Android app, web browser, or iOS app)
+ * 3. The API response includes URLs to download the actual caption tracks
+ * 4. Downloads and parses the caption XML into transcript segments
+ *
+ * WHY MULTIPLE CLIENTS:
+ * YouTube actively blocks automated requests from server IPs. Different client
+ * identities have different bot-detection thresholds. If one gets blocked,
+ * we try the next. Think of it as having three disguises.
+ *
+ * FALLBACK CHAIN: Android → Web → iOS
+ *
+ * Based on the approach from github.com/JimLiu/baoyu-skills
+ */
+
+// ─── Types ──────────────────────────────────────────────────────────────────
+
+/** Result returned to the transcript route — matches the existing interface */
export interface TranscriptFetchResult {
segments: { text: string; start: number; duration: number }[];
language?: string;
availableLanguages: string[];
}
+/** What went wrong — helps us decide whether to retry with a different client */
+export type TranscriptErrorCode =
+ | 'BOT_DETECTED' // YouTube thinks we're a bot
+ | 'AGE_RESTRICTED' // Video needs login for age verification
+ | 'VIDEO_UNAVAILABLE' // Deleted, private, or region-locked
+ | 'TRANSCRIPTS_DISABLED' // Video has no captions at all
+ | 'NO_TRANSCRIPT' // Requested language not available
+ | 'IP_BLOCKED' // Rate limited (429) or reCAPTCHA
+ | 'PAGE_FETCH_FAILED' // Couldn't extract API key from page HTML
+ | 'INNERTUBE_REJECTED' // InnerTube API returned an error
+ | 'CAPTION_FETCH_FAILED' // Got caption URL but couldn't download it
+ | 'UNKNOWN';
+
export class TranscriptProviderError extends Error {
- constructor(message: string) {
+ code: TranscriptErrorCode;
+ constructor(code: TranscriptErrorCode, message: string) {
super(message);
this.name = 'TranscriptProviderError';
+ this.code = code;
}
}
-const INNERTUBE_PLAYER_URL = 'https://www.youtube.com/youtubei/v1/player?prettyPrint=false';
-const ANDROID_CLIENT_VERSION = '20.10.38';
-const ANDROID_USER_AGENT = `com.google.android.youtube/${ANDROID_CLIENT_VERSION} (Linux; U; Android 14)`;
+// ─── Client Identities ─────────────────────────────────────────────────────
+// Each identity mimics a different YouTube client. YouTube's bot detection
+// treats them differently, so if one is blocked, another might work.
-const PLAYER_RESPONSE_MARKERS = [
- 'var ytInitialPlayerResponse =',
- 'ytInitialPlayerResponse =',
- 'window["ytInitialPlayerResponse"] =',
- 'window[\'ytInitialPlayerResponse\'] =',
+interface ClientIdentity {
+ name: string;
+ clientName: string;
+ clientVersion: string;
+ userAgent: string;
+ // Some clients need an API key from the page, others use a hardcoded one
+ apiKey?: string;
+}
+
+const CLIENTS: ClientIdentity[] = [
+ {
+ name: 'Android',
+ clientName: 'ANDROID',
+ clientVersion: '20.10.38',
+ userAgent: 'com.google.android.youtube/20.10.38 (Linux; U; Android 14; en_US; Pixel 8 Pro Build/UD1A.231105.004) gzip',
+ apiKey: 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w',
+ },
+ {
+ name: 'Web',
+ clientName: 'WEB',
+ clientVersion: '2.20250326.00.00',
+ userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
+ },
+ {
+ name: 'iOS',
+ clientName: 'IOS',
+ clientVersion: '20.10.4',
+ userAgent: 'com.google.ios.youtube/20.10.4 (iPhone16,2; U; CPU iOS 18_3_2 like Mac OS X)',
+ apiKey: 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc',
+ },
];
+// ─── HTML Entity Decoding ───────────────────────────────────────────────────
+
const NAMED_HTML_ENTITIES: Record = {
'&': '&',
'<': '<',
'>': '>',
'"': '"',
''': "'",
+ ''': "'",
' ': ' ',
};
@@ -74,443 +103,479 @@ function decodeHtmlEntities(text: string): string {
return text
.replace(/([0-9a-f]+);/gi, (_, hex: string) => String.fromCodePoint(parseInt(hex, 16)))
.replace(/(\d+);/g, (_, decimal: string) => String.fromCodePoint(parseInt(decimal, 10)))
- .replace(/&(amp|lt|gt|quot|nbsp);|'/g, (entity) => NAMED_HTML_ENTITIES[entity] ?? entity);
+ .replace(/&(amp|lt|gt|quot|apos|nbsp);|'/g, (entity) => NAMED_HTML_ENTITIES[entity] ?? entity);
}
-function normalizeLanguageCode(code: string): string {
- return code.trim().toLowerCase();
-}
+// ─── Page Scraping ──────────────────────────────────────────────────────────
+// We need to scrape the YouTube watch page to get the InnerTube API key and
+// client version. These are embedded in the page's JavaScript.
-function getLanguageRoot(code: string): string {
- return normalizeLanguageCode(code).split(/[-_]/)[0] ?? normalizeLanguageCode(code);
+interface PageData {
+ apiKey: string;
+ clientVersion: string;
+ visitorData: string;
}
-function isManualTrack(track: CaptionTrack): boolean {
- return track.kind !== 'asr';
-}
+/**
+ * Fetches the YouTube watch page and extracts the internal API credentials.
+ * Also handles the EU cookie consent page (YouTube shows a consent form
+ * instead of the real page if you don't have cookies).
+ */
+async function scrapeWatchPage(videoId: string): Promise {
+ const url = `https://www.youtube.com/watch?v=${videoId}`;
+ const headers: Record = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
+ 'Accept-Language': 'en-US,en;q=0.9',
+ };
-function extractTrackName(name?: CaptionTrackName): string {
- if (!name) return 'Unknown';
- if (typeof name.simpleText === 'string' && name.simpleText.trim()) {
- return name.simpleText.trim();
+ let html: string;
+ try {
+ const resp = await fetch(url, { headers, redirect: 'follow' });
+ html = await resp.text();
+ } catch (err) {
+ throw new TranscriptProviderError('PAGE_FETCH_FAILED', `Failed to fetch YouTube page: ${err}`);
}
- const combined = name.runs
- ?.map((run) => run.text?.trim() ?? '')
- .join('')
- .trim();
-
- return combined || 'Unknown';
-}
-
-function extractJsonObjectAfterMarker(html: string, marker: string): string | null {
- const markerIndex = html.indexOf(marker);
- if (markerIndex === -1) return null;
-
- const objectStart = html.indexOf('{', markerIndex + marker.length);
- if (objectStart === -1) return null;
-
- let depth = 0;
- let inString = false;
- let isEscaped = false;
-
- for (let index = objectStart; index < html.length; index++) {
- const character = html[index];
-
- if (inString) {
- if (isEscaped) {
- isEscaped = false;
- continue;
- }
-
- if (character === '\\') {
- isEscaped = true;
- continue;
- }
-
- if (character === '"') {
- inString = false;
- }
-
- continue;
- }
-
- if (character === '"') {
- inString = true;
- continue;
- }
-
- if (character === '{') {
- depth += 1;
- continue;
- }
-
- if (character === '}') {
- depth -= 1;
-
- if (depth === 0) {
- return html.slice(objectStart, index + 1);
+ // Handle EU cookie consent — YouTube redirects to a consent page
+ // If we detect it, we extract the consent token and re-fetch with a cookie
+ if (html.includes('action="https://consent.youtube.com/s"')) {
+ console.log('[YT-TRANSCRIPT] Handling EU cookie consent redirect');
+ const consentMatch = html.match(/name="v" value="(.*?)"/);
+ if (consentMatch) {
+ const consentValue = consentMatch[1];
+ try {
+ const resp2 = await fetch(url, {
+ headers: {
+ ...headers,
+ 'Cookie': `CONSENT=YES+${consentValue}`,
+ },
+ redirect: 'follow',
+ });
+ html = await resp2.text();
+ } catch {
+ // If consent retry fails, continue with the original HTML
}
}
}
- return null;
-}
+ // Extract the three values we need from the page's JavaScript
+ const apiKeyMatch = html.match(/"INNERTUBE_API_KEY"\s*:\s*"([^"]+)"/);
+ const clientVersionMatch = html.match(/"INNERTUBE_CLIENT_VERSION"\s*:\s*"([^"]+)"/);
+ const visitorDataMatch = html.match(/"VISITOR_DATA"\s*:\s*"([^"]+)"/);
-function extractPlayerResponse(html: string): PlayerResponse | null {
- for (const marker of PLAYER_RESPONSE_MARKERS) {
- const jsonText = extractJsonObjectAfterMarker(html, marker);
- if (!jsonText) continue;
-
- try {
- return JSON.parse(jsonText) as PlayerResponse;
- } catch {
- continue;
+ if (!apiKeyMatch) {
+ // Check if video exists at all
+ if (html.includes('"playabilityStatus":{"status":"ERROR"')) {
+ throw new TranscriptProviderError('VIDEO_UNAVAILABLE', 'Video is unavailable');
}
- }
-
- return null;
-}
-
-function dedupeTracks(tracks: CaptionTrack[]): CaptionTrack[] {
- const seen = new Set();
-
- return tracks.filter((track) => {
- const key = `${track.languageCode}:${track.kind ?? 'manual'}:${track.baseUrl}`;
- if (seen.has(key)) {
- return false;
+ if (html.includes('Sign in to confirm your age') || html.includes('"LOGIN_REQUIRED"')) {
+ throw new TranscriptProviderError('AGE_RESTRICTED', 'Video is age-restricted');
}
-
- seen.add(key);
- return true;
- });
-}
-
-function mapCaptionTracks(rawTracks: CaptionTrackRenderer[] | undefined): CaptionTrack[] {
- if (!Array.isArray(rawTracks) || rawTracks.length === 0) {
- return [];
+ throw new TranscriptProviderError('PAGE_FETCH_FAILED', 'Could not extract INNERTUBE_API_KEY from page');
}
- return dedupeTracks(
- rawTracks.flatMap((track) => {
- if (typeof track.baseUrl !== 'string' || typeof track.languageCode !== 'string') {
- return [];
- }
-
- return [{
- baseUrl: track.baseUrl,
- languageCode: track.languageCode,
- kind: typeof track.kind === 'string' ? track.kind : undefined,
- name: extractTrackName(track.name),
- }];
- })
- );
+ return {
+ apiKey: apiKeyMatch[1],
+ clientVersion: clientVersionMatch?.[1] || '2.20250326.00.00',
+ visitorData: visitorDataMatch?.[1] || '',
+ };
}
-function extractCaptionTracksFromPlayerResponse(playerResponse: PlayerResponse | null): CaptionTrack[] {
- return mapCaptionTracks(playerResponse?.captions?.playerCaptionsTracklistRenderer?.captionTracks);
-}
+// ─── InnerTube API ──────────────────────────────────────────────────────────
-export function extractCaptionTracksFromWatchHtml(html: string): CaptionTrack[] {
- const playerResponse = extractPlayerResponse(html);
- return extractCaptionTracksFromPlayerResponse(playerResponse);
+interface CaptionTrack {
+ baseUrl: string;
+ languageCode: string;
+ name: string;
+ kind?: string; // "asr" = auto-generated
}
-function getTrackPriority(track: CaptionTrack, preferredLanguage?: string): number {
- let score = isManualTrack(track) ? 10 : 0;
-
- if (!preferredLanguage) {
- return getLanguageRoot(track.languageCode) === 'en' ? score + 100 : score;
+/**
+ * Calls YouTube's InnerTube Player API to get video metadata including caption tracks.
+ * This is the same API that YouTube's own apps use internally.
+ */
+async function fetchInnerTubePlayer(
+ videoId: string,
+ client: ClientIdentity,
+ pageData: PageData | null
+): Promise {
+ // Use the client's hardcoded API key, or the one scraped from the page
+ const apiKey = client.apiKey || pageData?.apiKey;
+ if (!apiKey) {
+ throw new TranscriptProviderError('PAGE_FETCH_FAILED', `No API key available for ${client.name} client`);
}
- const normalizedPreferredLanguage = normalizeLanguageCode(preferredLanguage);
- const normalizedTrackLanguage = normalizeLanguageCode(track.languageCode);
+ const endpoint = `https://www.youtube.com/youtubei/v1/player?key=${apiKey}`;
+
+ // Build the request body — mimics what YouTube's own clients send
+ const body: Record = {
+ videoId,
+ context: {
+ client: {
+ clientName: client.clientName,
+ clientVersion: client.clientVersion,
+ userAgent: client.userAgent,
+ hl: 'en',
+ gl: 'US',
+ ...(pageData?.visitorData ? { visitorData: pageData.visitorData } : {}),
+ },
+ },
+ };
- if (normalizedTrackLanguage === normalizedPreferredLanguage) {
- score += 200;
- } else if (getLanguageRoot(track.languageCode) === getLanguageRoot(preferredLanguage)) {
- score += 150;
- } else if (getLanguageRoot(track.languageCode) === 'en') {
- score += 100;
+ // Android and iOS clients need a "content check OK" flag
+ if (client.clientName === 'ANDROID' || client.clientName === 'IOS') {
+ body.contentCheckOk = true;
+ body.racyCheckOk = true;
}
- return score;
-}
-
-export function buildCaptionTrackCandidates(
- tracks: CaptionTrack[],
- preferredLanguage?: string
-): CaptionTrack[] {
- return [...tracks]
- .map((track, index) => ({ track, index }))
- .sort((left, right) => {
- const priorityDifference = getTrackPriority(right.track, preferredLanguage) - getTrackPriority(left.track, preferredLanguage);
- if (priorityDifference !== 0) {
- return priorityDifference;
- }
+ const headers: Record = {
+ 'Content-Type': 'application/json',
+ 'User-Agent': client.userAgent,
+ };
- return left.index - right.index;
- })
- .map(({ track }) => track);
-}
+ let response: Response;
+ try {
+ response = await fetch(endpoint, {
+ method: 'POST',
+ headers,
+ body: JSON.stringify(body),
+ });
+ } catch (err) {
+ throw new TranscriptProviderError('INNERTUBE_REJECTED', `InnerTube request failed for ${client.name}: ${err}`);
+ }
-export function transformCaptionJsonToSegments(payload: CaptionJsonResponse): { text: string; start: number; duration: number }[] {
- if (!Array.isArray(payload.events) || payload.events.length === 0) {
- return [];
+ if (response.status === 429) {
+ throw new TranscriptProviderError('IP_BLOCKED', `Rate limited (429) with ${client.name} client`);
}
- return payload.events.flatMap((event) => {
- if (!Array.isArray(event.segs) || event.segs.length === 0) {
- return [];
- }
+ if (!response.ok) {
+ throw new TranscriptProviderError('INNERTUBE_REJECTED', `InnerTube returned ${response.status} for ${client.name}`);
+ }
- const text = decodeHtmlEntities(
- event.segs
- .map((segment) => segment.utf8 ?? '')
- .join('')
- .replace(/\n/g, ' ')
- .trim()
- ).trim();
+ let data: Record;
+ try {
+ data = await response.json() as Record;
+ } catch {
+ throw new TranscriptProviderError('INNERTUBE_REJECTED', `Invalid JSON from InnerTube for ${client.name}`);
+ }
- if (!text) {
- return [];
+ // Check for playability errors
+ const playabilityStatus = data.playabilityStatus as Record | undefined;
+ if (playabilityStatus) {
+ const status = playabilityStatus.status as string;
+ if (status === 'ERROR' || status === 'UNPLAYABLE') {
+ throw new TranscriptProviderError('VIDEO_UNAVAILABLE', `Video is ${status.toLowerCase()}`);
}
-
- return [{
- text,
- start: (event.tStartMs ?? 0) / 1000,
- duration: Math.max((event.dDurationMs ?? 0) / 1000, 0),
- }];
- });
-}
-
-export function transformCaptionXmlToSegments(xmlText: string): { text: string; start: number; duration: number }[] {
- const srvSegments: { text: string; start: number; duration: number }[] = [];
- const srvParagraphRegex = /]*>([\s\S]*?)<\/p>/g;
- let match: RegExpExecArray | null;
-
- while ((match = srvParagraphRegex.exec(xmlText)) !== null) {
- const paragraphBody = match[3].replace(/
/gi, ' ');
- const segmentMatches = [...paragraphBody.matchAll(/]*>([\s\S]*?)<\/s>/g)];
- const rawText = segmentMatches.length > 0
- ? segmentMatches.map((segment) => segment[1]).join('')
- : paragraphBody.replace(/<[^>]+>/g, '');
- const text = decodeHtmlEntities(rawText).trim();
-
- if (!text) {
- continue;
+ if (status === 'LOGIN_REQUIRED') {
+ const reason = (playabilityStatus.reason as string) || '';
+ if (reason.includes('age') || reason.includes('Sign in')) {
+ throw new TranscriptProviderError('AGE_RESTRICTED', 'Video is age-restricted');
+ }
+ throw new TranscriptProviderError('BOT_DETECTED', `Login required: ${reason}`);
}
+ }
- srvSegments.push({
- text,
- start: parseInt(match[1], 10) / 1000,
- duration: parseInt(match[2], 10) / 1000,
- });
+ // Extract caption tracks from the response
+ const captions = data.captions as Record | undefined;
+ if (!captions) {
+ throw new TranscriptProviderError('TRANSCRIPTS_DISABLED', 'No captions object in InnerTube response');
}
- if (srvSegments.length > 0) {
- return srvSegments;
+ const tracklistRenderer = captions.playerCaptionsTracklistRenderer as Record | undefined;
+ if (!tracklistRenderer) {
+ throw new TranscriptProviderError('TRANSCRIPTS_DISABLED', 'No caption tracklist renderer');
}
- const legacySegments: { text: string; start: number; duration: number }[] = [];
- const legacyTextRegex = /([\s\S]*?)<\/text>/g;
+ const captionTracks = tracklistRenderer.captionTracks as Array> | undefined;
+ if (!captionTracks || captionTracks.length === 0) {
+ throw new TranscriptProviderError('TRANSCRIPTS_DISABLED', 'No caption tracks available');
+ }
- while ((match = legacyTextRegex.exec(xmlText)) !== null) {
- const text = decodeHtmlEntities(match[3]).trim();
- if (!text) {
- continue;
- }
+ // Map raw tracks to our CaptionTrack interface
+ return captionTracks
+ .filter(t => typeof t.baseUrl === 'string' && typeof t.languageCode === 'string')
+ .map(t => {
+ // Extract track name from either simpleText or runs format
+ const nameObj = t.name as Record | undefined;
+ let name = 'Unknown';
+ if (nameObj) {
+ if (typeof nameObj.simpleText === 'string') {
+ name = nameObj.simpleText;
+ } else if (Array.isArray(nameObj.runs)) {
+ name = (nameObj.runs as Array<{ text?: string }>)
+ .map(r => r.text || '')
+ .join('');
+ }
+ }
- legacySegments.push({
- text,
- start: parseFloat(match[1]),
- duration: parseFloat(match[2]),
+ return {
+ baseUrl: t.baseUrl as string,
+ languageCode: t.languageCode as string,
+ kind: typeof t.kind === 'string' ? t.kind : undefined,
+ name,
+ };
});
- }
-
- return legacySegments;
}
-function calculateTranscriptDuration(segments: { start: number; duration: number }[]): number {
- if (segments.length === 0) {
- return 0;
+// ─── Caption XML Parsing ────────────────────────────────────────────────────
+
+/**
+ * Parses YouTube's caption XML format into transcript segments.
+ *
+ * YouTube uses TWO different XML formats depending on the client/track:
+ *
+ * Format 1 (newer / fmt=3, used by InnerTube):
+ * <p t="1234" d="5678">Hello world</p>
+ * (timestamps in MILLISECONDS)
+ *
+ * Format 2 (older / fmt=1):
+ * <text start="1.234" dur="5.678">Hello world</text>
+ * (timestamps in SECONDS)
+ */
+function parseCaptionXml(xml: string): { text: string; start: number; duration: number }[] {
+ const segments: { text: string; start: number; duration: number }[] = [];
+
+ // Helper to clean caption text
+ function cleanText(raw: string): string {
+ return decodeHtmlEntities(
+ raw
+ .replace(/<[^>]*>/g, '') // Strip nested HTML tags (e.g. , )
+ .replace(/\n/g, ' ')
+ ).trim();
}
- const lastSegment = segments[segments.length - 1];
- return lastSegment.start + lastSegment.duration;
-}
+ // Try Format 1 first: <p t="…" d="…">text</p> (milliseconds)
+ const pRegex = /]*>([\s\S]*?)<\/p>/g;
+ let match;
+ let foundP = false;
-function dedupeLanguages(tracks: CaptionTrack[]): string[] {
- const seen = new Set();
+ while ((match = pRegex.exec(xml)) !== null) {
+ foundP = true;
+ // t and d are in milliseconds — divide by 1000 to get seconds
+ const start = (parseFloat(match[1]) || 0) / 1000;
+ const duration = (parseFloat(match[2]) || 0) / 1000;
+ const text = cleanText(match[3] || '');
- return tracks.flatMap((track) => {
- const normalized = normalizeLanguageCode(track.languageCode);
- if (seen.has(normalized)) {
- return [];
+ if (text) {
+ segments.push({ text, start, duration });
}
+ }
- seen.add(normalized);
- return [track.languageCode];
- });
-}
+ if (foundP) return segments;
-async function fetchWatchHtml(videoId: string): Promise {
- const response = await fetch(`https://www.youtube.com/watch?v=${videoId}&hl=en&persist_hl=1`, {
- headers: {
- 'Accept-Language': 'en-US,en;q=0.9',
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
- },
- cache: 'no-store',
- });
+ // Fallback to Format 2: <text start="…" dur="…">text</text> (seconds)
+ const textRegex = /]*>([\s\S]*?)<\/text>/g;
- if (!response.ok) {
- throw new Error(`Failed to load YouTube watch page (${response.status})`);
+ while ((match = textRegex.exec(xml)) !== null) {
+ const start = parseFloat(match[1]) || 0;
+ const duration = parseFloat(match[2]) || 0;
+ const text = cleanText(match[3] || '');
+
+ if (text) {
+ segments.push({ text, start, duration });
+ }
}
- return response.text();
+ return segments;
}
-async function fetchCaptionTracksFromInnerTube(videoId: string): Promise {
- const response = await fetch(INNERTUBE_PLAYER_URL, {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- 'User-Agent': ANDROID_USER_AGENT,
- },
- body: JSON.stringify({
- context: {
- client: {
- clientName: 'ANDROID',
- clientVersion: ANDROID_CLIENT_VERSION,
- },
+/**
+ * Downloads a caption track from the given URL and parses it.
+ */
+async function fetchCaptionTrack(baseUrl: string): Promise<{ text: string; start: number; duration: number }[]> {
+ // Ensure we get XML format
+ const url = baseUrl.includes('fmt=') ? baseUrl : `${baseUrl}&fmt=3`;
+
+ let response: Response;
+ try {
+ response = await fetch(url, {
+ headers: {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
+ 'Accept-Language': 'en-US,en;q=0.9',
},
- videoId,
- }),
- });
+ });
+ } catch (err) {
+ throw new TranscriptProviderError('CAPTION_FETCH_FAILED', `Failed to download caption track: ${err}`);
+ }
if (!response.ok) {
- throw new Error(`Failed to load YouTube player data (${response.status})`);
+ throw new TranscriptProviderError('CAPTION_FETCH_FAILED', `Caption track returned ${response.status}`);
}
- const playerResponse = (await response.json()) as PlayerResponse;
- return extractCaptionTracksFromPlayerResponse(playerResponse);
+ const xml = await response.text();
+ return parseCaptionXml(xml);
}
-async function fetchTrackSegments(track: CaptionTrack): Promise<{ text: string; start: number; duration: number }[]> {
- const response = await fetch(track.baseUrl, {
- headers: {
- 'Accept-Language': 'en-US,en;q=0.9',
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36,gzip(gfe)',
- },
- cache: 'no-store',
- });
+// ─── Language Selection ─────────────────────────────────────────────────────
+
+/**
+ * Picks the best caption track based on the user's language preference.
+ *
+ * Priority:
+ * 1. Manual (human-created) captions in the requested language
+ * 2. Auto-generated captions in the requested language
+ * 3. Manual captions in any language (prefer English)
+ * 4. Auto-generated captions in any language (prefer English)
+ */
+function selectBestTrack(
+ tracks: CaptionTrack[],
+ preferredLang?: string
+): CaptionTrack | null {
+ if (tracks.length === 0) return null;
+
+ // Separate manual vs auto-generated tracks
+ const manual = tracks.filter(t => t.kind !== 'asr');
+ const auto = tracks.filter(t => t.kind === 'asr');
+
+ // Helper: find a track matching a language code
+ const findByLang = (list: CaptionTrack[], lang: string) =>
+ list.find(t => t.languageCode === lang) ||
+ list.find(t => t.languageCode.startsWith(lang.split('-')[0]));
+
+ // If user requested a specific language, try that first
+ if (preferredLang) {
+ const manualMatch = findByLang(manual, preferredLang);
+ if (manualMatch) return manualMatch;
+ const autoMatch = findByLang(auto, preferredLang);
+ if (autoMatch) return autoMatch;
+ }
- if (!response.ok) {
- throw new Error(`Failed to load caption track (${response.status})`);
+ // No preference or preferred not found — pick the best available
+ if (manual.length > 0) {
+ const englishManual = findByLang(manual, 'en');
+ return englishManual || manual[0];
}
- const xmlText = await response.text();
- if (!xmlText.trim()) {
- throw new Error('Caption track response was empty');
+ if (auto.length > 0) {
+ const englishAuto = findByLang(auto, 'en');
+ return englishAuto || auto[0];
}
- return transformCaptionXmlToSegments(xmlText);
+ return tracks[0];
}
+// ─── Error Classification ───────────────────────────────────────────────────
+
+/**
+ * Determines whether we should try the next client identity after an error.
+ * Some errors are about THIS client being blocked (try another),
+ * while others are about the VIDEO itself (no point retrying).
+ */
+function shouldTryNextClient(error: TranscriptProviderError): boolean {
+ switch (error.code) {
+ // Client-specific — a different client might work
+ case 'BOT_DETECTED':
+ case 'IP_BLOCKED':
+ case 'INNERTUBE_REJECTED':
+ case 'PAGE_FETCH_FAILED':
+ return true;
+ // Video-level — no point retrying
+ case 'VIDEO_UNAVAILABLE':
+ case 'AGE_RESTRICTED':
+ case 'TRANSCRIPTS_DISABLED':
+ case 'NO_TRANSCRIPT':
+ return false;
+ default:
+ return true;
+ }
+}
+
+// ─── Main Entry Point ───────────────────────────────────────────────────────
+
+/**
+ * Fetches a YouTube video's transcript using the InnerTube API.
+ *
+ * Tries three client identities in sequence (Android → Web → iOS).
+ * Each client mimics a different YouTube app, and YouTube's bot detection
+ * treats them differently. If one gets blocked, the next might work.
+ *
+ * @param videoId - The 11-character YouTube video ID
+ * @param preferredLanguage - Optional language code (e.g., 'en', 'zh', 'ja')
+ * @param expectedDuration - Optional expected video duration in seconds
+ * @returns The transcript segments, language info, and available languages
+ */
export async function fetchYouTubeTranscript(
videoId: string,
preferredLanguage?: string,
expectedDuration?: number
): Promise {
- let tracks: CaptionTrack[] = [];
-
+ // Step 1: Scrape the watch page for InnerTube credentials
+ // (needed by the Web client; Android/iOS have hardcoded keys)
+ let pageData: PageData | null = null;
try {
- tracks = await fetchCaptionTracksFromInnerTube(videoId);
- } catch (error) {
- console.warn('[TRANSCRIPT] Failed to fetch caption tracks via InnerTube', {
- videoId,
- error: error instanceof Error ? error.message : String(error),
- });
+ pageData = await scrapeWatchPage(videoId);
+ } catch (err) {
+ // Page scraping failed — we can still try Android/iOS with hardcoded keys
+ console.warn('[YT-TRANSCRIPT] Page scraping failed, will try with hardcoded keys:', err);
}
- if (tracks.length === 0) {
- const watchHtml = await fetchWatchHtml(videoId);
- tracks = extractCaptionTracksFromWatchHtml(watchHtml);
- }
-
- if (tracks.length === 0) {
- return null;
- }
+ // Step 2: Try each client identity until one works
+ let lastError: TranscriptProviderError | null = null;
- const candidates = buildCaptionTrackCandidates(tracks, preferredLanguage);
- const availableLanguages = dedupeLanguages(tracks);
- const isPreferredMatch = (languageCode: string) => (
- !!preferredLanguage && getLanguageRoot(languageCode) === getLanguageRoot(preferredLanguage)
- );
- let bestMatch: {
- track: CaptionTrack;
- segments: { text: string; start: number; duration: number }[];
- duration: number;
- } | null = null;
- let hadTrackFetchError = false;
-
- for (const track of candidates) {
- if (preferredLanguage && !isPreferredMatch(track.languageCode)) {
+ for (const client of CLIENTS) {
+ // Web client needs the scraped page data for its API key
+ if (client.clientName === 'WEB' && !pageData?.apiKey) {
+ console.log(`[YT-TRANSCRIPT] Skipping ${client.name} client — no page data available`);
continue;
}
+ console.log(`[YT-TRANSCRIPT] Trying ${client.name} client for video ${videoId}`);
+
try {
- const segments = await fetchTrackSegments(track);
- if (segments.length === 0) {
+ // Step 2a: Get caption tracks from InnerTube
+ const captionTracks = await fetchInnerTubePlayer(videoId, client, pageData);
+
+ console.log(`[YT-TRANSCRIPT] ${client.name} returned ${captionTracks.length} caption tracks:`,
+ captionTracks.map(t => `${t.languageCode}${t.kind === 'asr' ? ' (auto)' : ''}`).join(', ')
+ );
+
+ // Step 2b: Pick the best track for the requested language
+ const selectedTrack = selectBestTrack(captionTracks, preferredLanguage);
+ if (!selectedTrack) {
+ lastError = new TranscriptProviderError('NO_TRANSCRIPT', 'No suitable caption track found');
continue;
}
- const duration = calculateTranscriptDuration(segments);
+ console.log(`[YT-TRANSCRIPT] Selected track: ${selectedTrack.languageCode}${selectedTrack.kind === 'asr' ? ' (auto-generated)' : ' (manual)'}`);
- if (preferredLanguage) {
- return {
- segments,
- language: track.languageCode,
- availableLanguages,
- };
- }
+ // Step 2c: Download and parse the caption track
+ const segments = await fetchCaptionTrack(selectedTrack.baseUrl);
- if (!bestMatch || duration > bestMatch.duration) {
- bestMatch = { track, segments, duration };
+ if (segments.length === 0) {
+ lastError = new TranscriptProviderError('CAPTION_FETCH_FAILED', 'Caption track returned empty');
+ continue;
}
- const meetsCoverageThreshold = expectedDuration
- ? duration >= expectedDuration * 0.5
- : duration >= 300 || candidates.length === 1;
-
- if (meetsCoverageThreshold) {
- break;
+ console.log(`[YT-TRANSCRIPT] Successfully fetched ${segments.length} segments via ${client.name} client`);
+
+ // Build the list of available languages from the caption tracks
+ const availableLanguages = [...new Set(captionTracks.map(t => t.languageCode))];
+
+ return {
+ segments,
+ language: selectedTrack.languageCode,
+ availableLanguages,
+ };
+
+ } catch (err) {
+ if (err instanceof TranscriptProviderError) {
+ lastError = err;
+ console.warn(`[YT-TRANSCRIPT] ${client.name} client failed:`, err.code, err.message);
+
+ // If this error means the video itself is the problem, don't try other clients
+ if (!shouldTryNextClient(err)) {
+ return null;
+ }
+ } else {
+ lastError = new TranscriptProviderError('UNKNOWN', `${client.name} client threw: ${err}`);
+ console.warn(`[YT-TRANSCRIPT] ${client.name} client threw unexpected error:`, err);
}
- } catch (error) {
- hadTrackFetchError = true;
- console.warn('[TRANSCRIPT] Failed to fetch caption track', {
- videoId,
- languageCode: track.languageCode,
- error: error instanceof Error ? error.message : String(error),
- });
}
}
- if (preferredLanguage && hadTrackFetchError) {
- throw new TranscriptProviderError(`Failed to fetch transcript for requested language: ${preferredLanguage}`);
- }
-
- if (!bestMatch) {
- if (hadTrackFetchError) {
- throw new TranscriptProviderError('All caption track fetches failed');
- }
-
- return null;
+ // All clients failed
+ if (lastError) {
+ console.error(`[YT-TRANSCRIPT] All clients failed for ${videoId}. Last error:`, lastError.code, lastError.message);
}
-
- return {
- segments: bestMatch.segments,
- language: bestMatch.track.languageCode,
- availableLanguages,
- };
+ return null;
}