From 337c915a74d7fa3842ba2e3875e5ce71593a8542 Mon Sep 17 00:00:00 2001 From: eD Thomas Date: Sat, 4 Apr 2026 21:28:49 -0400 Subject: [PATCH] Add Instagram data liberation support Adds a complete Instagram-to-WordPress migration pipeline using the same CDP-based approach as Wix and Squarespace extractors. Scripts: - scripts/instagram/discover.js: Scroll-based GraphQL interception to inventory all posts with metadata, captions, timestamps, locations - scripts/instagram/extract.js: Per-post extraction with ?img_index=N for carousel slides, deduplication by Instagram media ID - scripts/instagram/import.js: XML-RPC import with wp.uploadFile for media, wp.newPost with featured images, gallery blocks for carousels, correct backdated post dates, and source links to original posts Also includes: - prompts/instagram.md: User-facing migration prompt - tests/instagram.test.js: 32 unit tests covering data transformation, XML-RPC encoding, carousel deduplication, and content generation - DISCOVERIES.md entry documenting key findings - Updated AGENTS.md, README.md, cli.js for Instagram support Tested against a 308-post profile: 364 media files uploaded, all posts imported with correct dates, unique carousel slides in gallery blocks, and featured images. Zero failures. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitignore | 2 + AGENTS.md | 69 ++++- DISCOVERIES.md | 28 ++ README.md | 31 +++ cli.js | 33 ++- prompts/instagram.md | 77 ++++++ scripts/instagram/discover.js | 487 +++++++++++++++++++++++++++++++++ scripts/instagram/extract.js | 487 +++++++++++++++++++++++++++++++++ scripts/instagram/import.js | 496 ++++++++++++++++++++++++++++++++++ tests/instagram.test.js | 444 ++++++++++++++++++++++++++++++ 10 files changed, 2145 insertions(+), 9 deletions(-) create mode 100644 prompts/instagram.md create mode 100644 scripts/instagram/discover.js create mode 100644 scripts/instagram/extract.js create mode 100644 scripts/instagram/import.js create mode 100644 tests/instagram.test.js diff --git a/.gitignore b/.gitignore index f7f1325..a914d7e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ output/ node_modules/ .DS_Store *.log +output/ +*.db diff --git a/AGENTS.md b/AGENTS.md index 7c04f0c..ae2fa6d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -11,7 +11,7 @@ This file is the entry point for any AI agent using this repository. Read it bef 3. **User prompts** — non-technical users paste these into their AI to drive the whole migration 4. **A living playbook** — this repo improves itself through AI-contributed discoveries -Currently supports: **Wix** and **Squarespace**. Webflow and Shopify are planned. +Currently supports: **Wix**, **Squarespace**, and **Instagram**. Webflow and Shopify are planned. ## If you're helping a user migrate from Wix @@ -136,6 +136,47 @@ Generate a redirect map (old paths → new WP paths) for the user to configure i --- +## If you're helping a user migrate from Instagram + +Instagram requires an authenticated browser session — there's no public API or sitemap to crawl. 
+ +### Step 1 — Launch browser with CDP + +```bash +pkill -9 -f "Google Chrome" +sleep 3 +"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" \ + --remote-debugging-port=9222 \ + --user-data-dir="$HOME/.data-liberation/cdp-profile/chrome" \ + --restore-last-session +``` + +### Step 2 — Discover all posts + +```bash +node scripts/instagram/discover.js USERNAME --cdp-port 9222 +``` + +Scrolls the profile and intercepts GraphQL responses to build a complete post inventory. Use `--delay 3000` for large profiles. + +### Step 3 — Extract content and media + +```bash +node scripts/instagram/extract.js USERNAME --cdp-port 9222 +``` + +For each post, navigates to the individual post URL, captures metadata, and downloads full-resolution images. For carousel posts, uses `?img_index=N` to access each slide directly. + +### Step 4 — Import to WordPress.com + +```bash +node scripts/instagram/import.js --site your-wp-site --user your-user --token YOUR_APP_PASSWORD +``` + +Creates published posts with correct dates, featured images, gallery blocks for carousels, and source links back to Instagram. + +--- + ## Using Claude in Chrome MCP If the user has the Chrome DevTools MCP set up (`npx chrome-devtools-mcp@latest`), you can drive extraction directly from the browser without running scripts: @@ -183,6 +224,17 @@ This approach works for any JavaScript-heavy platform, not just Wix. 
| Admin UI noise in extracted content | Smart fallback heuristics filter admin shell text, sidebar artifacts | | Products/commerce | Extract metadata but skip import (WooCommerce out of scope) | +### Instagram + +| Problem | Solution | +|---|---| +| No public API or data export | Intercept GraphQL responses via CDP browser session | +| Authentication required | Connect to user's logged-in browser via `--cdp-port` | +| Carousel slides lazy-load | Use `?img_index=N` URL parameter to load each slide directly | +| CDN image URLs expire | Download media immediately during extraction | +| Rate limiting on scroll | Add `--delay 3000` for profiles with 200+ posts | +| Two API response formats | Handle both `edge_owner_to_timeline_media` and `xdt_api__v1__feed` shapes | + --- ## How to contribute improvements back @@ -224,7 +276,8 @@ data-liberation-agent/ ├── package.json ├── prompts/ │ ├── wix.md ← what users paste into their AI for a Wix migration -│ └── squarespace.md ← what users paste into their AI for a Squarespace migration +│ ├── squarespace.md ← what users paste into their AI for a Squarespace migration +│ └── instagram.md ← what users paste into their AI for an Instagram migration ├── scripts/ │ ├── wix/ │ │ ├── discover.js ← inventory the Wix site (sitemap + categorization) @@ -232,7 +285,11 @@ data-liberation-agent/ │ ├── squarespace/ │ │ ├── discover.js ← inventory via admin CDP or public JSON API │ │ ├── extract.js ← extract content via admin API interception + DOM fallback -│ │ └── import.js ← publish to WordPress.com via REST API +│ │ └── import.js ← publish to WordPress.com via XML-RPC +│ ├── instagram/ +│ │ ├── discover.js ← inventory an Instagram profile via GraphQL interception +│ │ ├── extract.js ← extract posts, carousel slides, and media +│ │ └── import.js ← publish to WordPress.com via XML-RPC │ └── import.js ← publish to WordPress.com via REST API (Wix) ├── examples/ │ ├── wix-api-blog-post.json ← example of Wix internal API response @@ -257,6 
+314,12 @@ data-liberation-agent/ - **Password-protected pages**: Admin extraction may fail without credentials. - **Content is HTML, not blocks**: Imported as custom HTML. Block conversion is planned but not yet implemented. +### Instagram +- **Stories and Reels**: Not yet supported — different GraphQL queries and video handling required. +- **Comments**: Post comment text is not extracted (only comment counts). +- **Private profiles**: Only works for the user's own profile or profiles they follow. +- **Very large profiles (1000+ posts)**: May hit rate limiting. Use `--delay 4000` or higher. + ### General - Import creates everything as **drafts** — the user must review and publish manually. - WordPress.com Application Passwords must be created by the user (AI can't do this step). diff --git a/DISCOVERIES.md b/DISCOVERIES.md index 9183eba..10f7e10 100644 --- a/DISCOVERIES.md +++ b/DISCOVERIES.md @@ -6,6 +6,34 @@ AI agents: when you contribute an improvement, add an entry here. See [CONTRIBUT --- +## 2026-04-03 — Instagram data extraction via CDP and GraphQL interception + +**Found by:** Claude + human contributor (live testing against a real 308-post Instagram profile) +**During:** Building Instagram support for the data-liberation-agent +**Type:** API endpoint | content type | architecture + +### What I found + +Instagram is a React app that communicates via GraphQL queries to `https://www.instagram.com/graphql/query/`. By connecting to an authenticated browser via CDP and intercepting responses during profile scroll, we capture structured JSON for every post. Key discoveries: + +1. **Carousel slide direct access via `?img_index=N`**: Individual carousel slides can be loaded by appending `?img_index=1`, `?img_index=2`, etc. to the post URL. This is significantly more reliable than clicking through carousel arrows in the DOM. + +2. **Carousel DOM has 3 `<li>` elements**: Instagram keeps previous, current, and next slides in the DOM simultaneously. Deduplication by Instagram media ID (the numeric prefix in CDN URLs like `/12345_67890.jpg`) is required to avoid capturing the same image from adjacent preloaded slides. + +3. **Scroll-based pagination is more reliable than direct GraphQL**: Making direct `fetch()` calls to the GraphQL endpoint triggers rate limiting. Scrolling the profile with 2-3 second delays lets Instagram's own IntersectionObserver trigger pagination naturally. + +4. **WordPress.com REST API doesn't support writes with app passwords**: Returns 401 for POST operations. XML-RPC (`wp.uploadFile`, `wp.newPost`) works correctly. The `post_date` must be sent as a `<string>` in `"YYYY-MM-DD HH:MM:SS"` format — WordPress ignores `<dateTime.iso8601>` typed values. + +### How it works + +Three-step pipeline: discover (scroll + intercept GraphQL) → extract (visit each post, use `?img_index=N` for carousels, download media) → import (XML-RPC `wp.uploadFile` for media, `wp.newPost` with `post_thumbnail` for featured images and gallery blocks for carousels). + +### Why it's better than the previous approach + +Instagram's built-in data export takes days, provides lower-resolution images, and has no location data. The CDP approach captures everything in real-time at full resolution with complete metadata. 
+ +--- + ## 2026-04-02 — Squarespace admin extraction via CDP **Found by:** Claude + human contributor (live testing against a Squarespace site) diff --git a/README.md b/README.md index 788411c..2a89e32 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ This repo gives people a prompt they can paste into any AI assistant (Claude, Ch |---|---|---| | **Wix** | Ready | [`prompts/wix.md`](./prompts/wix.md) | | **Squarespace** | Ready | [`prompts/squarespace.md`](./prompts/squarespace.md) | +| **Instagram** | Ready | [`prompts/instagram.md`](./prompts/instagram.md) | | Webflow | Planned | — | | Shopify (blog/pages) | Planned | — | @@ -56,6 +57,26 @@ node scripts/squarespace/import.js --site your-wp-site \ --username your-user --token YOUR_APP_PASSWORD ``` +## Quick start (Instagram) + +```bash +# 1. Install dependencies +npm install + +# 2. Launch Chrome with remote debugging (Instagram requires an authenticated session) +"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" \ + --remote-debugging-port=9222 --user-data-dir="$HOME/.data-liberation/cdp-profile/chrome" + +# 3. Log into Instagram in the browser, then discover all posts +node scripts/instagram/discover.js YOUR_USERNAME --cdp-port 9222 + +# 4. Extract content and download all media +node scripts/instagram/extract.js YOUR_USERNAME --cdp-port 9222 + +# 5. Import to WordPress.com +node scripts/instagram/import.js --site your-wp-site --user your-user --token YOUR_APP_PASSWORD +``` + Or skip all of that and **paste the prompt into your AI assistant** — it will handle everything. ## For AI agents @@ -85,6 +106,16 @@ This means the playbook gets smarter with every migration. - [ ] Block conversion (`core/paragraph`, `core/image`, etc.) 
- [ ] Product/commerce migration +### Instagram +- [x] Profile discovery via GraphQL interception +- [x] Post extraction with full metadata (captions, dates, locations, hashtags) +- [x] Carousel slide extraction via `?img_index=N` +- [x] Gallery block output for carousel posts +- [x] Media download (photos and videos) +- [x] WordPress.com XML-RPC import with featured images +- [ ] Stories and Reels extraction +- [ ] Comment extraction + ### General - [x] WordPress.com REST API import script - [ ] WordPress Studio local-first workflow diff --git a/cli.js b/cli.js index 8d1d768..4dfb17b 100644 --- a/cli.js +++ b/cli.js @@ -267,6 +267,7 @@ function getBrowserUserAgent(browser) { function detectPlatform(url) { const lower = url.toLowerCase(); + if (lower.includes('instagram.com')) return 'instagram'; if (lower.includes('wix.com') || lower.includes('wixsite.com')) return 'wix'; if (lower.includes('squarespace.com')) return 'squarespace'; if (lower.includes('webflow.io') || lower.includes('webflow.com')) return 'webflow'; @@ -405,6 +406,7 @@ async function main() { heading('Login Detection'); const platforms = [ + { name: 'Instagram', domain: '.instagram.com' }, { name: 'Wix', domain: '.wix.com' }, { name: 'Squarespace', domain: '.squarespace.com' }, { name: 'Webflow', domain: '.webflow.com' }, @@ -455,12 +457,13 @@ async function main() { ok(`Detected platform: ${BOLD}${detectedPlatform}${RESET}`); } else { const platChoice = await askChoice('Which platform is this site on?', [ + { label: 'Instagram', value: 'instagram' }, { label: 'Wix', value: 'wix' }, { label: 'Squarespace', value: 'squarespace' }, { label: 'Webflow', value: 'webflow' }, { label: 'Other / not sure', value: 'unknown' }, ]); - // Use first choice as default if detection fails + detectedPlatform = platChoice.value; } const activePlatform = detectedPlatform !== 'unknown' ? 
detectedPlatform : 'wix'; @@ -521,13 +524,31 @@ async function main() { // ── Step 4: Run discovery ── heading('Step 1: Discovering Site Content'); - log(`Scanning ${siteUrl} for all pages, posts, and media...\n`); mkdirSync('output', { recursive: true }); - const uaArgs = userAgent ? ['--user-agent', userAgent] : []; + // Instagram uses the browser's own UA via CDP — user-agent flag is only for Wix/other platforms + const uaArgs = (userAgent && activePlatform !== 'instagram') ? ['--user-agent', userAgent] : []; const cdpArgs = cdpPort ? ['--cdp-port', String(cdpPort)] : []; - const discoverResult = await runScript(`scripts/${activePlatform}/discover.js`, [siteUrl, ...uaArgs, ...cdpArgs]); + + // Instagram uses a username, not a site URL + let discoverTarget = siteUrl; + if (activePlatform === 'instagram') { + // Extract username from URL or use as-is + const igMatch = siteUrl.match(/instagram\.com\/([^/?]+)/); + discoverTarget = igMatch ? igMatch[1] : siteUrl.replace(/^https?:\/\//, '').replace(/\/$/, ''); + log(`Discovering posts for @${discoverTarget}...\n`); + if (!cdpPort) { + fail('Instagram requires a CDP connection to an authenticated browser.'); + fail('Launch Chrome with: google-chrome --remote-debugging-port=9222'); + rl.close(); + return; + } + } else { + log(`Scanning ${siteUrl} for all pages, posts, and media...\n`); + } + + const discoverResult = await runScript(`scripts/${activePlatform}/discover.js`, [discoverTarget, ...uaArgs, ...cdpArgs]); if (discoverResult.code !== 0) { fail('Discovery failed. See output above.'); const retry = await ask('Try again? (y/n)'); @@ -558,10 +579,10 @@ async function main() { // ── Step 5: Extract content ── heading('Step 2: Extracting Content'); - log(`Extracting all pages, posts, and media from ${siteUrl}...\n`); + log(`Extracting all content from ${activePlatform === 'instagram' ? 
'@' + discoverTarget : siteUrl}...\n`); const extractResult = await runScript(`scripts/${activePlatform}/extract.js`, [ - siteUrl, + activePlatform === 'instagram' ? discoverTarget : siteUrl, '--url-list', 'output/inventory.json', ...uaArgs, ...cdpArgs diff --git a/prompts/instagram.md b/prompts/instagram.md new file mode 100644 index 0000000..404a7b7 --- /dev/null +++ b/prompts/instagram.md @@ -0,0 +1,77 @@ +# Instagram to WordPress.com Migration Prompt + +Copy everything below this line and paste it into your AI assistant (Claude, ChatGPT, Gemini, etc.). + +--- + +I want to migrate my Instagram photos and posts to WordPress.com. My Instagram username is: **[PASTE YOUR USERNAME HERE]** + +I have (or will create) a WordPress.com account. Please help me migrate using the playbook at https://github.com/Automattic/data-liberation-agent — read AGENTS.md first for full instructions. + +**Important**: Instagram requires an authenticated browser session. I'll need to have Chrome (or another Chromium browser) open and logged into Instagram before we start. + +Here's what I need you to do: + +## Step 1: Set up browser access + +Help me launch Chrome with remote debugging enabled so the migration scripts can connect: + +1. Quit Chrome completely +2. Relaunch with: `"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --remote-debugging-port=9222 --user-data-dir="$HOME/.data-liberation/cdp-profile/chrome" --restore-last-session` +3. Log into Instagram in the browser window that opens +4. Confirm the connection works + +## Step 2: Discover all my posts + +```bash +node scripts/instagram/discover.js MY_USERNAME --cdp-port 9222 +``` + +This connects to my browser, navigates to my profile, and intercepts Instagram's internal GraphQL API responses as it scrolls through all my posts. 
 It captures: +- Post metadata (captions, dates, locations, hashtags, tagged users) +- Post types (photos, videos, carousels with slide counts) +- Image and video URLs +- Profile information + +Show me the inventory summary and wait for my approval before proceeding. + +**If it stalls or gets rate limited**: Add `--delay 3000` for a gentler pace. + +## Step 3: Extract full content and download media + +```bash +node scripts/instagram/extract.js MY_USERNAME --cdp-port 9222 +``` + +This visits each post individually to get: +- Full-resolution images (not thumbnails) +- All carousel slides (uses `?img_index=N` to access each slide directly) +- Video URLs +- Tagged users, location details, accessibility captions + +All media is downloaded locally — Instagram CDN URLs expire, so this must happen promptly after discovery. + +## Step 4: Import to WordPress.com + +```bash +node scripts/instagram/import.js --site my-wp-site.wordpress.com --user MY_WP_USERNAME --token MY_APP_PASSWORD +``` + +This creates WordPress posts from the extracted data: +- Each Instagram post becomes a WordPress post (as draft) +- Images uploaded to the media library +- Captions become post content with @mentions and #hashtags linked +- Original post date preserved +- Instagram shortcode and URL stored as post meta + +**For a custom post type** (e.g. a "photo" CPT): add `--post-type photo` + +## Step 5: Verify + +When done: +- Show me how many posts were imported vs. discovered +- Flag any posts with missing images or import errors +- Check that carousel posts have all their slides +- List the date range covered (oldest → newest) + +Work methodically — do one step at a time, show me progress, and wait for my go-ahead before moving to the next step. 
/**
 * Read an integer-valued command-line option such as `--delay 3000`.
 *
 * The token following `name` must consist solely of an optionally signed run
 * of decimal digits. On a missing or malformed value an error is printed and
 * the process exits with status 1.
 *
 * @param {string} name - Flag to look for, e.g. `'--cdp-port'`.
 * @param {number|null} fallback - Value returned when the flag is absent.
 * @param {string[]} [argv=args] - Argument list to search; defaults to the
 *   module-level `args` array, so existing two-argument calls are unchanged.
 * @returns {number|null} The parsed integer, or `fallback` when `name` does
 *   not appear in `argv`.
 */
function parseIntArg(name, fallback, argv = args) {
  const idx = argv.indexOf(name);
  if (idx === -1) return fallback;

  const raw = argv[idx + 1];
  // parseInt() on its own stops at the first non-digit, so "3s" would be
  // accepted as 3 (milliseconds) and "9222x" as 9222. Validate the whole
  // token before converting.
  const wellFormed = typeof raw === 'string' && /^[+-]?\d+$/.test(raw.trim());
  const val = wellFormed ? Number.parseInt(raw, 10) : NaN;
  if (!Number.isFinite(val)) {
    console.error(`Error: ${name} requires a numeric value.`);
    process.exit(1);
  }
  return val;
}
/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Classify a GraphQL media node as `'carousel'`, `'video'` or `'photo'`.
 *
 * Carousel detection takes precedence over video detection; anything that is
 * neither is reported as a photo.
 *
 * @param {object} node - GraphQL media node.
 * @returns {'carousel'|'video'|'photo'} The post type.
 */
function classifyPost(node) {
  if (node.__typename === 'GraphSidecar' || node.edge_sidecar_to_children) return 'carousel';
  if (node.__typename === 'GraphVideo' || node.is_video) return 'video';
  return 'photo';
}

/**
 * Flatten a GraphQL timeline edge node into the inventory record format.
 *
 * Missing optional fields are reported as `null` (or `''` for the caption) so
 * that every record serializes to JSON with the same set of keys.
 *
 * @param {object} node - GraphQL media node (the `edge.node` of a timeline edge).
 * @returns {object} Plain record with id, shortcode, type, timestamp, ISO date,
 *   caption, media URLs, dimensions, location, like/comment counts, carousel
 *   slide count and canonical post URL.
 */
function extractPostMeta(node) {
  const caption = node.edge_media_to_caption?.edges?.[0]?.node?.text || '';

  // toISOString() throws RangeError on an invalid Date. A non-numeric or
  // out-of-range timestamp must not abort processing of the surrounding page
  // of posts, so fall back to null instead of throwing.
  const seconds = node.taken_at_timestamp;
  let date = null;
  if (seconds) {
    const parsed = new Date(seconds * 1000);
    if (!Number.isNaN(parsed.getTime())) date = parsed.toISOString();
  }

  return {
    id: node.id,
    shortcode: node.shortcode,
    type: classifyPost(node),
    timestamp: seconds ?? null,
    date,
    caption,
    displayUrl: node.display_url ?? null,
    thumbnailUrl: node.thumbnail_src || node.thumbnail_resources?.[0]?.src || null,
    dimensions: node.dimensions || null,
    isVideo: !!node.is_video,
    videoUrl: node.video_url || null,
    accessibilityCaption: node.accessibility_caption || null,
    locationName: node.location?.name || null,
    locationId: node.location?.id || null,
    // `??` rather than `||` so a genuine count of 0 is preserved.
    likes: node.edge_media_preview_like?.count ?? node.edge_liked_by?.count ?? null,
    comments: node.edge_media_to_comment?.count ?? node.edge_media_preview_comment?.count ?? null,
    // For carousels, note how many slides
    carouselCount: node.edge_sidecar_to_children?.edges?.length || null,
    url: `https://www.instagram.com/p/${node.shortcode}/`,
  };
}
null, + isPrivate: userData.is_private || false, + isVerified: userData.is_verified || false, + }; + console.log(` Profile: ${profileData.fullName} (@${profileData.username})`); + if (profileData.postCount) { + console.log(` Total posts reported: ${profileData.postCount}`); + } + } + + // Extract posts from the timeline media edges + const timeline = userData.edge_owner_to_timeline_media || + userData.edge_web_feed_timeline; + + if (timeline?.edges) { + for (const edge of timeline.edges) { + const node = edge.node; + if (node?.shortcode && !posts.has(node.shortcode)) { + posts.set(node.shortcode, extractPostMeta(node)); + } + } + // Track pagination cursor + if (timeline.page_info) { + hasMore = timeline.page_info.has_next_page; + endCursor = timeline.page_info.end_cursor; + } + paginationRequests++; + console.log(` Captured ${posts.size} posts (page ${paginationRequests})...`); + } + + // Handle the newer API format (xdt_api__v1__feed) + if (userData.edges) { + for (const edge of userData.edges) { + const node = edge.node; + if (node?.code && !posts.has(node.code)) { + posts.set(node.code, { + id: node.pk || node.id, + shortcode: node.code, + type: node.carousel_media_count ? 'carousel' : node.video_versions ? 'video' : 'photo', + timestamp: node.taken_at, + date: node.taken_at ? new Date(node.taken_at * 1000).toISOString() : null, + caption: node.caption?.text || '', + displayUrl: node.image_versions2?.candidates?.[0]?.url || '', + thumbnailUrl: node.image_versions2?.candidates?.slice(-1)?.[0]?.url || '', + dimensions: node.original_width && node.original_height + ? { width: node.original_width, height: node.original_height } : null, + isVideo: !!node.video_versions, + videoUrl: node.video_versions?.[0]?.url || null, + accessibilityCaption: node.accessibility_caption || null, + locationName: node.location?.name || null, + locationId: node.location?.pk || null, + likes: node.like_count ?? null, + comments: node.comment_count ?? 
null, + carouselCount: node.carousel_media_count || null, + url: `https://www.instagram.com/p/${node.code}/`, + }); + } + } + if (userData.page_info) { + hasMore = userData.page_info.has_next_page; + endCursor = userData.page_info.end_cursor; + } + paginationRequests++; + console.log(` Captured ${posts.size} posts (page ${paginationRequests})...`); + } + } + } catch { + // Not all responses are JSON — that's fine + } + }); + + // Navigate to the profile + const profileUrl = `https://www.instagram.com/${username}/`; + console.log(`\nNavigating to ${profileUrl}`); + try { + await page.goto(profileUrl, { waitUntil: 'domcontentloaded', timeout: 60000 }); + // Wait for the profile content to appear (response interceptor captures GraphQL data) + await sleep(3000); + } catch (e) { + if (posts.size > 0) { + console.log(` Navigation timeout, but ${posts.size} posts already captured — continuing`); + } else { + console.error(` Navigation failed: ${e.message}`); + await browser.disconnect(); + process.exit(1); + } + } + + // Check if we're logged in by looking for login prompts + const loginPrompt = await page.$('input[name="username"]'); + if (loginPrompt) { + console.error('\nError: Not logged into Instagram in this browser session.'); + console.error('Please log in to Instagram in your browser first, then re-run.'); + await browser.close(); + process.exit(1); + } + + // Check for private profile + const privateMsg = await page.$('text=This account is private'); + if (privateMsg) { + console.warn('\nWarning: This is a private profile. 
You can only extract your own posts or accounts you follow.'); + } + + // Also try to capture data from the page's window globals + const windowData = await page.evaluate(() => { + const result = {}; + + // Check specific known globals (avoid enumerating all of window) + for (const key of ['__additionalDataLoaded', '__NEXT_DATA__', '_sharedData']) { + if (window[key]) { + try { result[key] = window[key]; } catch {} + } + } + + // Check for require'd modules (older Instagram) + try { + if (window._sharedData?.entry_data?.ProfilePage?.[0]?.graphql?.user) { + result.profilePageData = window._sharedData.entry_data.ProfilePage[0].graphql.user; + } + } catch {} + + return result; + }); + + // Extract any posts from window globals + const globalUser = windowData?.profilePageData; + if (globalUser?.edge_owner_to_timeline_media?.edges) { + for (const edge of globalUser.edge_owner_to_timeline_media.edges) { + const node = edge.node; + if (node?.shortcode && !posts.has(node.shortcode)) { + posts.set(node.shortcode, extractPostMeta(node)); + } + } + if (!profileData && globalUser.username) { + profileData = { + username: globalUser.username, + fullName: globalUser.full_name || '', + biography: globalUser.biography || '', + profilePicUrl: globalUser.profile_pic_url_hd || '', + postCount: globalUser.edge_owner_to_timeline_media?.count ?? null, + followerCount: globalUser.edge_followed_by?.count ?? null, + followingCount: globalUser.edge_follow?.count ?? 
null, + isPrivate: globalUser.is_private || false, + isVerified: globalUser.is_verified || false, + }; + } + console.log(` Extracted ${posts.size} posts from page data`); + } + + // Phase 1: Scroll to load a couple pages and capture the GraphQL patterns + console.log('\nScrolling to load more posts...'); + let scrollAttempts = 0; + let lastPostCount = posts.size; + let noNewPostsStreak = 0; + let currentDelay = scrollDelay; + + while (posts.size < limit && scrollAttempts < 100) { + const scrollStart = Date.now(); + await page.evaluate(() => window.scrollTo(0, document.documentElement.scrollHeight)); + await sleep(currentDelay); + scrollAttempts++; + + if (posts.size === lastPostCount) { + noNewPostsStreak++; + // Back off when we're not getting new posts — Instagram may be throttling + currentDelay = Math.min(currentDelay * 1.5, scrollDelay * 4); + if (noNewPostsStreak >= 4) break; + } else { + noNewPostsStreak = 0; + // New posts arrived — ease back toward base delay + currentDelay = Math.max(scrollDelay, currentDelay * 0.8); + lastPostCount = posts.size; + console.log(` Scroll ${scrollAttempts}: ${posts.size} posts`); + } + } + + // Phase 2: If we have a cursor and need more posts, use direct GraphQL requests + // executed in the page context (inherits cookies and CSRF tokens) + if (hasMore && endCursor && posts.size < limit) { + console.log(`\nUsing direct GraphQL pagination (cursor available)...`); + + // First, extract the user ID from what we've already captured + const userId = profileData?.id || await page.evaluate((uname) => { + // Try to find the user ID from page source or existing data + const bodyText = document.body.innerHTML; + const idMatch = bodyText.match(/"profilePage_(\d+)"/); + return idMatch ? 
idMatch[1] : null; + }, username); + + if (!userId) { + // Get user ID via the profile page's metadata + const userIdFromMeta = await page.evaluate(() => { + // Instagram embeds the user ID in various places + const scripts = document.querySelectorAll('script'); + for (const s of scripts) { + const match = s.textContent.match(/"user_id":"(\d+)"/); + if (match) return match[1]; + } + // Also check meta tags + const instagramUrl = document.querySelector('meta[property="al:android:url"]')?.content; + if (instagramUrl) { + const m = instagramUrl.match(/(\d+)/); + if (m) return m[1]; + } + return null; + }); + + if (userIdFromMeta) { + if (!profileData) profileData = {}; + profileData.id = userIdFromMeta; + } + } + + const resolvedUserId = profileData?.id || userId; + + if (resolvedUserId) { + console.log(` User ID: ${resolvedUserId}`); + let cursor = endCursor; + let apiPage = 0; + let apiErrors = 0; + + while (hasMore && cursor && posts.size < limit && apiErrors < 3) { + apiPage++; + try { + // Execute the GraphQL query from within the page context + // This inherits all cookies, CSRF tokens, and headers + const result = await page.evaluate(async ({ userId, after }) => { + // Instagram uses a specific query hash for user media pagination + // We'll try the documented endpoint first + const variables = JSON.stringify({ + id: userId, + first: 12, + after: after, + }); + + // Try the newer API endpoint first + const url = `https://www.instagram.com/graphql/query/?query_hash=472f257a40c653c64c666ce877d59d2b&variables=${encodeURIComponent(variables)}`; + + const res = await fetch(url, { + headers: { + 'X-Requested-With': 'XMLHttpRequest', + 'Accept': '*/*', + }, + credentials: 'include', + }); + + if (!res.ok) return { error: `HTTP ${res.status}` }; + const text = await res.text(); + try { + return JSON.parse(text); + } catch { + return { error: `Not JSON (starts with: ${text.slice(0, 50)})` }; + } + }, { userId: resolvedUserId, after: cursor }); + + if (result.error) { + 
console.log(` API error: ${result.error}`); + // Rate limited or challenged — wait and retry once + apiErrors++; + console.log(` Waiting 10s and retrying (${apiErrors}/3)...`); + await sleep(10000); + continue; + } + + const media = result?.data?.user?.edge_owner_to_timeline_media; + if (media?.edges) { + for (const edge of media.edges) { + const node = edge.node; + if (node?.shortcode && !posts.has(node.shortcode)) { + posts.set(node.shortcode, extractPostMeta(node)); + } + } + hasMore = media.page_info?.has_next_page ?? false; + cursor = media.page_info?.end_cursor ?? null; + console.log(` API page ${apiPage}: ${posts.size} total posts`); + } else { + // Response format might have changed — try to find data elsewhere + console.log(` Unexpected response format on page ${apiPage}`); + break; + } + + // Rate limit: be polite — API calls need more delay than scrolling + await sleep(Math.max(scrollDelay, 3000)); + } catch (e) { + console.log(` API pagination error: ${e.message}`); + break; + } + } + } else { + console.log(' Could not determine user ID — falling back to scroll only'); + // Continue scrolling as fallback + while (posts.size < limit && scrollAttempts < 200) { + await page.evaluate(() => window.scrollTo(0, document.documentElement.scrollHeight)); + await sleep(scrollDelay); + scrollAttempts++; + if (posts.size > lastPostCount) { + lastPostCount = posts.size; + noNewPostsStreak = 0; + if (scrollAttempts % 5 === 0) console.log(` Scroll ${scrollAttempts}: ${posts.size} posts`); + } else { + noNewPostsStreak++; + if (noNewPostsStreak >= 8) break; + } + } + } + } + + await page.close(); + await browser.disconnect(); + + // Build inventory + const allPosts = [...posts.values()].sort((a, b) => (b.timestamp || 0) - (a.timestamp || 0)); + + const counts = { photo: 0, video: 0, carousel: 0 }; + for (const post of allPosts) { + counts[post.type] = (counts[post.type] || 0) + 1; + } + + const inventory = { + platform: 'instagram', + username, + profile: profileData, + 
discoveredAt: new Date().toISOString(), + counts, + urls: allPosts.map(p => ({ + url: p.url, + type: p.type, + id: p.id, + shortcode: p.shortcode, + })), + posts: allPosts, + }; + + writeFileSync('output/inventory.json', JSON.stringify(inventory, null, 2)); + + console.log('\nInventory summary:'); + for (const [type, count] of Object.entries(counts)) { + console.log(` ${type}: ${count}`); + } + console.log(`\nTotal: ${allPosts.length} posts discovered`); + if (profileData?.postCount) { + const pct = Math.round((allPosts.length / profileData.postCount) * 100); + console.log(`Coverage: ${allPosts.length}/${profileData.postCount} (${pct}%)`); + } + console.log('Written to output/inventory.json'); + console.log('\nReview this inventory before running extract.js'); +} + +main().catch(e => { console.error(e); process.exit(1); }); diff --git a/scripts/instagram/extract.js b/scripts/instagram/extract.js new file mode 100644 index 0000000..5a8e998 --- /dev/null +++ b/scripts/instagram/extract.js @@ -0,0 +1,487 @@ +#!/usr/bin/env node +/** + * extract.js — Step 2: Extract full content from Instagram posts + * + * Takes the inventory from discover.js and visits each post individually + * to get full-resolution images, carousel slides, video URLs, and comments. 
+ * + * Usage: + * node scripts/instagram/extract.js --cdp-port 9222 + * node scripts/instagram/extract.js --cdp-port 9222 --limit 10 + * node scripts/instagram/extract.js --cdp-port 9222 --skip-media + * + * Options: + * --cdp-port CDP port of your running browser (required) + * --delay Delay between posts (default: 1500) + * --limit Only process first N posts (for testing) + * --skip-media Extract metadata only, don't download images/videos + * --url-list Use inventory from discover.js (default: output/inventory.json) + * + * Output: + * output/pages/.json — extracted post data + * output/media/ — downloaded images and videos + * output/extraction-log.json — summary of what was extracted + */ + +import { chromium } from 'playwright'; +import { writeFileSync, mkdirSync, readFileSync, existsSync, createWriteStream } from 'fs'; +import { basename } from 'path'; +import https from 'https'; +import http from 'http'; + +const args = process.argv.slice(2); +const username = args.find(a => !a.startsWith('--')); +if (!username) { + console.error('Usage: node scripts/instagram/extract.js --cdp-port '); + process.exit(1); +} + +function parseIntArg(name, fallback) { + const idx = args.indexOf(name); + if (idx === -1) return fallback; + const val = parseInt(args[idx + 1], 10); + if (!Number.isFinite(val)) { + console.error(`Error: ${name} requires a numeric value.`); + process.exit(1); + } + return val; +} + +const cdpPort = parseIntArg('--cdp-port', null); +if (!cdpPort) { + console.error('Error: --cdp-port is required.'); + process.exit(1); +} + +const delay = parseIntArg('--delay', 1500); +const limit = parseIntArg('--limit', Infinity); +const skipMedia = args.includes('--skip-media'); +const urlListArg = args.indexOf('--url-list'); +const urlListFile = urlListArg !== -1 ? 
args[urlListArg + 1] : 'output/inventory.json'; + +mkdirSync('output/pages', { recursive: true }); +mkdirSync('output/media', { recursive: true }); + +function sleep(ms) { + return new Promise(r => setTimeout(r, ms)); +} + +function downloadFile(url, destPath, redirectsLeft = 5) { + return new Promise((resolve, reject) => { + if (redirectsLeft <= 0) return reject(new Error('Too many redirects')); + if (!url.startsWith('https://')) return reject(new Error(`Refused non-HTTPS URL: ${url}`)); + + const file = createWriteStream(destPath); + file.on('error', reject); + + const req = https.get(url, { headers: { 'User-Agent': 'Mozilla/5.0' }, timeout: 30000 }, res => { + if ([301, 302, 303, 307, 308].includes(res.statusCode)) { + res.resume(); // consume response to free socket + file.close(); + const location = res.headers.location; + if (!location?.startsWith('https://')) return reject(new Error(`Redirect to non-HTTPS: ${location}`)); + return downloadFile(location, destPath, redirectsLeft - 1).then(resolve).catch(reject); + } + if (res.statusCode !== 200) { + res.resume(); + file.close(); + return reject(new Error(`HTTP ${res.statusCode}`)); + } + res.pipe(file); + file.on('finish', () => file.close(resolve)); + }); + req.on('error', reject); + req.on('timeout', () => { req.destroy(); reject(new Error('Download timeout')); }); + }); +} + +// Sanitize shortcode for safe use as filename +function safeShortcode(sc) { + return sc.replace(/[^a-zA-Z0-9_-]/g, ''); +} + +// Generate a sane filename from an Instagram CDN URL +function mediaFilename(url, shortcode, index) { + const ext = url.match(/\.(jpg|jpeg|png|webp|mp4|mov)/i)?.[1] || 'jpg'; + return `${safeShortcode(shortcode)}_${index}.${ext.toLowerCase()}`; +} + +async function extractPostData(page, shortcode, isCarousel = false, carouselCount = 10) { + const postUrl = `https://www.instagram.com/p/${shortcode}/`; + const captured = { apiCalls: [], globals: null }; + + // Intercept API responses for this post + const 
responseHandler = async (response) => { + const url = response.url(); + if (!url.includes('/graphql/query') && !url.includes('/api/v1/media/')) return; + + const ct = response.headers()['content-type'] || ''; + if (!ct.includes('application/json') && !ct.includes('text/javascript')) return; + + try { + const body = await response.json(); + captured.apiCalls.push({ url, data: body }); + } catch {} + }; + + page.on('response', responseHandler); + + try { + await page.goto(postUrl, { waitUntil: 'domcontentloaded', timeout: 30000 }); + // Wait for the main post image or GraphQL response to land + await page.waitForSelector('article img[src*="cdninstagram.com"]', { timeout: 10000 }).catch(() => {}); + } catch (e) { + console.error(` Navigation failed: ${e.message}`); + } + + page.off('response', responseHandler); + + // Extract data from the page — Instagram embeds post data in script tags and window objects + captured.globals = await page.evaluate(() => { + const result = {}; + + // JSON-LD (Instagram provides this for public posts) + result.jsonLd = Array.from( + document.querySelectorAll('script[type="application/ld+json"]') + ).map(s => { + try { return JSON.parse(s.textContent); } catch { return null; } + }).filter(Boolean); + + // Check specific known globals (avoid enumerating all of window) + for (const key of ['__additionalDataLoaded', '__NEXT_DATA__', '_sharedData']) { + if (window[key]) { + try { result[key] = window[key]; } catch {} + } + } + + // Meta tags — Instagram sets good OG tags + result.meta = { + title: document.title, + description: document.querySelector('meta[name="description"]')?.content, + ogTitle: document.querySelector('meta[property="og:title"]')?.content, + ogDescription: document.querySelector('meta[property="og:description"]')?.content, + ogImage: document.querySelector('meta[property="og:image"]')?.content, + ogType: document.querySelector('meta[property="og:type"]')?.content, + }; + + return result; + }); + + // Try to find the post's 
media from the captured API calls + let postDetail = null; + for (const call of captured.apiCalls) { + const data = call.data; + + // Look for the post in various response shapes + const media = data?.data?.xdt_shortcode_media || + data?.graphql?.shortcode_media || + data?.data?.shortcode_media || + data?.items?.[0]; + + if (media && (media.shortcode === shortcode || media.code === shortcode)) { + postDetail = media; + break; + } + } + + // For carousel posts: click through the carousel arrows to capture each slide's + // full-res image. Instagram's carousel arrows are button[aria-label="Next"] and + // are NOT inside the article element — they're in a parent container. + const carouselSlides = []; + if (isCarousel) { + // Strategy: Instagram supports ?img_index=N to load a specific carousel slide. + // Navigate to each slide directly and grab the main post image. + const expectedSlides = carouselCount || 10; + + // Strategy: navigate to each ?img_index=N, collect ALL
  • images + // from every page load, then deduplicate by Instagram media ID + // (the numeric prefix in the CDN URL). Instagram keeps 3
  • elements + // in the DOM (previous, current, next), so we see overlap between + // adjacent slides — deduplication handles this cleanly. + const seenMediaIds = new Set(); + + const getAllSlideImages = async () => { + return page.evaluate(() => { + const results = []; + for (const img of document.querySelectorAll('li img')) { + const src = img.src || ''; + if (!src.includes('cdninstagram.com/v/t51.')) continue; + if (img.alt?.includes('User avatar')) continue; + const rect = img.getBoundingClientRect(); + if (rect.width < 300) continue; + // Extract the Instagram media ID (unique per photo) + const idMatch = src.match(/\/(\d{5,})_/); + results.push({ src, mediaId: idMatch?.[1] || null }); + } + // Also check for video + const video = document.querySelector('li video[src], li video source, video[src], video source'); + const videoUrl = video?.src || video?.querySelector?.('source')?.src || null; + if (videoUrl) results.push({ src: null, videoUrl, mediaId: 'video_' + Date.now() }); + return results; + }); + }; + + for (let slideIdx = 1; slideIdx <= expectedSlides; slideIdx++) { + try { + const slideUrl = `https://www.instagram.com/p/${shortcode}/?img_index=${slideIdx}`; + await page.goto(slideUrl, { waitUntil: 'domcontentloaded', timeout: 30000 }).catch(() => {}); + await page.waitForSelector('li img[src*="cdninstagram.com"]', { timeout: 8000 }).catch(() => {}); + await sleep(800); + + const images = await getAllSlideImages(); + for (const img of images) { + if (!img.mediaId || seenMediaIds.has(img.mediaId)) continue; + seenMediaIds.add(img.mediaId); + carouselSlides.push({ + type: img.videoUrl ? 
'video' : 'photo', + displayUrl: img.src || null, + videoUrl: img.videoUrl || null, + }); + } + } catch { + break; + } + } + + if (carouselSlides.length > 1) { + console.log(` Carousel: ${carouselSlides.length} slides captured`); + } + } + + return { captured, postDetail, carouselSlides }; +} + +function buildPostOutput(shortcode, inventoryPost, postDetail, globals, carouselSlides = []) { + const output = { + shortcode, + sourceUrl: `https://www.instagram.com/p/${shortcode}/`, + extractedAt: new Date().toISOString(), + // Start with inventory data as baseline + ...inventoryPost, + // Enrich with detail data if we got it + media: [], + tags: [], + mentionedUsers: [], + }; + + if (postDetail) { + // Full caption (inventory might have truncated it) + output.caption = postDetail.edge_media_to_caption?.edges?.[0]?.node?.text || + postDetail.caption?.text || + output.caption; + + // Location details + if (postDetail.location) { + output.location = { + name: postDetail.location.name, + id: postDetail.location.id || postDetail.location.pk, + slug: postDetail.location.slug || null, + lat: postDetail.location.lat || null, + lng: postDetail.location.lng || null, + address: (() => { + try { return postDetail.location.address_json ? JSON.parse(postDetail.location.address_json) : null; } + catch { return null; } + })(), + }; + } + + // Tagged users + const taggedEdges = postDetail.edge_media_to_tagged_user?.edges || []; + output.mentionedUsers = taggedEdges.map(e => ({ + username: e.node?.user?.username, + fullName: e.node?.user?.full_name, + x: e.node?.x, + y: e.node?.y, + })); + + // Extract all media items (handles single posts, carousels, and videos) + if (postDetail.edge_sidecar_to_children?.edges) { + // Carousel post + for (const edge of postDetail.edge_sidecar_to_children.edges) { + const child = edge.node; + output.media.push({ + type: child.is_video ? 
'video' : 'photo', + displayUrl: child.display_url || child.display_resources?.slice(-1)?.[0]?.src, + videoUrl: child.video_url || null, + dimensions: child.dimensions || null, + accessibilityCaption: child.accessibility_caption || null, + }); + } + } else if (postDetail.carousel_media) { + // Newer API format for carousels + for (const child of postDetail.carousel_media) { + output.media.push({ + type: child.video_versions ? 'video' : 'photo', + displayUrl: child.image_versions2?.candidates?.[0]?.url, + videoUrl: child.video_versions?.[0]?.url || null, + dimensions: child.original_width && child.original_height + ? { width: child.original_width, height: child.original_height } : null, + accessibilityCaption: child.accessibility_caption || null, + }); + } + } else { + // Single photo or video + output.media.push({ + type: postDetail.is_video ? 'video' : 'photo', + displayUrl: postDetail.display_url || + postDetail.image_versions2?.candidates?.[0]?.url, + videoUrl: postDetail.video_url || + postDetail.video_versions?.[0]?.url || null, + dimensions: postDetail.dimensions || + (postDetail.original_width && postDetail.original_height + ? { width: postDetail.original_width, height: postDetail.original_height } : null), + accessibilityCaption: postDetail.accessibility_caption || null, + }); + } + } else { + // No detail data — fall back to what discover gave us + if (inventoryPost?.displayUrl) { + output.media.push({ + type: inventoryPost.isVideo ? 
'video' : 'photo', + displayUrl: inventoryPost.displayUrl, + videoUrl: inventoryPost.videoUrl || null, + dimensions: inventoryPost.dimensions || null, + accessibilityCaption: inventoryPost.accessibilityCaption || null, + }); + } + } + + // For carousels: if we only got 1 media item from the API but we have + // carousel slides from clicking through the UI, use those instead + if (carouselSlides.length > 1 && output.media.length <= 1) { + output.media = carouselSlides.map(slide => ({ + ...slide, + dimensions: null, + accessibilityCaption: null, + })); + } + + // Extract hashtags and @mentions from caption + if (output.caption) { + output.tags = [...output.caption.matchAll(/#(\w+)/g)].map(m => m[1]); + const mentions = [...output.caption.matchAll(/@(\w+)/g)].map(m => m[1]); + // Merge with tagged users + for (const mention of mentions) { + if (!output.mentionedUsers.find(u => u.username === mention)) { + output.mentionedUsers.push({ username: mention }); + } + } + } + + // Add OG image as fallback if we have no media + if (output.media.length === 0 && globals?.meta?.ogImage) { + output.media.push({ + type: 'photo', + displayUrl: globals.meta.ogImage, + videoUrl: null, + dimensions: null, + accessibilityCaption: null, + }); + } + + return output; +} + +async function main() { + console.log(`Extracting Instagram posts for: ${username}`); + + // Load inventory + if (!existsSync(urlListFile)) { + console.error(`Inventory file not found: ${urlListFile}`); + console.error('Run discover.js first.'); + process.exit(1); + } + + const inventory = JSON.parse(readFileSync(urlListFile, 'utf8')); + const postsToProcess = (inventory.posts || []).slice(0, limit); + console.log(`Processing ${postsToProcess.length} posts from inventory...\n`); + + // Connect to browser + console.log(`Connecting to browser on CDP port ${cdpPort}...`); + const browser = await chromium.connectOverCDP(`http://127.0.0.1:${cdpPort}`); + const context = browser.contexts()[0] || await browser.newContext(); + 
const page = await context.newPage(); + console.log('Connected.\n'); + + const log = { processed: [], failed: [], mediaDownloaded: [] }; + const allMediaUrls = []; // { url, filename } pairs + + for (let i = 0; i < postsToProcess.length; i++) { + const inventoryPost = postsToProcess[i]; + const shortcode = inventoryPost.shortcode; + console.log(`[${i + 1}/${postsToProcess.length}] ${inventoryPost.url || shortcode}`); + + try { + const isCarousel = inventoryPost.type === 'carousel'; + const { captured, postDetail, carouselSlides } = await extractPostData( + page, shortcode, isCarousel, inventoryPost.carouselCount + ); + const output = buildPostOutput(shortcode, inventoryPost, postDetail, captured.globals, carouselSlides); + const safeCode = safeShortcode(shortcode); + + // Queue media for download and set local file paths before writing + for (let j = 0; j < output.media.length; j++) { + const item = output.media[j]; + const downloadUrl = item.videoUrl || item.displayUrl; + if (downloadUrl) { + const filename = mediaFilename(downloadUrl, shortcode, j); + allMediaUrls.push({ url: downloadUrl, filename }); + item.localFile = `output/media/${filename}`; + } + } + + writeFileSync(`output/pages/${safeCode}.json`, JSON.stringify(output, null, 2)); + + log.processed.push({ url: output.sourceUrl, shortcode }); + console.log(` Media items: ${output.media.length}, Tags: ${output.tags.length}`); + } catch (e) { + console.error(` FAILED: ${e.message}`); + log.failed.push({ shortcode, error: e.message }); + } + + if (i < postsToProcess.length - 1) await sleep(delay); + } + + await page.close(); + // Disconnect from the CDP session without closing the user's browser + if (typeof browser.disconnect === 'function') { + await browser.disconnect(); + } + + // Download all media (parallel with concurrency limit) + if (!skipMedia && allMediaUrls.length > 0) { + const CONCURRENCY = 8; + console.log(`\nDownloading ${allMediaUrls.length} media files (${CONCURRENCY} concurrent)...`); + let 
idx = 0; + + async function downloadWorker() { + while (idx < allMediaUrls.length) { + const i = idx++; + const { url, filename } = allMediaUrls[i]; + const dest = `output/media/${filename}`; + try { + await downloadFile(url, dest); + log.mediaDownloaded.push({ url, file: dest }); + process.stdout.write('.'); + } catch (e) { + log.failed.push({ url, error: `Media download: ${e.message}` }); + process.stdout.write('x'); + } + } + } + + await Promise.all(Array.from({ length: CONCURRENCY }, () => downloadWorker())); + console.log(''); + } else if (skipMedia) { + console.log('\nSkipping media download (--skip-media)'); + } + + writeFileSync('output/extraction-log.json', JSON.stringify(log, null, 2)); + console.log(`\nDone.`); + console.log(` Posts extracted: ${log.processed.length}`); + console.log(` Media downloaded: ${log.mediaDownloaded.length}`); + console.log(` Failures: ${log.failed.length}`); + if (log.failed.length) console.log(' See output/extraction-log.json for details'); +} + +main().catch(e => { console.error(e); process.exit(1); }); diff --git a/scripts/instagram/import.js b/scripts/instagram/import.js new file mode 100644 index 0000000..4e5e295 --- /dev/null +++ b/scripts/instagram/import.js @@ -0,0 +1,496 @@ +#!/usr/bin/env node +/** + * import.js — Step 3: Import Instagram content to WordPress.com + * + * Reads output/ from extract.js and publishes to WordPress.com via XML-RPC. + * Import order: media → posts + * + * Uses XML-RPC (wp.uploadFile, wp.newPost) because WordPress.com's REST API + * does not support write operations with application passwords. + * + * Usage: + * node scripts/import.js --site mysite.wordpress.com --user your-wpcom-user --token APP_PASSWORD + * node scripts/import.js --site mysite.wordpress.com --user your-wpcom-user --token APP_PASSWORD --dry-run + * + * Options: + * --site WordPress.com site domain (e.g. 
mysite.wordpress.com) + * --user WordPress.com username that owns the application password + * --token Application password from wordpress.com/me/security/application-passwords + * --dry-run Show what would be imported without actually doing it + * --only Only import 'media', 'pages', or 'posts' + * + * Getting your application password: + * 1. Go to wordpress.com/me/security/application-passwords + * 2. Create a new application password + * 3. Copy the password and pass it as --token + */ + +import { readFileSync, readdirSync, existsSync } from 'fs'; +import { basename } from 'path'; + +const args = process.argv.slice(2); +function getArg(name) { + const i = args.indexOf(name); + return i !== -1 ? args[i + 1] : null; +} + +const site = getArg('--site'); +const token = getArg('--token'); +const user = getArg('--user'); +const dryRun = args.includes('--dry-run'); +const only = getArg('--only'); +const postType = getArg('--post-type'); // e.g. 'photo' for a custom post type + +if (!site || !token || !user) { + console.error('Usage: node scripts/import.js --site --user --token '); + console.error(' Get your app password at: wordpress.com/me/security/application-passwords'); + process.exit(1); +} + +const xmlRpcUrl = `https://${site}/xmlrpc.php`; +const restApiBase = `https://public-api.wordpress.com/rest/v1.1/sites/${site}`; + +// ─── XML-RPC helpers ──────────────────────────────────────── + +function escapeXml(value) { + return String(value) + .replaceAll('&', '&') + .replaceAll('<', '<') + .replaceAll('>', '>') + .replaceAll('"', '"') + .replaceAll("'", '''); +} + +function xmlValue(value) { + if (value == null) return ''; + if (Buffer.isBuffer(value)) return `${value.toString('base64')}`; + if (value instanceof Date) { + // XML-RPC dateTime.iso8601 must NOT include timezone suffix + const iso = value.toISOString().replace(/[-:]/g, '').replace(/\.\d{3}Z$/, ''); + return `${iso}`; + } + if (typeof value === 'boolean') return `${value ? 
1 : 0}`; + if (typeof value === 'number') return Number.isInteger(value) ? `${value}` : `${value}`; + if (Array.isArray(value)) { + return `${value.map(item => `${xmlValue(item)}`).join('')}`; + } + if (typeof value === 'object') { + return `${Object.entries(value) + .filter(([, item]) => item !== undefined) + .map(([key, item]) => `${escapeXml(key)}${xmlValue(item)}`) + .join('')}`; + } + return `${escapeXml(value)}`; +} + +function parseXmlRpcResponse(xml) { + const faultMatch = xml.match(/[\s\S]*?faultString<\/name>\s*([\s\S]*?)<\/string><\/value>[\s\S]*?<\/fault>/); + if (faultMatch) throw new Error(faultMatch[1]); + + const struct = {}; + const namedStringRegex = /([^<]+)<\/name>\s*([\s\S]*?)<\/string><\/value>/g; + const namedIntRegex = /([^<]+)<\/name>\s*<(?:int|i4)>([\s\S]*?)<\/(?:int|i4)><\/value>/g; + let member; + while ((member = namedStringRegex.exec(xml))) struct[member[1]] = member[2]; + while ((member = namedIntRegex.exec(xml))) struct[member[1]] = Number(member[2]); + if (Object.keys(struct).length) return struct; + + const stringMatch = xml.match(/([\s\S]*?)<\/string>/); + if (stringMatch) return stringMatch[1]; + const intMatch = xml.match(/<(?:int|i4)>([\s\S]*?)<\/(?:int|i4)>/); + if (intMatch) return Number(intMatch[1]); + + return null; +} + +async function xmlRpcCall(methodName, params) { + const body = `${methodName}${params + .map(param => `${xmlValue(param)}`) + .join('')}`; + + const res = await fetch(xmlRpcUrl, { + method: 'POST', + headers: { 'Content-Type': 'text/xml' }, + body, + }); + const text = await res.text(); + if (!res.ok) throw new Error(`${methodName} → ${res.status}: ${text}`); + return parseXmlRpcResponse(text); +} + +let cachedBlogId = null; +async function getBlogId() { + if (cachedBlogId) return cachedBlogId; + const res = await fetch(restApiBase); + const data = await res.json().catch(() => ({})); + if (!data.ID) throw new Error(`Could not determine site ID for ${site}`); + cachedBlogId = data.ID; + return 
cachedBlogId; +} + +function guessMimeType(filename) { + const ext = filename.split('.').pop().toLowerCase(); + const types = { jpg: 'image/jpeg', jpeg: 'image/jpeg', png: 'image/png', gif: 'image/gif', webp: 'image/webp', mp4: 'video/mp4', mov: 'video/quicktime' }; + return types[ext] || 'application/octet-stream'; +} + +// Extract clean text content from accessibility tree nodes +function buildContentFromAccessibility(nodes) { + if (!nodes?.length) return ''; + const blocks = []; + for (const node of nodes) { + if (!node.name) continue; + if (node.role === 'heading') { + // Guess heading level from name length (crude but works as fallback) + blocks.push(`

    ${node.name}

    `); + } else if (['paragraph', 'StaticText', 'article', 'section'].includes(node.role)) { + blocks.push(`

    ${node.name}

    `); + } else if (node.role === 'img' && node.description) { + blocks.push(``); + } + } + return blocks.join('\n'); +} + +// Extract the best available content from a page JSON file +function extractContent(pageData) { + // Instagram posts — caption is the content, images are media attachments + if (pageData.platform === 'instagram' || pageData.shortcode) { + return buildInstagramContent(pageData); + } + + // Priority: Wix blog API response > JSON-LD > accessibility tree + for (const call of pageData.apiCalls || []) { + // Blog post body is typically in post.content or post.richContent + const body = call.data?.post?.content?.plainText || + call.data?.post?.richContent || + call.data?.content?.plainText; + if (body) return typeof body === 'string' ? `

    ${body}

    ` : JSON.stringify(body); + } + + // JSON-LD article body + const article = pageData.globals?.jsonLd?.find(j => j['@type'] === 'Article' || j['@type'] === 'BlogPosting'); + if (article?.articleBody) return `

    ${article.articleBody}

    `; + + // Fallback to accessibility tree + return buildContentFromAccessibility(pageData.accessibility); +} + +// Build WordPress block content from an Instagram post +function buildInstagramContent(pageData) { + const blocks = []; + const media = pageData.media || []; + const imageMedia = media.filter(m => m.type !== 'video' || !m.videoUrl); + const videoMedia = media.filter(m => m.type === 'video' && m.videoUrl); + + // Use a gallery block for carousels (multiple images), single image block otherwise + if (imageMedia.length > 1) { + const galleryImages = imageMedia.map(item => { + const src = item.localFile || item.displayUrl; + if (!src) return ''; + const alt = item.accessibilityCaption || ''; + return `\n
    ${alt.replace(/
    \n`; + }).filter(Boolean); + blocks.push(`\n\n`); + } else if (imageMedia.length === 1) { + const item = imageMedia[0]; + const src = item.localFile || item.displayUrl; + if (src) { + const alt = item.accessibilityCaption || pageData.caption?.slice(0, 125) || ''; + blocks.push(`\n
    ${alt.replace(/
    \n`); + } + } + + // Videos as separate blocks (can't go in gallery) + for (const item of videoMedia) { + blocks.push(`\n
    \n`); + } + + // Caption as a paragraph + if (pageData.caption) { + // Convert @mentions and #hashtags to links + let caption = pageData.caption + .replace(/@(\w+)/g, '@$1') + .replace(/#(\w+)/g, '#$1'); + blocks.push(`\n

    ${caption}

    \n`); + } + + // Link to original Instagram post + if (pageData.shortcode) { + const igUrl = `https://www.instagram.com/p/${pageData.shortcode}/`; + blocks.push(`\n

    Originally posted on Instagram

    \n`); + } + + return blocks.join('\n\n'); +} + +function extractMeta(pageData) { + // Instagram posts + if (pageData.platform === 'instagram' || pageData.shortcode) { + const caption = pageData.caption || ''; + // Title: first line of caption, or first 60 chars, or shortcode + const title = caption.split('\n')[0]?.slice(0, 80) || `Instagram ${pageData.shortcode}`; + return { + title, + description: caption.slice(0, 300), + featuredImageUrl: pageData.media?.[0]?.displayUrl || null, + publishDate: pageData.date || null, + modifiedDate: null, + slug: `ig-${pageData.shortcode}`, + }; + } + + const meta = pageData.globals?.meta || {}; + const jsonLd = pageData.globals?.jsonLd || []; + const article = jsonLd.find(j => ['Article', 'BlogPosting', 'WebPage'].includes(j['@type'])); + + return { + title: meta.ogTitle || meta.title || article?.headline || pageData.slug, + description: meta.description || meta.ogDescription || article?.description || '', + featuredImageUrl: meta.ogImage || article?.image?.url || null, + publishDate: article?.datePublished || null, + modifiedDate: article?.dateModified || null, + slug: pageData.slug, + }; +} + +async function uploadMedia(filePath, filename) { + if (dryRun) { + console.log(` [dry-run] Would upload: ${filename}`); + return { id: 0, source_url: `https://example.com/wp-content/uploads/${filename}` }; + } + + const fileBuffer = readFileSync(filePath); + const blogId = await getBlogId(); + const result = await xmlRpcCall('wp.uploadFile', [ + blogId, + user, + token, + { + name: filename, + type: guessMimeType(filename), + bits: fileBuffer, + overwrite: true, + }, + ]); + + return { + id: result.id || 0, + source_url: result.url, + }; +} + +async function importMedia() { + if (!existsSync('output/media')) { console.log('No media folder found.'); return {}; } + + const files = readdirSync('output/media'); + console.log(`\nUploading ${files.length} media files...`); + + const mediaMap = {}; // original filename → { url, id } + for 
(const file of files) { + process.stdout.write(` ${file}... `); + try { + const result = await uploadMedia(`output/media/${file}`, file); + mediaMap[file] = { url: result.source_url, id: result.id }; + console.log(`✓ ${result.source_url}`); + } catch (e) { + console.log(`✗ ${e.message}`); + } + } + return mediaMap; +} + +async function importPage(pageData, mediaMap) { + const meta = extractMeta(pageData); + let content = extractContent(pageData); + + for (const [filename, media] of Object.entries(mediaMap)) { + const url = typeof media === 'string' ? media : media.url; + content = content.replaceAll(filename, url); + } + + if (dryRun) { + console.log(` [dry-run] Would create page: ${meta.title} (${meta.slug})`); + return { id: 0, link: '#' }; + } + + const blogId = await getBlogId(); + const id = await xmlRpcCall('wp.newPost', [ + blogId, user, token, + { + post_type: 'page', + post_status: 'draft', + post_title: meta.title, + post_content: content, + post_excerpt: meta.description, + wp_slug: meta.slug, + }, + ]); + + return { id, link: `https://${site}/wp-admin/post.php?post=${id}&action=edit` }; +} + +async function importPost(pageData, mediaMap) { + const meta = extractMeta(pageData); + let content = extractContent(pageData); + + // Replace local file paths with uploaded WordPress URLs + for (const [filename, media] of Object.entries(mediaMap)) { + const url = typeof media === 'string' ? 
media : media.url; + content = content.replaceAll(filename, url); + } + + if (dryRun) { + console.log(` [dry-run] Would create post: ${meta.title} (${meta.slug})`); + return { id: 0, link: '#' }; + } + + const blogId = await getBlogId(); + + // Format date as "YYYY-MM-DD HH:MM:SS" string — WordPress.com ignores + // dateTime.iso8601 typed values but parses string dates correctly + let postDate; + if (meta.publishDate) { + const d = new Date(meta.publishDate); + postDate = `${d.getUTCFullYear()}-${String(d.getUTCMonth()+1).padStart(2,'0')}-${String(d.getUTCDate()).padStart(2,'0')} ${String(d.getUTCHours()).padStart(2,'0')}:${String(d.getUTCMinutes()).padStart(2,'0')}:${String(d.getUTCSeconds()).padStart(2,'0')}`; + } + + // Find featured image: first media item's WordPress media ID + let featuredImageId; + if (pageData.media?.[0]?.localFile) { + const firstMediaFile = basename(pageData.media[0].localFile); + const mediaEntry = mediaMap[firstMediaFile]; + if (mediaEntry?.id) featuredImageId = mediaEntry.id; + } + + const postData = { + post_type: postType || 'post', + post_status: 'publish', + post_title: meta.title, + post_content: content, + post_excerpt: meta.description, + wp_slug: meta.slug, + post_date: postDate, + post_thumbnail: featuredImageId || undefined, + }; + + const id = await xmlRpcCall('wp.newPost', [blogId, user, token, postData]); + + return { id, link: `https://${site}/wp-admin/post.php?post=${id}&action=edit` }; +} + +async function main() { + if (dryRun) console.log('[DRY RUN — no changes will be made]\n'); + + if (!existsSync('output/pages')) { + console.error('No output/pages directory found. 
Run extract.js first.'); + process.exit(1); + } + + const pageFiles = readdirSync('output/pages').filter(f => f.endsWith('.json')); + console.log(`Found ${pageFiles.length} extracted pages`); + + // Detect if this is an Instagram import + let isInstagram = false; + if (existsSync('output/inventory.json')) { + const inventory = JSON.parse(readFileSync('output/inventory.json', 'utf8')); + isInstagram = inventory.platform === 'instagram'; + } + + // Determine content type from inventory if available + let typeMap = {}; + if (existsSync('output/inventory.json')) { + const inventory = JSON.parse(readFileSync('output/inventory.json', 'utf8')); + for (const item of inventory.urls) { + if (isInstagram) { + // Instagram uses shortcodes as filenames + typeMap[item.shortcode] = item.type || 'photo'; + } else { + const slug = new URL(item.url).pathname.replace(/^\//, '').replace(/\//g, '--') || 'homepage'; + typeMap[slug] = item.type; + } + } + } + + const urlMap = []; // old URL → new WP URL, for redirect map + + // Step 1: Upload media + let mediaMap = {}; + if (!only || only === 'media') { + mediaMap = await importMedia(); + } + + // Instagram: all items are posts (not pages) + if (isInstagram) { + console.log(`\nImporting ${pageFiles.length} Instagram posts${postType ? ` as "${postType}"` : ''}...`); + for (const file of pageFiles) { + const pageData = JSON.parse(readFileSync(`output/pages/${file}`, 'utf8')); + const shortcode = file.replace('.json', ''); + process.stdout.write(` ${shortcode}... 
`); + try { + const result = await importPost(pageData, mediaMap); + console.log(`✓ ${result.link}`); + if (pageData.sourceUrl) urlMap.push({ old: pageData.sourceUrl, new: result.link }); + } catch (e) { + console.log(`✗ ${e.message}`); + } + } + } else { + // Step 2: Import pages + const pages = pageFiles.filter(f => { + const slug = f.replace('.json', ''); + return !typeMap[slug] || typeMap[slug] === 'page' || typeMap[slug] === 'homepage'; + }); + + if (!only || only === 'pages') { + console.log(`\nImporting ${pages.length} pages...`); + for (const file of pages) { + const pageData = JSON.parse(readFileSync(`output/pages/${file}`, 'utf8')); + const slug = file.replace('.json', ''); + process.stdout.write(` ${slug}... `); + try { + const result = await importPage(pageData, mediaMap); + console.log(`✓ ${result.link}`); + if (pageData.sourceUrl) urlMap.push({ old: pageData.sourceUrl, new: result.link }); + } catch (e) { + console.log(`✗ ${e.message}`); + } + } + } + + // Step 3: Import posts + const posts = pageFiles.filter(f => { + const slug = f.replace('.json', ''); + return typeMap[slug] === 'blog-post'; + }); + + if (!only || only === 'posts') { + console.log(`\nImporting ${posts.length} blog posts...`); + for (const file of posts) { + const pageData = JSON.parse(readFileSync(`output/pages/${file}`, 'utf8')); + const slug = file.replace('.json', ''); + process.stdout.write(` ${slug}... 
`); + try { + const result = await importPost(pageData, mediaMap); + console.log(`✓ ${result.link}`); + if (pageData.sourceUrl) urlMap.push({ old: pageData.sourceUrl, new: result.link }); + } catch (e) { + console.log(`✗ ${e.message}`); + } + } + } + } + + // Output redirect map + if (urlMap.length) { + const { writeFileSync } = await import('fs'); + writeFileSync('output/redirect-map.json', JSON.stringify(urlMap, null, 2)); + console.log(`\nRedirect map written to output/redirect-map.json`); + console.log('Use this to set up 301 redirects from your old Wix URLs to WordPress.'); + } + + console.log('\nImport complete. All content created as drafts — review in WordPress admin before publishing.'); + console.log(`https://${site}/wp-admin/`); +} + +main().catch(e => { console.error(e); process.exit(1); }); diff --git a/tests/instagram.test.js b/tests/instagram.test.js new file mode 100644 index 0000000..8a911cd --- /dev/null +++ b/tests/instagram.test.js @@ -0,0 +1,444 @@ +#!/usr/bin/env node +/** + * tests/instagram.test.js — Unit tests for Instagram extraction and import + * + * Verifies data transformation logic without requiring a live Instagram + * session or WordPress site. Uses Node's built-in test runner (Node 18+). + * + * Usage: + * node --test tests/instagram.test.js + */ + +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; + +// ─── Test fixtures ────────────────────────────────────────── + +const singlePhotoPost = { + shortcode: 'ABC123', + type: 'photo', + date: '2023-06-15T14:30:00.000Z', + caption: 'Hello @world! 
#sunset #photography', + media: [ + { + type: 'photo', + displayUrl: 'https://scontent.cdninstagram.com/v/t51.29350-15/12345_67890.jpg', + localFile: 'output/media/ABC123_0.jpg', + accessibilityCaption: 'A sunset over the ocean', + }, + ], + locationName: 'Santa Monica', + carouselCount: null, +}; + +const carouselPost = { + shortcode: 'XYZ789', + type: 'carousel', + date: '2024-12-25T16:00:00.000Z', + caption: 'Holiday vibes! @santa #christmas', + carouselCount: 3, + media: [ + { + type: 'photo', + displayUrl: 'https://scontent.cdninstagram.com/v/t51.29350-15/111_222.jpg', + localFile: 'output/media/XYZ789_0.jpg', + accessibilityCaption: 'First slide', + }, + { + type: 'photo', + displayUrl: 'https://scontent.cdninstagram.com/v/t51.82787-15/333_444.jpg', + localFile: 'output/media/XYZ789_1.jpg', + accessibilityCaption: 'Second slide', + }, + { + type: 'photo', + displayUrl: 'https://scontent.cdninstagram.com/v/t51.82787-15/555_666.jpg', + localFile: 'output/media/XYZ789_2.jpg', + accessibilityCaption: 'Third slide', + }, + ], +}; + +const videoPost = { + shortcode: 'VID456', + type: 'video', + date: '2022-01-10T08:00:00.000Z', + caption: 'Check this out', + media: [ + { + type: 'video', + displayUrl: null, + videoUrl: 'https://scontent.cdninstagram.com/v/t50/video.mp4', + localFile: 'output/media/VID456_0.mp4', + }, + ], +}; + +const noCaptionPost = { + shortcode: 'NOCAP', + type: 'photo', + date: '2021-05-01T12:00:00.000Z', + caption: '', + media: [ + { + type: 'photo', + displayUrl: 'https://scontent.cdninstagram.com/v/t51.29350-15/999_888.jpg', + localFile: 'output/media/NOCAP_0.jpg', + }, + ], +}; + +// ─── Import the functions under test ──────────────────────── + +// We can't directly import from import.js since it has side effects +// (arg parsing, process.exit). Instead, we replicate the pure functions +// here and test them. In a real setup, these would be exported. 
/**
 * Derive WordPress post metadata from an extracted Instagram post.
 *
 * @param {object} pageData - Extracted post (shortcode, caption, date, media[]).
 * @returns {{title: string, description: string, featuredImageUrl: (string|null),
 *   publishDate: (string|null), modifiedDate: null, slug: string}|null}
 *   Metadata for Instagram data, or null when pageData is not an Instagram post.
 */
function extractMeta(pageData) {
  if (pageData.platform === 'instagram' || pageData.shortcode) {
    const caption = pageData.caption || '';
    // Title is the first caption line, capped at 80 UTF-16 units; an empty
    // first line falls through to the shortcode-based title.
    // NOTE(review): slice() counts UTF-16 units, so an emoji sitting on the
    // 80/300 boundary may be cut in half — confirm whether that matters.
    const title = caption.split('\n')[0]?.slice(0, 80) || `Instagram ${pageData.shortcode}`;
    return {
      title,
      description: caption.slice(0, 300),
      featuredImageUrl: pageData.media?.[0]?.displayUrl || null,
      publishDate: pageData.date || null,
      modifiedDate: null,
      slug: `ig-${pageData.shortcode}`,
    };
  }
  return null;
}

/**
 * Build Gutenberg block markup for an Instagram post: a gallery (several
 * images) or a single image, then one video block per video, the caption
 * with linked @mentions/#hashtags, and a link back to the original post.
 *
 * NOTE(review): the HTML inside these template literals was stripped from the
 * reviewed copy of this file. The markup below is rebuilt from the surviving
 * fragments (the `alt` escaping expression, the `instagram-source` class, the
 * link text, the block names in the test titles) — confirm the exact
 * attributes and class names against scripts/instagram/import.js.
 *
 * @param {object} pageData - Extracted post (media[], caption, shortcode).
 * @returns {string} Block markup, blocks separated by a blank line.
 */
function buildInstagramContent(pageData) {
  const blocks = [];
  const media = pageData.media || [];
  // A "video" item without a videoUrl is rendered as an image (its poster).
  const imageMedia = media.filter((m) => m.type !== 'video' || !m.videoUrl);
  const videoMedia = media.filter((m) => m.type === 'video' && m.videoUrl);

  // Only the alt text is attribute-escaped, and only its double quotes.
  const imageBlock = (src, alt) =>
    `<!-- wp:image -->\n<figure class="wp-block-image"><img src="${src}" alt="${alt.replaceAll('"', '&quot;')}"/></figure>\n<!-- /wp:image -->`;

  if (imageMedia.length > 1) {
    // NOTE(review): the gallery/single decision is made before items without
    // a usable src are dropped, so a carousel whose slides lack a src can
    // still yield a one-image or empty gallery.
    const galleryImages = imageMedia
      .map((item) => {
        const src = item.localFile || item.displayUrl;
        if (!src) return '';
        return imageBlock(src, item.accessibilityCaption || '');
      })
      .filter(Boolean);
    blocks.push(
      `<!-- wp:gallery {"linkTo":"none"} -->\n<figure class="wp-block-gallery has-nested-images columns-default is-cropped">\n${galleryImages.join('\n')}\n</figure>\n<!-- /wp:gallery -->`,
    );
  } else if (imageMedia.length === 1) {
    const item = imageMedia[0];
    const src = item.localFile || item.displayUrl;
    if (src) {
      // A lone image may borrow the start of the caption as its alt text.
      const alt = item.accessibilityCaption || pageData.caption?.slice(0, 125) || '';
      blocks.push(imageBlock(src, alt));
    }
  }

  for (const item of videoMedia) {
    // NOTE(review): src uses videoUrl because the surviving test expects the
    // output to contain "video.mp4", which only the fixture's videoUrl does.
    blocks.push(
      `<!-- wp:video -->\n<figure class="wp-block-video"><video controls src="${item.videoUrl}"></video></figure>\n<!-- /wp:video -->`,
    );
  }

  if (pageData.caption) {
    const caption = pageData.caption
      .replace(/@(\w+)/g, '<a href="https://www.instagram.com/$1/">@$1</a>')
      .replace(/#(\w+)/g, '<a href="https://www.instagram.com/explore/tags/$1/">#$1</a>');
    blocks.push(`<!-- wp:paragraph -->\n<p>${caption}</p>\n<!-- /wp:paragraph -->`);
  }

  if (pageData.shortcode) {
    const igUrl = `https://www.instagram.com/p/${pageData.shortcode}/`;
    blocks.push(
      `<!-- wp:paragraph {"className":"instagram-source"} -->\n<p class="instagram-source"><a href="${igUrl}">Originally posted on Instagram</a></p>\n<!-- /wp:paragraph -->`,
    );
  }

  return blocks.join('\n\n');
}

/**
 * Escape the five XML special characters. `&` is replaced first so the
 * entities produced by the later replacements are not escaped twice.
 *
 * @param {*} value - Any value; it is stringified first.
 * @returns {string} Text safe to embed in XML character data or attributes.
 */
function escapeXml(value) {
  return String(value)
    .replaceAll('&', '&amp;')
    .replaceAll('<', '&lt;')
    .replaceAll('>', '&gt;')
    .replaceAll('"', '&quot;')
    .replaceAll("'", '&apos;');
}

/**
 * Encode a JavaScript value as the inner XML of an XML-RPC `<value>` element.
 *
 * @param {*} value - null/undefined, Buffer, Date, boolean, number, array,
 *   plain object or string.
 * @returns {string} The typed XML-RPC element for the value.
 */
function xmlValue(value) {
  if (value == null) return '<nil/>';
  if (Buffer.isBuffer(value)) return `<base64>${value.toString('base64')}</base64>`;
  if (value instanceof Date) {
    // Compact form: strip dashes, colons, milliseconds and the trailing "Z".
    const iso = value.toISOString().replace(/[-:]/g, '').replace(/\.\d{3}Z$/, '');
    return `<dateTime.iso8601>${iso}</dateTime.iso8601>`;
  }
  if (typeof value === 'boolean') return `<boolean>${value ? 1 : 0}</boolean>`;
  if (typeof value === 'number') {
    return Number.isInteger(value) ? `<int>${value}</int>` : `<double>${value}</double>`;
  }
  if (Array.isArray(value)) {
    return `<array><data>${value.map((item) => `<value>${xmlValue(item)}</value>`).join('')}</data></array>`;
  }
  if (typeof value === 'object') {
    // Members whose value is undefined are omitted from the struct entirely.
    return `<struct>${Object.entries(value)
      .filter(([, item]) => item !== undefined)
      .map(([key, item]) => `<member><name>${escapeXml(key)}</name><value>${xmlValue(item)}</value></member>`)
      .join('')}</struct>`;
  }
  return `<string>${escapeXml(value)}</string>`;
}

/**
 * Flatten an Instagram GraphQL media node into the inventory record shape.
 *
 * @param {object} node - GraphQL node (id, shortcode, edge_* collections, …).
 * @returns {object} Flat record with type, ISO date, caption, URLs and counts.
 */
function extractPostMeta(node) {
  const caption = node.edge_media_to_caption?.edges?.[0]?.node?.text || '';
  return {
    id: node.id,
    shortcode: node.shortcode,
    // A sidecar wins over is_video: a carousel may itself contain videos.
    type: node.edge_sidecar_to_children ? 'carousel' : node.is_video ? 'video' : 'photo',
    timestamp: node.taken_at_timestamp,
    // taken_at_timestamp is multiplied by 1000 before being handed to Date.
    date: node.taken_at_timestamp ? new Date(node.taken_at_timestamp * 1000).toISOString() : null,
    caption,
    displayUrl: node.display_url,
    isVideo: !!node.is_video,
    videoUrl: node.video_url || null,
    accessibilityCaption: node.accessibility_caption || null,
    locationName: node.location?.name || null,
    likes: node.edge_media_preview_like?.count ?? null,
    comments: node.edge_media_to_comment?.count ?? null,
    carouselCount: node.edge_sidecar_to_children?.edges?.length || null,
    url: `https://www.instagram.com/p/${node.shortcode}/`,
  };
}

/**
 * Format a Date as a "YYYY-MM-DD HH:MM:SS" string using its UTC fields.
 *
 * @param {Date} date - The instant to format.
 * @returns {string} Zero-padded date-time string.
 */
function formatWordPressDate(date) {
  const pad = (n) => String(n).padStart(2, '0');
  const day = `${date.getUTCFullYear()}-${pad(date.getUTCMonth() + 1)}-${pad(date.getUTCDate())}`;
  const time = `${pad(date.getUTCHours())}:${pad(date.getUTCMinutes())}:${pad(date.getUTCSeconds())}`;
  return `${day} ${time}`;
}

// ─── Tests ──────────────────────────────────────────────────

describe('extractMeta', () => {
  it('extracts title from first line of caption', () => {
    const meta = extractMeta(singlePhotoPost);
    assert.equal(meta.title, 'Hello @world! #sunset #photography');
  });

  it('truncates long titles to 80 chars', () => {
    const post = { ...singlePhotoPost, caption: 'A'.repeat(100) + '\nsecond line' };
    const meta = extractMeta(post);
    assert.equal(meta.title.length, 80);
  });

  it('falls back to shortcode when no caption', () => {
    const meta = extractMeta(noCaptionPost);
    assert.equal(meta.title, 'Instagram NOCAP');
  });

  it('generates ig- prefixed slug', () => {
    const meta = extractMeta(singlePhotoPost);
    assert.equal(meta.slug, 'ig-ABC123');
  });

  it('preserves original publish date', () => {
    const meta = extractMeta(singlePhotoPost);
    assert.equal(meta.publishDate, '2023-06-15T14:30:00.000Z');
  });

  it('sets featured image URL from first media item', () => {
    const meta = extractMeta(singlePhotoPost);
    assert.ok(meta.featuredImageUrl.includes('cdninstagram.com'));
  });
});

// NOTE(review): several tests in this suite lost their bodies when the markup
// strings were stripped; the gallery, slide-count, mention, hashtag and video
// cases below are rebuilt to cover the same block types — confirm them
// against the original test file.
describe('buildInstagramContent', () => {
  it('produces wp:image block for single photo', () => {
    const content = buildInstagramContent(singlePhotoPost);
    assert.ok(content.includes('<!-- wp:image -->'));
    assert.ok(!content.includes('<!-- wp:gallery'));
  });

  it('produces wp:gallery block for carousel', () => {
    const content = buildInstagramContent(carouselPost);
    assert.ok(content.includes('<!-- wp:gallery'));
    assert.ok(content.includes('<!-- /wp:gallery -->'));
  });

  it('includes every carousel slide in the gallery', () => {
    const content = buildInstagramContent(carouselPost);
    assert.equal(content.split('<!-- wp:image -->').length - 1, 3);
    for (const item of carouselPost.media) {
      assert.ok(content.includes(item.localFile));
    }
  });

  it('links @mentions to Instagram profiles', () => {
    const content = buildInstagramContent(carouselPost);
    assert.ok(content.includes('<a href="https://www.instagram.com/santa/">@santa</a>'));
  });

  it('links #hashtags to Instagram tag pages', () => {
    const content = buildInstagramContent(carouselPost);
    assert.ok(content.includes('<a href="https://www.instagram.com/explore/tags/christmas/">#christmas</a>'));
  });

  it('produces wp:video block for video posts', () => {
    const content = buildInstagramContent(videoPost);
    assert.ok(content.includes('<!-- wp:video -->'));
    assert.ok(content.includes('video.mp4'));
  });

  it('handles posts with no caption', () => {
    const content = buildInstagramContent(noCaptionPost);
    // Should have image but no caption paragraph (empty caption)
    assert.ok(content.includes('<!-- wp:image -->'));
    // Source link should still be present
    assert.ok(content.includes('instagram-source'));
  });

  it('uses accessibility caption as alt text', () => {
    const content = buildInstagramContent(singlePhotoPost);
    assert.ok(content.includes('alt="A sunset over the ocean"'));
  });
});

describe('xmlValue', () => {
  it('encodes strings with XML escaping', () => {
    assert.equal(xmlValue('hello & <world>'), '<string>hello &amp; &lt;world&gt;</string>');
  });

  it('encodes integers', () => {
    assert.equal(xmlValue(42), '<int>42</int>');
  });

  it('encodes booleans', () => {
    assert.equal(xmlValue(true), '<boolean>1</boolean>');
    assert.equal(xmlValue(false), '<boolean>0</boolean>');
  });

  it('encodes dates WITHOUT trailing Z', () => {
    const d = new Date('2020-03-15T12:00:00.000Z');
    const result = xmlValue(d);
    assert.ok(result.includes('20200315T120000'));
    assert.ok(!result.includes('Z'));
  });

  it('encodes buffers as base64', () => {
    const buf = Buffer.from('hello');
    assert.equal(xmlValue(buf), '<base64>aGVsbG8=</base64>');
  });

  it('encodes null as nil', () => {
    assert.equal(xmlValue(null), '<nil/>');
  });

  it('encodes objects as structs', () => {
    const result = xmlValue({ name: 'test', count: 5 });
    assert.ok(result.includes('<struct>'));
    assert.ok(result.includes('<name>name</name>'));
    assert.ok(result.includes('<string>test</string>'));
    assert.ok(result.includes('<int>5</int>'));
  });

  it('skips undefined values in structs', () => {
    const result = xmlValue({ name: 'test', missing: undefined });
    assert.ok(!result.includes('missing'));
  });

  it('encodes arrays', () => {
    const result = xmlValue([1, 'two']);
    assert.ok(result.includes('<array>'));
    assert.ok(result.includes('<int>1</int>'));
    assert.ok(result.includes('<string>two</string>'));
  });
});

describe('extractPostMeta (discover)', () => {
  it('classifies photo posts', () => {
    const meta = extractPostMeta({
      id: '1', shortcode: 'TEST', display_url: 'http://example.com/img.jpg',
      taken_at_timestamp: 1686830000,
      edge_media_to_caption: { edges: [{ node: { text: 'Hello' } }] },
    });
    assert.equal(meta.type, 'photo');
    assert.equal(meta.caption, 'Hello');
  });

  it('classifies carousel posts', () => {
    const meta = extractPostMeta({
      id: '2', shortcode: 'CAR', display_url: 'http://example.com/img.jpg',
      taken_at_timestamp: 1686830000,
      edge_sidecar_to_children: { edges: [{ node: {} }, { node: {} }] },
      edge_media_to_caption: { edges: [] },
    });
    assert.equal(meta.type, 'carousel');
    assert.equal(meta.carouselCount, 2);
  });

  it('classifies video posts', () => {
    const meta = extractPostMeta({
      id: '3', shortcode: 'VID', display_url: 'http://example.com/img.jpg',
      taken_at_timestamp: 1686830000, is_video: true,
      video_url: 'http://example.com/video.mp4',
      edge_media_to_caption: { edges: [] },
    });
    assert.equal(meta.type, 'video');
    assert.ok(meta.isVideo);
    assert.equal(meta.videoUrl, 'http://example.com/video.mp4');
  });

  it('converts timestamp to ISO date', () => {
    const meta = extractPostMeta({
      id: '4', shortcode: 'DATE', display_url: 'http://example.com/img.jpg',
      taken_at_timestamp: 1686830000,
      edge_media_to_caption: { edges: [] },
    });
    assert.ok(meta.date.startsWith('2023-06-15'));
  });

  it('extracts location name', () => {
    const meta = extractPostMeta({
      id: '5', shortcode: 'LOC', display_url: 'http://example.com/img.jpg',
      taken_at_timestamp: 1686830000,
      location: { name: 'Central Park', id: '123' },
      edge_media_to_caption: { edges: [] },
    });
    assert.equal(meta.locationName, 'Central Park');
  });

  it('generates correct Instagram URL', () => {
    const meta = extractPostMeta({
      id: '6', shortcode: 'URL_TEST', display_url: 'http://example.com/img.jpg',
      taken_at_timestamp: 1686830000,
      edge_media_to_caption: { edges: [] },
    });
    assert.equal(meta.url, 'https://www.instagram.com/p/URL_TEST/');
  });
});

describe('date formatting for WordPress', () => {
  it('formats date as YYYY-MM-DD HH:MM:SS string', () => {
    const d = new Date('2023-06-15T14:30:00.000Z');
    assert.equal(formatWordPressDate(d), '2023-06-15 14:30:00');
  });

  it('pads single-digit months and days', () => {
    const d = new Date('2023-01-05T03:09:07.000Z');
    assert.equal(formatWordPressDate(d), '2023-01-05 03:09:07');
  });
});

describe('carousel deduplication logic', () => {
  it('deduplicates by Instagram media ID', () => {
    // Simulate what the extractor does: collect images across slides,
    // deduplicate by the numeric media ID prefix in the CDN URL
    const allImages = [
      { src: 'https://cdn.com/v/t51.29350-15/111_222.jpg', mediaId: '111' },
      { src: 'https://cdn.com/v/t51.82787-15/111_222.jpg', mediaId: '111' }, // same photo, different CDN path
      { src: 'https://cdn.com/v/t51.82787-15/333_444.jpg', mediaId: '333' },
      { src: 'https://cdn.com/v/t51.82787-15/555_666.jpg', mediaId: '555' },
    ];

    const seen = new Set();
    const deduped = [];
    for (const img of allImages) {
      if (!img.mediaId || seen.has(img.mediaId)) continue;
      seen.add(img.mediaId);
      deduped.push(img);
    }

    assert.equal(deduped.length, 3);
    assert.deepEqual(deduped.map((d) => d.mediaId), ['111', '333', '555']);
  });
});