diff --git a/README.zh-CN.md b/README.zh-CN.md index 624760a7..33afaeb1 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -154,6 +154,7 @@ npm install -g @jackwener/opencli@latest | **jd** | `item` | 浏览器 | | **linkedin** | `search` `timeline` | 浏览器 | | **reuters** | `search` | 浏览器 | +| **webofscience** | `smart-search` `basic-search` `author-search` `author-record` `citing-articles` `references` `record` | 浏览器 | | **smzdm** | `search` | 浏览器 | | **web** | `read` | 浏览器 | | **weibo** | `hot` `search` | 浏览器 | diff --git a/docs/.vitepress/config.mts b/docs/.vitepress/config.mts index 5b4a58cd..a5d15857 100644 --- a/docs/.vitepress/config.mts +++ b/docs/.vitepress/config.mts @@ -63,6 +63,7 @@ export default defineConfig({ { text: 'BOSS Zhipin', link: '/adapters/browser/boss' }, { text: 'Ctrip', link: '/adapters/browser/ctrip' }, { text: 'Reuters', link: '/adapters/browser/reuters' }, + { text: 'Web of Science', link: '/adapters/browser/webofscience' }, { text: 'SMZDM', link: '/adapters/browser/smzdm' }, { text: 'Jike', link: '/adapters/browser/jike' }, { text: 'Jimeng', link: '/adapters/browser/jimeng' }, diff --git a/docs/adapters/browser/webofscience.md b/docs/adapters/browser/webofscience.md new file mode 100644 index 00000000..073a2a1b --- /dev/null +++ b/docs/adapters/browser/webofscience.md @@ -0,0 +1,100 @@ +# Web of Science + +**Mode**: 🔐 Browser · **Domain**: `webofscience.clarivate.cn` + +## Commands + +| Command | Description | +|---------|-------------| +| `opencli webofscience smart-search` | Search Web of Science records from `woscc` or `alldb` through Smart Search | +| `opencli webofscience basic-search` | Search Web of Science through the Basic Search page | +| `opencli webofscience author-search` | Search Web of Science researcher profiles | +| `opencli webofscience author-record` | Fetch a Web of Science researcher author record by id or URL | +| `opencli webofscience citing-articles` | List articles citing a Web of Science record | +| `opencli 
webofscience references` | List cited references for a Web of Science record | +| `opencli webofscience record` | Fetch a full record by UT, DOI, or full-record URL | + +## Usage Examples + +```bash +# Quick start +opencli webofscience smart-search "machine learning" --limit 5 + +# Search across all databases +opencli webofscience smart-search "machine learning" --database alldb --limit 5 + +# Use the basic-search entrypoint +opencli webofscience basic-search "graph neural networks" --database woscc + +# Restrict basic-search to a specific field +opencli webofscience basic-search "machine learning" --field title +opencli webofscience basic-search "Yann LeCun" --field author +opencli webofscience basic-search "10.1016/j.patter.2024.101046" --field doi + +# Search researcher profiles +opencli webofscience author-search "Jane Doe" + +# Refine researcher profiles by claimed status and facets +opencli webofscience author-search "Yann LeCun" --claimed-status claimed --affiliation Meta +opencli webofscience author-search "Yann LeCun" --country USA --category "Computer Science" +opencli webofscience author-search "Yann LeCun" --author "Yann LeCUN" +opencli webofscience author-search "Yann LeCun" --award-year 2024 --award-category NSF + +# Fetch a full record by UT +opencli webofscience record WOS:001335131500001 + +# Fetch a full record by DOI from all databases +opencli webofscience record 10.1016/j.patter.2024.101046 --database alldb + +# Fetch author details by author-record id +opencli webofscience author-record 89895674 + +# Fetch citing articles or cited references +opencli webofscience citing-articles WOS:001335131500001 --limit 5 +opencli webofscience references WOS:001335131500001 --limit 5 + +# JSON output +opencli webofscience smart-search "graph neural networks" -f json + +# Verbose mode +opencli webofscience smart-search "causal inference" -v +``` + +## Output Fields + +- `rank` +- `title` +- `authors` +- `year` +- `source` +- `citations` +- `doi` +- `url` + 
+`author-search` returns `rank`, `name`, `details`, `affiliations`, `location`, `researcher_id`, `published_names`, `top_journals`, and the author profile URL. + +`author-search` supports researcher-result refine filters through `--claimed-status`, `--author`, `--affiliation`, `--country`, `--category`, `--award-year`, and `--award-category`. These accept the labels shown in the current results page facets; multi-value filters can be passed as comma- or semicolon-separated lists. + +`basic-search` supports `--field` with the Web of Science Basic Search field set, including `topic`, `all-fields`, `title`, `author`, `publication-titles`, `year-published`, `affiliation`, `funding-agency`, `publisher`, `publication-date`, `abstract`, `accession-number`, `address`, `author-identifiers`, `author-keywords`, `conference`, `document-type`, `doi`, `editor`, `grant-number`, `group-author`, `keyword-plus`, `language`, `pubmed-id`, and `web-of-science-categories`. + +`record` returns `field` / `value` rows, including title, authors, abstract, UT, DOI, document type, publication/indexing metadata, corresponding address, author addresses, email addresses, research areas, Web of Science categories, `authors_structured`, citation counts, full-text link labels/URLs, and the full-record URL when available. + +`author-record` returns `field` / `value` rows for researcher profile metadata, including name, display name, affiliations, location, ResearcherID, published names, subject categories, key metrics, co-authors, and the publications summary URL when available. + +`citing-articles` and `references` return the same structured list fields as `smart-search`, but scoped to a seed record's citation network. 
+ +## Prerequisites + +- Chrome running with access to your Web of Science institution/subscription +- [Browser Bridge extension](/guide/browser-bridge) installed + +## Notes + +- The adapter uses the Smart Search page, then replays the underlying `runQuerySearch` request for structured results. +- `basic-search` reuses the same structured search backend, but starts from the Basic Search page instead of Smart Search. +- `author-search` uses browser-driven page interaction for both the autocomplete search form and the researcher results refine facets. It supports the same visible filters exposed by the result page, including claimed status, author, affiliation, country/region, Web of Science categories, and award-related facets when Web of Science exposes them for the current result set. +- `author-record` uses the author profile page directly and extracts the fields that are only visible on the profile page. +- `citing-articles` and `references` navigate to the corresponding Web of Science summary pages, then replay the summary query through the in-page search state that Web of Science stores in browser storage. +- `record` performs an exact search first to establish a query session, then requests `getFullRecordByQueryId` for the matching document. +- `record` also opens the full-record page to enrich the output with page-only fields such as full-text links and publication metadata that are not always present in the structured API payload. +- Web of Science may trigger passive verification before the first search. The adapter retries once automatically when the initial session is not ready. diff --git a/docs/adapters/index.md b/docs/adapters/index.md index 7a82f177..28bfbf10 100644 --- a/docs/adapters/index.md +++ b/docs/adapters/index.md @@ -21,6 +21,7 @@ Run `opencli list` for the live registry. 
| **[boss](/adapters/browser/boss)** | `search` `detail` `recommend` `joblist` `greet` `batchgreet` `send` `chatlist` `chatmsg` `invite` `mark` `exchange` `resume` `stats` | 🔐 Browser | | **[ctrip](/adapters/browser/ctrip)** | `search` | 🔐 Browser | | **[reuters](/adapters/browser/reuters)** | `search` | 🔐 Browser | +| **[webofscience](/adapters/browser/webofscience)** | `smart-search` `basic-search` `author-search` `author-record` `citing-articles` `references` `record` | 🔐 Browser | | **[smzdm](/adapters/browser/smzdm)** | `search` | 🔐 Browser | | **[jike](/adapters/browser/jike)** | `feed` `search` `post` `topic` `user` `create` `comment` `like` `repost` `notifications` | 🔐 Browser | | **[jimeng](/adapters/browser/jimeng)** | `generate` `history` | 🔐 Browser | diff --git a/src/browser/daemon-client.ts b/src/browser/daemon-client.ts index 72dae06f..50e18f4d 100644 --- a/src/browser/daemon-client.ts +++ b/src/browser/daemon-client.ts @@ -112,7 +112,8 @@ export async function sendCommand( const isTransient = errMsg.includes('Extension disconnected') || errMsg.includes('Extension not connected') || errMsg.includes('attach failed') - || errMsg.includes('no longer exists'); + || errMsg.includes('no longer exists') + || errMsg.includes('Detached while handling command'); if (isTransient && attempt < maxRetries) { // Longer delay for extension recovery (service worker restart) await sleep(1500); @@ -140,4 +141,3 @@ export async function listSessions(): Promise { const result = await sendCommand('sessions'); return Array.isArray(result) ? 
/** URL the command is expected to open for author record 89895674. */
const AUTHOR_RECORD_URL = 'https://webofscience.clarivate.cn/wos/author/record/89895674';

/**
 * Creates an `IPage` test double.
 *
 * Every method is a vitest mock resolving to an inert value, except `evaluate`,
 * which resolves the supplied results one call at a time, in the given order.
 */
function createPageMock(evaluateResults: any[]): IPage {
  const evaluate = vi.fn();
  evaluateResults.forEach((queued) => {
    evaluate.mockResolvedValueOnce(queued);
  });

  const page: IPage = {
    goto: vi.fn().mockResolvedValue(undefined),
    evaluate,
    snapshot: vi.fn().mockResolvedValue(undefined),
    click: vi.fn().mockResolvedValue(undefined),
    typeText: vi.fn().mockResolvedValue(undefined),
    pressKey: vi.fn().mockResolvedValue(undefined),
    scrollTo: vi.fn().mockResolvedValue(undefined),
    getFormState: vi.fn().mockResolvedValue({ forms: [], orphanFields: [] }),
    wait: vi.fn().mockResolvedValue(undefined),
    waitForCapture: vi.fn().mockResolvedValue(undefined),
    tabs: vi.fn().mockResolvedValue([]),
    closeTab: vi.fn().mockResolvedValue(undefined),
    newTab: vi.fn().mockResolvedValue(undefined),
    selectTab: vi.fn().mockResolvedValue(undefined),
    networkRequests: vi.fn().mockResolvedValue([]),
    consoleMessages: vi.fn().mockResolvedValue([]),
    scroll: vi.fn().mockResolvedValue(undefined),
    autoScroll: vi.fn().mockResolvedValue(undefined),
    installInterceptor: vi.fn().mockResolvedValue(undefined),
    getInterceptedRequests: vi.fn().mockResolvedValue([]),
    getCookies: vi.fn().mockResolvedValue([]),
    screenshot: vi.fn().mockResolvedValue(''),
  };
  return page;
}

/** Fetches the registered command and asserts that it has a callable `func`. */
function loadRunnableCommand() {
  const command = getRegistry().get('webofscience/author-record');
  expect(command?.func).toBeTypeOf('function');
  return command!;
}

describe('webofscience author-record', () => {
  it('describes supported author-record identifiers in command help', () => {
    const idHelp = getRegistry()
      .get('webofscience/author-record')
      ?.args.find(arg => arg.name === 'id')
      ?.help;

    expect(idHelp).toContain('89895674');
    expect(idHelp).toContain('author-record URL');
  });

  it('extracts a structured researcher profile from selector-driven page data', async () => {
    const command = loadRunnableCommand();

    // One scrape payload is enough: the command returns as soon as a name is present.
    const scrapedProfile = {
      name: 'Yann LeCun',
      displayName: 'LeCun, Yann',
      affiliations: ['Meta FAIR', 'New York University'],
      location: 'NEW YORK CITY, NY, USA',
      researcherId: 'PQF-7882-2026',
      publishedNames: ['LECUN, Y', 'Yann LeCun'],
      subjectCategories: ['Computer Science', 'Artificial Intelligence'],
      coAuthors: ['Yoshua Bengio', 'Geoffrey Hinton'],
      metricsText: `147 Total documents
12 Web of Science Core Collection publications
135 Preprints
3 Awarded grants
64 H-Index
1989-2025 Publications
152345 Sum of Times Cited
87211 Citing Articles`,
      links: [
        { label: 'Web of Science Core Collection publications', url: 'https://webofscience.clarivate.cn/wos/woscc/general-summary/x' },
      ],
    };
    const page = createPageMock([scrapedProfile]);

    const rows = await command.func!(page, { id: '89895674' });

    expect(page.goto).toHaveBeenCalledWith(AUTHOR_RECORD_URL, { settleMs: 5000 });

    // The injected script must target the three profile components by tag name.
    const scrapeJs = vi.mocked(page.evaluate).mock.calls[0]?.[0];
    for (const selector of ['app-author-record-header', 'app-display-data', 'app-metrics-column']) {
      expect(scrapeJs).toContain(selector);
    }

    expect(rows).toEqual([
      { field: 'name', value: 'Yann LeCun' },
      { field: 'display_name', value: 'LeCun, Yann' },
      { field: 'affiliations', value: 'Meta FAIR; New York University' },
      { field: 'location', value: 'NEW YORK CITY, NY, USA' },
      { field: 'researcher_id', value: 'PQF-7882-2026' },
      { field: 'published_names', value: 'LECUN, Y; Yann LeCun' },
      { field: 'subject_categories', value: 'Computer Science; Artificial Intelligence' },
      { field: 'documents', value: '147' },
      { field: 'woscc_publications', value: '12' },
      { field: 'preprints', value: '135' },
      { field: 'awarded_grants', value: '3' },
      { field: 'h_index', value: '64' },
      { field: 'publications_range', value: '1989-2025' },
      { field: 'times_cited', value: '152345' },
      { field: 'citing_articles', value: '87211' },
      { field: 'co_authors', value: 'Yoshua Bengio; Geoffrey Hinton' },
      { field: 'publications_url', value: 'https://webofscience.clarivate.cn/wos/woscc/general-summary/x' },
      { field: 'url', value: AUTHOR_RECORD_URL },
    ]);
  });

  it('accepts an author record URL as input', async () => {
    const command = loadRunnableCommand();
    const page = createPageMock([
      { name: 'Yann LeCun', researcherId: 'PQF-7882-2026', metricsText: '', links: [] },
    ]);

    await command.func!(page, { id: AUTHOR_RECORD_URL });

    expect(page.goto).toHaveBeenCalledWith(AUTHOR_RECORD_URL, { settleMs: 5000 });
  });

  it('rejects unsupported author record identifiers', async () => {
    const command = loadRunnableCommand();
    const page = createPageMock([]);

    const attempt = command.func!(page, { id: 'not-a-record' });

    await expect(attempt).rejects.toThrow(ArgumentError);
  });

  it('throws EmptyResultError when the author record page contains no usable profile data', async () => {
    const command = loadRunnableCommand();
    const page = createPageMock([
      { name: '', researcherId: '', metricsText: '', links: [] },
    ]);

    const attempt = command.func!(page, { id: '89895674' });

    await expect(attempt).rejects.toThrow(EmptyResultError);
  });
});
/**
 * Collapses a scraped text fragment into one clean line: non-breaking spaces
 * become regular spaces, whitespace runs collapse to a single space, and the
 * ends are trimmed. Falsy input yields an empty string.
 */
function normalizeText(value: string): string {
  return String(value || '').replace(/\u00a0/g, ' ').replace(/\s+/g, ' ').trim();
}

/**
 * Splits a text body on newlines and returns the trimmed, non-empty lines.
 * Non-breaking spaces are converted to regular spaces first.
 */
function lines(body: string): string[] {
  return String(body || '')
    .replace(/\u00a0/g, ' ')
    .split('\n')
    .map(line => line.trim())
    .filter(Boolean);
}

/** Raw payload returned by the in-page author-record scrape; every field is optional. */
type AuthorRecordScrape = {
  name?: string;
  displayName?: string;
  affiliations?: string[];
  location?: string;
  researcherId?: string;
  publishedNames?: string[];
  subjectCategories?: string[];
  coAuthors?: string[];
  metricsText?: string;
  links?: Array<{ label?: string; url?: string }>;
};

/** Normalizes every value, drops empties and removes duplicates, keeping first-seen order. */
function unique(values: string[]): string[] {
  return Array.from(new Set(values.map(normalizeText).filter(Boolean)));
}

/** Returns the non-empty lines of the scraped metrics text. */
function metricLines(metricsText: string): string[] {
  return lines(metricsText);
}

/**
 * Removes grouping commas that sit between digits ("152,345" -> "152345").
 * Only commas followed by exactly three digits are removed, so ordinary
 * punctuation such as "Meta, 2024" is left alone.
 */
function stripThousandsSeparators(text: string): string {
  return text.replace(/(?<=\d),(?=\d{3}(?!\d))/g, '');
}

/**
 * Finds the first capture group matched by any of `patterns` in the metrics text.
 *
 * Candidates are tried in this order:
 *   1. each individual line, as scraped;
 *   2. the whole text collapsed to a single line (covers a value and its label
 *      that were rendered on separate lines);
 *   3. the same candidates again with thousands separators removed, so that
 *      "152,345 Sum of Times Cited" is recognized by a `(\d+)` pattern.
 *
 * Because the separator-stripped pass runs last, any input that produced a
 * value before still produces exactly the same value.
 *
 * @param metricsText Newline-separated metric text scraped from the page.
 * @param patterns Regular expressions whose first capture group is the metric value.
 * @returns The normalized captured value, or `''` when nothing matches.
 */
function extractMetric(metricsText: string, patterns: RegExp[]): string {
  const asScraped = [...metricLines(metricsText), normalizeText(metricsText)];
  const candidates = [...asScraped, ...asScraped.map(stripThousandsSeparators)];

  for (const candidate of candidates) {
    for (const pattern of patterns) {
      const match = candidate.match(pattern);
      if (match?.[1]) {
        return normalizeText(match[1]);
      }
    }
  }
  return '';
}

/** Total document count ("N Total documents", "Documents N" or "N Documents"). */
function documentsMetric(metricsText: string): string {
  return extractMetric(metricsText, [/^(\d+)\s+Total documents$/i, /^Documents\s+(\d+)$/i, /^(\d+)\s+Documents$/i]);
}

/** Count of Web of Science Core Collection publications. */
function wosccPublicationsMetric(metricsText: string): string {
  return extractMetric(metricsText, [/^(\d+)\s+Web of Science Core Collection publications$/i]);
}

/** Count of preprints. */
function preprintsMetric(metricsText: string): string {
  return extractMetric(metricsText, [/^(\d+)\s+Preprints$/i]);
}
/** Count of awarded grants. */
function awardedGrantsMetric(metricsText: string): string {
  return extractMetric(metricsText, [/^(\d+)\s+Awarded grants$/i]);
}

/** H-index value. The pattern is case-insensitive, so one pattern covers every casing. */
function hIndexMetric(metricsText: string): string {
  return extractMetric(metricsText, [/^(\d+)\s+H-Index$/i]);
}

/** Publication year range, e.g. "1989-2025". */
function publicationsRangeMetric(metricsText: string): string {
  return extractMetric(metricsText, [/^([0-9]{4}\s*-\s*[0-9]{4})\s+Publications$/i]);
}

/** Sum of times cited. */
function timesCitedMetric(metricsText: string): string {
  return extractMetric(metricsText, [/^(\d+)\s+Sum of Times Cited$/i]);
}

/** Number of citing articles. */
function citingArticlesMetric(metricsText: string): string {
  return extractMetric(metricsText, [/^(\d+)\s+Citing Articles$/i]);
}

/**
 * Reduces a header line to its trailing "CITY, ST, COUNTRY" part when the line
 * ends with one (upper-case, two-letter region, three-plus-letter country);
 * otherwise returns the normalized line unchanged.
 */
function normalizeLocation(value: string): string {
  const normalized = normalizeText(value);
  const tailMatch = normalized.match(/([A-Z][A-Z .'-]+,\s*[A-Z]{2},\s*[A-Z]{3,})$/);
  return normalizeText(tailMatch?.[1] || normalized);
}

/**
 * Turns whatever the in-page script returned into a fully normalized scrape:
 * text fields are whitespace-normalized, list fields are de-duplicated, and a
 * missing or non-object payload becomes a scrape with empty fields.
 */
function coerceAuthorRecordScrape(raw: unknown): AuthorRecordScrape {
  // The payload crosses the browser boundary, so nothing about its shape is trusted.
  const data = (raw && typeof raw === 'object' ? raw : {}) as AuthorRecordScrape;
  const list = (value: unknown): string[] => (Array.isArray(value) ? unique(value) : []);

  return {
    name: normalizeText(data.name || ''),
    displayName: normalizeText(data.displayName || ''),
    affiliations: list(data.affiliations),
    location: normalizeText(data.location || ''),
    researcherId: normalizeText(data.researcherId || ''),
    publishedNames: list(data.publishedNames),
    subjectCategories: list(data.subjectCategories),
    coAuthors: list(data.coAuthors),
    metricsText: String(data.metricsText || ''),
    links: Array.isArray(data.links) ? data.links : [],
  };
}

/**
 * Opens an author record page and scrapes the researcher profile from it.
 *
 * The page is loaded up to twice (the second time in a new tab when the page
 * object supports `newTab`). After each load the DOM is polled three times,
 * waiting 2, 4 and 6 seconds before the respective reads.
 *
 * Polling continues until a researcher name is present: the command rejects a
 * record without a name, so returning a half-rendered page (metrics but no
 * header yet) would turn a slow render into a failure. If no read ever yields
 * a name, the last partial scrape (ResearcherID or metrics only) is returned,
 * or an empty object when nothing was found at all.
 *
 * @param page Minimal page driver: navigation, waiting and script evaluation.
 * @param url Author record URL to open.
 * @returns The normalized scrape; `{}` when the page exposed no profile data.
 */
async function scrapeAuthorRecord(
  page: {
    goto: (url: string, options?: Record<string, unknown>) => Promise<unknown>;
    wait: (seconds: number) => Promise<unknown>;
    evaluate: (js: string) => Promise<unknown>;
    newTab?: () => Promise<unknown>;
  },
  url: string,
): Promise<AuthorRecordScrape> {
  // Runs inside the browser. Backslashes are doubled because this is a template
  // literal that is shipped to the page as source text.
  const readOnce = () => page.evaluate(`(() => {
    const normalize = (value) => String(value || '').replace(/\\u00a0/g, ' ').replace(/\\s+/g, ' ').trim();
    const splitLines = (value) => String(value || '')
      .replace(/\\u00a0/g, ' ')
      .split('\\n')
      .map(line => normalize(line))
      .filter(Boolean);
    const unique = (values) => Array.from(new Set(values.map(value => normalize(value)).filter(Boolean)));
    const header = document.querySelector('app-author-record-header');
    const headerTokenTexts = Array.from(header?.querySelectorAll('h1, h2, h3, h4, em, span, a') || [])
      .map(node => normalize(node.textContent || ''))
      .filter(Boolean);
    const headerTexts = unique(headerTokenTexts.length ? headerTokenTexts : splitLines(header?.innerText || ''));
    const displayBlocks = Array.from(document.querySelectorAll('app-display-data'))
      .map((block) => {
        const spanTexts = Array.from(block.querySelectorAll('span, a, li, p'))
          .map(node => normalize(node.textContent || ''))
          .filter(Boolean);
        const lineTexts = splitLines(block instanceof HTMLElement ? block.innerText : block?.textContent || '');
        return unique(spanTexts.length ? spanTexts : lineTexts);
      })
      .filter(block => block.length);
    const metricRoot = document.querySelector('app-metrics-column');
    const metricsText = unique([
      ...Array.from(metricRoot?.querySelectorAll('span, a, li, p, div, h1, h2, h3, h4') || []).map(node => normalize(node.textContent || '')),
      ...splitLines(metricRoot instanceof HTMLElement ? metricRoot.innerText : metricRoot?.textContent || ''),
    ]).join('\\n');
    const section = (label) => {
      const lower = String(label || '').toLowerCase();
      for (const block of displayBlocks) {
        const index = block.findIndex(line => line.toLowerCase() === lower);
        if (index >= 0) {
          return unique(block.slice(index + 1));
        }
      }
      return [];
    };
    const name = normalize(header?.querySelector('h1')?.textContent || headerTexts[0] || '');
    const displayNameLine = headerTexts.find(line => /^\\(.+\\)$/.test(line)) || '';
    const displayName = normalize(displayNameLine.replace(/^\\(|\\)$/g, ''));
    const normalizeLocation = (value) => {
      const normalized = normalize(value);
      const tailMatch = normalized.match(/([A-Z][A-Z .'-]+,\\s*[A-Z]{2},\\s*[A-Z]{3,})$/);
      return normalize(tailMatch?.[1] || normalized);
    };
    // Named locationText so the browser's global location object is not shadowed.
    const locationText = headerTexts
      .map(line => normalizeLocation(line))
      .find(line => /, [A-Z]{2}, [A-Z]{3,}$/.test(line) || /^[A-Z .'-]+, [A-Z]{2}, [A-Z]{3,}$/.test(line)) || '';
    // ResearcherIDs have a one- to three-letter prefix (A-1234-2024, PQF-7882-2026).
    const researcherIdPattern = /^[A-Z]{1,3}-\\d{4}-\\d{4}$/;
    const researcherId = headerTexts.find(line => researcherIdPattern.test(line))
      || displayBlocks.flat().find(line => researcherIdPattern.test(line))
      || '';
    const publishedNames = section('Published names');
    const organizations = section('Organizations');
    const subjectCategories = section('Subject Categories');
    const coAuthors = section('Co-authors');
    const affiliations = unique(organizations.length
      ? organizations
      : headerTexts.filter((line) => {
        return line
          && line !== name
          && line !== displayNameLine
          && line !== locationText
          && line !== researcherId
          && !/algorithmically generated author record/i.test(line)
          && line !== 'Web of Science ResearcherID';
      }));
    const links = Array.from(document.querySelectorAll('a'))
      .map((el) => ({
        label: normalize(el.textContent || el.getAttribute('aria-label') || ''),
        url: String((el instanceof HTMLAnchorElement ? el.href : el.getAttribute('href')) || '').trim(),
      }))
      .filter((item) => item.url);

    return {
      name,
      displayName,
      affiliations,
      location: locationText,
      researcherId,
      publishedNames,
      subjectCategories,
      coAuthors,
      metricsText,
      links,
    };
  })()`);

  let partial: AuthorRecordScrape | undefined;

  for (let round = 0; round < 2; round++) {
    // Second round: retry in a fresh tab when the driver can open one.
    if (round > 0 && typeof page.newTab === 'function') {
      await page.newTab();
    }
    await page.goto(url, { settleMs: 5000 });

    for (let attempt = 0; attempt < 3; attempt++) {
      await page.wait(2 + (attempt * 2));
      const scrape = coerceAuthorRecordScrape(await readOnce());
      if (scrape.name) {
        return scrape;
      }
      if (scrape.researcherId || scrape.metricsText) {
        // Page is rendering but the header is not there yet; keep polling.
        partial = scrape;
      }
    }
  }

  return partial ?? {};
}

cli({
  site: 'webofscience',
  name: 'author-record',
  description: 'Fetch a Web of Science researcher author record',
  domain: 'webofscience.clarivate.cn',
  strategy: Strategy.UI,
  browser: true,
  navigateBefore: false,
  args: [
    { name: 'id', positional: true, required: true, help: 'Numeric author record id or author-record URL, e.g. 89895674' },
  ],
  columns: ['field', 'value'],
  /**
   * Resolves the identifier to an author record URL, scrapes the profile and
   * returns it as `field` / `value` rows, omitting rows whose value is empty.
   *
   * @throws ArgumentError when the identifier is missing or not recognized.
   * @throws EmptyResultError when the page yields no researcher name.
   */
  func: async (page, kwargs) => {
    const rawId = String(kwargs.id ?? '').trim();
    if (!rawId) throw new ArgumentError('Author record identifier is required');

    const identifier = parseAuthorRecordIdentifier(rawId);
    if (!identifier) {
      throw new ArgumentError('Author record identifier must be a numeric id like 89895674 or an author-record URL');
    }

    const url = authorRecordUrl(identifier.id);
    const scraped = await scrapeAuthorRecord(page, url);

    // A record without a name is treated as "nothing scraped". This single check
    // is sufficient: whenever a name exists, the rows below cannot be empty.
    const name = normalizeText(scraped.name || '');
    if (!name) {
      throw new EmptyResultError('webofscience author-record', 'Try opening the author record in Chrome once, then run again.');
    }

    const metricsText = String(scraped.metricsText || '');
    const joined = (values: string[] | undefined): string => unique(values || []).join('; ');
    // The publications summary link is identified by its label and its target path.
    const publicationsUrl = (scraped.links || [])
      .find(link => /publications/i.test(link.label || '') && /general-summary/.test(link.url || ''))
      ?.url || '';

    return [
      { field: 'name', value: name },
      { field: 'display_name', value: normalizeText(scraped.displayName || '') },
      { field: 'affiliations', value: joined(scraped.affiliations) },
      { field: 'location', value: normalizeText(scraped.location || '') },
      { field: 'researcher_id', value: normalizeText(scraped.researcherId || '') },
      { field: 'published_names', value: joined(scraped.publishedNames) },
      { field: 'subject_categories', value: joined(scraped.subjectCategories) },
      { field: 'documents', value: documentsMetric(metricsText) },
      { field: 'woscc_publications', value: wosccPublicationsMetric(metricsText) },
      { field: 'preprints', value: preprintsMetric(metricsText) },
      { field: 'awarded_grants', value: awardedGrantsMetric(metricsText) },
      { field: 'h_index', value: hIndexMetric(metricsText) },
      { field: 'publications_range', value: publicationsRangeMetric(metricsText) },
      { field: 'times_cited', value: timesCitedMetric(metricsText) },
      { field: 'citing_articles', value: citingArticlesMetric(metricsText) },
      { field: 'co_authors', value: joined(scraped.coAuthors) },
      { field: 'publications_url', value: publicationsUrl },
      { field: 'url', value: url },
    ].filter(row => row.value);
  },
});
'researcher_id', value: researcherId }, + { field: 'published_names', value: publishedNames.join('; ') }, + { field: 'subject_categories', value: subjectCategories.join('; ') }, + { field: 'documents', value: documentsMetric(metricsText) }, + { field: 'woscc_publications', value: wosccPublicationsMetric(metricsText) }, + { field: 'preprints', value: preprintsMetric(metricsText) }, + { field: 'awarded_grants', value: awardedGrantsMetric(metricsText) }, + { field: 'h_index', value: hIndexMetric(metricsText) }, + { field: 'publications_range', value: publicationsRangeMetric(metricsText) }, + { field: 'times_cited', value: timesCitedMetric(metricsText) }, + { field: 'citing_articles', value: citingArticlesMetric(metricsText) }, + { field: 'co_authors', value: coAuthors.join('; ') }, + { field: 'publications_url', value: publicationsUrl }, + { field: 'url', value: url }, + ].filter(row => row.value); + + if (!rows.length || !name) { + throw new EmptyResultError('webofscience author-record', 'Try opening the author record in Chrome once, then run again.'); + } + + return rows; + }, +}); diff --git a/src/clis/webofscience/author-search.test.ts b/src/clis/webofscience/author-search.test.ts new file mode 100644 index 00000000..f334f457 --- /dev/null +++ b/src/clis/webofscience/author-search.test.ts @@ -0,0 +1,269 @@ +import { describe, expect, it, vi } from 'vitest'; +import type { IPage } from '../../types.js'; +import { ArgumentError, EmptyResultError } from '../../errors.js'; +import { getRegistry } from '../../registry.js'; +import { normalizeAuthorSearchFilters } from './author-search.js'; +import './author-search.js'; + +function createPageMock(evaluateResults: any[]): IPage { + const evaluate = vi.fn(); + for (const result of evaluateResults) { + evaluate.mockResolvedValueOnce(result); + } + + return { + goto: vi.fn().mockResolvedValue(undefined), + evaluate, + snapshot: vi.fn().mockResolvedValue(undefined), + click: vi.fn().mockResolvedValue(undefined), + typeText: 
vi.fn().mockResolvedValue(undefined), + pressKey: vi.fn().mockResolvedValue(undefined), + scrollTo: vi.fn().mockResolvedValue(undefined), + getFormState: vi.fn().mockResolvedValue({ forms: [], orphanFields: [] }), + wait: vi.fn().mockResolvedValue(undefined), + waitForCapture: vi.fn().mockResolvedValue(undefined), + tabs: vi.fn().mockResolvedValue([]), + closeTab: vi.fn().mockResolvedValue(undefined), + newTab: vi.fn().mockResolvedValue(undefined), + selectTab: vi.fn().mockResolvedValue(undefined), + networkRequests: vi.fn().mockResolvedValue([]), + consoleMessages: vi.fn().mockResolvedValue([]), + scroll: vi.fn().mockResolvedValue(undefined), + autoScroll: vi.fn().mockResolvedValue(undefined), + installInterceptor: vi.fn().mockResolvedValue(undefined), + getInterceptedRequests: vi.fn().mockResolvedValue([]), + getCookies: vi.fn().mockResolvedValue([]), + screenshot: vi.fn().mockResolvedValue(''), + }; +} + +describe('webofscience author-search', () => { + it('describes refine filters and multi-value input in command help', () => { + const cmd = getRegistry().get('webofscience/author-search'); + const claimedStatusArg = cmd?.args.find(arg => arg.name === 'claimed-status'); + const affiliationArg = cmd?.args.find(arg => arg.name === 'affiliation'); + + expect(claimedStatusArg?.help).toContain('claimed'); + expect(claimedStatusArg?.help).toContain('unclaimed'); + expect(claimedStatusArg?.help).toContain('result page'); + expect(affiliationArg?.help).toContain('semicolon-separated'); + expect(affiliationArg?.help).toContain('current result page'); + }); + + it('normalizes researcher refine filters from comma-separated CLI args', () => { + expect(normalizeAuthorSearchFilters({ + 'claimed-status': 'claimed', + author: 'Yann LeCun; LeCun, YANN', + affiliation: 'Meta AI, NYU', + country: 'USA, France', + category: 'Computer Science, Mathematics', + 'award-year': '2024, 2025', + 'award-category': 'NIH, NSF', + })).toEqual({ + claimedStatus: 'claimed', + authors: ['Yann 
LeCun', 'LeCun, YANN'], + affiliations: ['Meta AI', 'NYU'], + countries: ['USA', 'France'], + categories: ['Computer Science', 'Mathematics'], + awardYears: ['2024', '2025'], + awardCategories: ['NIH', 'NSF'], + }); + }); + + it('rejects unsupported claimed-status filters', () => { + expect(() => normalizeAuthorSearchFilters({ 'claimed-status': 'maybe' })).toThrow( + 'Unsupported Web of Science researcher claimed-status filter: maybe. Use one of: claimed, unclaimed', + ); + }); + + it('submits the author search page and maps researcher results', async () => { + const cmd = getRegistry().get('webofscience/author-search'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + true, + { + href: 'https://webofscience.clarivate.cn/wos/author/summary/demo/doc-relevance/1', + text: '6 results from Web of Science Researchers for: Jane Doe', + }, + [ + { + name: 'Jane Doe', + details: 'University of Testing Highly Cited Researcher', + affiliations: ['University of Testing'], + location: 'Boston, MA, USA', + researcher_id: 'ABC-1234-2026', + published_names: ['DOE, J', 'Jane Doe'], + top_journals: ['TEST JOURNAL', 'EXAMPLE LETTERS'], + url: 'https://webofscience.clarivate.cn/author/record/A-1234-2024', + }, + { + name: 'John Smith', + details: 'Institute of Examples', + url: 'https://webofscience.clarivate.cn/author/record/B-9999-2020', + }, + ], + ]); + + const result = await cmd!.func!(page, { query: 'jane doe', limit: 1 }); + + expect(page.goto).toHaveBeenCalledWith( + 'https://webofscience.clarivate.cn/wos/author/author-search', + { settleMs: 4000 }, + ); + const submitJs = vi.mocked(page.evaluate).mock.calls[0]?.[0]; + expect(submitJs).toContain(`input[name="' + name + '"]`); + expect(submitJs).toContain(`findInput('lastName', 'Last Name')`); + expect(submitJs).toContain(`findInput('firstName', 'First Name')`); + expect(submitJs).toContain('selectSuggestion'); + expect(submitJs).toContain('"doe"'); + expect(submitJs).toContain('"jane"'); + 
expect(result).toEqual([ + { + rank: 1, + name: 'Jane Doe', + details: 'University of Testing Highly Cited Researcher', + affiliations: ['University of Testing'], + location: 'Boston, MA, USA', + researcher_id: 'ABC-1234-2026', + published_names: ['DOE, J', 'Jane Doe'], + top_journals: ['TEST JOURNAL', 'EXAMPLE LETTERS'], + url: 'https://webofscience.clarivate.cn/author/record/A-1234-2024', + }, + ]); + }); + + it('applies claimed-status and researcher facet filters before scraping', async () => { + const cmd = getRegistry().get('webofscience/author-search'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + true, + { + href: 'https://webofscience.clarivate.cn/wos/author/summary/demo/doc-relevance/1', + text: '6 results from Web of Science Researchers for: Yann LeCun', + }, + true, + true, + true, + true, + true, + true, + true, + true, + [ + { + name: 'Yann LeCun (LeCun, Yann)', + details: 'Meta FAIR NEW YORK CITY, NY, USA', + url: 'https://webofscience.clarivate.cn/wos/author/record/89895674', + }, + ], + ]); + + await cmd!.func!(page, { + query: 'Yann LeCun', + 'claimed-status': 'claimed', + affiliation: 'Meta', + country: 'USA', + category: 'Computer Science', + author: 'Yann LECUN', + 'award-year': '2024', + 'award-category': 'NSF', + }); + + const filterJs = vi.mocked(page.evaluate).mock.calls[2]?.[0]; + expect(filterJs).toContain('"claimedStatus":"claimed"'); + expect(filterJs).toContain('"authors":["Yann LECUN"]'); + expect(filterJs).toContain('"affiliations":["Meta"]'); + expect(filterJs).toContain('"countries":["USA"]'); + expect(filterJs).toContain('"categories":["Computer Science"]'); + expect(filterJs).toContain('"awardYears":["2024"]'); + expect(filterJs).toContain('"awardCategories":["NSF"]'); + expect(filterJs).toContain('findCheckbox'); + expect(filterJs).toContain('findRefineButton'); + + const awardFilterJs = vi.mocked(page.evaluate).mock.calls[7]?.[0]; + expect(awardFilterJs).toContain('"name":"GRANTSAWARDED"'); + + 
const awardYearFilterJs = vi.mocked(page.evaluate).mock.calls[8]?.[0]; + expect(awardYearFilterJs).toContain('"awardYears":["2024"]'); + expect(awardYearFilterJs).toContain('"name":"AY"'); + + const awardCategoryFilterJs = vi.mocked(page.evaluate).mock.calls[9]?.[0]; + expect(awardCategoryFilterJs).toContain('"awardCategories":["NSF"]'); + expect(awardCategoryFilterJs).toContain('"name":"AC"'); + }); + + it('does not shadow the browser location object while scraping results', async () => { + const cmd = getRegistry().get('webofscience/author-search'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + true, + { + href: 'https://webofscience.clarivate.cn/wos/author/summary/demo/doc-relevance/1', + text: '2 results from Web of Science Researchers for: Yann LeCun', + }, + [], + ]); + + await expect(cmd!.func!(page, { query: 'Yann LeCun', limit: 1 })).rejects.toThrow(EmptyResultError); + + const scrapeJs = vi.mocked(page.evaluate).mock.calls[2]?.[0]; + expect(scrapeJs).not.toContain('const location ='); + expect(scrapeJs).toContain('new URL(href, location.origin)'); + }); + + it('keeps affiliation extraction separate from published names and top journals', async () => { + const cmd = getRegistry().get('webofscience/author-search'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + true, + { + href: 'https://webofscience.clarivate.cn/wos/author/summary/demo/doc-relevance/1', + text: '2 results from Web of Science Researchers for: Yann LeCun', + }, + [ + { + name: 'Yann LeCun (LeCun, Yann)', + details: 'Meta FAIR NEW YORK CITY, NY, USA', + affiliations: ['Meta FAIR'], + location: 'NEW YORK CITY, NY, USA', + researcher_id: 'PQF-7882-2026', + published_names: ['Yann LeCun'], + top_journals: ['ARXIV'], + url: 'https://webofscience.clarivate.cn/wos/author/record/89895674', + }, + ], + ]); + + const result = await cmd!.func!(page, { query: 'Yann LeCun', limit: 1 }); + + expect(result).toEqual([ + 
/**
 * Splits a free-form researcher name into the first/last-name pair used to
 * fill the Web of Science author-search form.
 *
 * Accepted shapes:
 * - `"Last, First"`: text before the first comma is the last name, everything
 *   after it is the first name.
 * - `"First [Middle] Last"`: the final whitespace-separated token is the last name.
 * - A single token: treated as the last name.
 *
 * @param query Raw name text as supplied by the caller.
 * @returns Trimmed name parts; both are empty strings when the query contains no name text.
 */
function splitAuthorQuery(query: string): { firstName: string; lastName: string } {
  const normalized = query.trim().replace(/\s+/g, ' ');
  let plainName = normalized;

  if (normalized.includes(',')) {
    const [rawLastName = '', ...rest] = normalized.split(',');
    const lastName = rawLastName.trim();
    // Trim every segment before joining so "LeCun, Yann, Jr" does not produce
    // a doubled space ("Yann  Jr") in the first name.
    const firstName = rest
      .map(segment => segment.trim())
      .filter(Boolean)
      .join(' ');

    if (lastName) {
      return { firstName, lastName };
    }

    // A leading comma (", Yann") leaves no last name. Fall back to plain
    // splitting of the remaining text rather than returning an empty last name.
    plainName = firstName;
  }

  const parts = plainName.split(' ').filter(Boolean);
  const lastName = parts.pop() ?? '';
  return { firstName: parts.join(' '), lastName };
}
/**
 * Splits a facet argument into individual values.
 *
 * A semicolon is used as the delimiter when the text contains one, so values
 * that themselves contain commas can still be passed; otherwise the text is
 * split on commas. Values are trimmed and empty entries are dropped.
 *
 * @param value Raw argument value; `null`/`undefined` produce an empty list.
 */
function splitCsv(value: unknown): string[] {
  const text = String(value ?? '');
  const delimiter = text.includes(';') ? ';' : ',';
  return text
    .split(delimiter)
    .map(item => item.trim())
    .filter(Boolean);
}

/**
 * Maps the user-supplied claimed-status value onto the canonical
 * `'claimed' | 'unclaimed'` pair.
 *
 * @param value Raw argument value.
 * @returns The canonical status, or `undefined` when no value was supplied
 *   (`null`, `undefined`, empty or whitespace-only text).
 * @throws ArgumentError when the value is not a recognised alias.
 */
function normalizeClaimedStatus(value: unknown): ClaimedStatus | undefined {
  if (value == null) return undefined;

  const normalized = String(value).trim().toLowerCase();
  // Whitespace-only input means "not provided", matching how splitCsv drops
  // blank entries for the other filters.
  if (!normalized) return undefined;

  if (['claimed', 'claim', 'profile', 'claimed-profile', 'claimed profiles', 'true'].includes(normalized)) {
    return 'claimed';
  }
  if (['unclaimed', 'unclaim', 'author-record', 'author records', 'unclaimed-profile', 'unclaimed profiles', 'false'].includes(normalized)) {
    return 'unclaimed';
  }

  throw new ArgumentError(
    `Unsupported Web of Science researcher claimed-status filter: ${String(value)}. Use one of: claimed, unclaimed`,
  );
}

/**
 * Builds the normalised filter set for an author search from raw CLI arguments.
 *
 * @param kwargs Command arguments keyed by option name (`claimed-status`,
 *   `author`, `affiliation`, `country`, `category`, `award-year`, `award-category`).
 * @returns Filters in which every list-valued facet is an array (possibly empty).
 * @throws ArgumentError when `claimed-status` holds an unsupported value.
 */
export function normalizeAuthorSearchFilters(kwargs: Record<string, unknown>): AuthorSearchFilters {
  return {
    claimedStatus: normalizeClaimedStatus(kwargs['claimed-status']),
    authors: splitCsv(kwargs.author),
    affiliations: splitCsv(kwargs.affiliation),
    countries: splitCsv(kwargs.country),
    categories: splitCsv(kwargs.category),
    awardYears: splitCsv(kwargs['award-year']),
    awardCategories: splitCsv(kwargs['award-category']),
  };
}

/** Returns `true` when at least one refine filter was requested. */
function hasAuthorSearchFilters(filters: AuthorSearchFilters): boolean {
  return Boolean(
    filters.claimedStatus
    || filters.authors.length
    || filters.affiliations.length
    || filters.countries.length
    || filters.categories.length
    || filters.awardYears.length
    || filters.awardCategories.length,
  );
}
/**
 * Fills the Web of Science author-search form inside the page and clicks the
 * search button.
 *
 * The in-page script sets the last-name input (and the first-name input when
 * both a first name and the input are present), tries to click a matching
 * autocomplete suggestion after each, and then clicks the search button. The
 * whole script is attempted up to three times, waiting two seconds between
 * attempts.
 *
 * @param page Minimal page contract: `evaluate` runs a script in the page, `wait` pauses for N seconds.
 * @param query Name parts to type into the form.
 * @throws The error from the final attempt when all three attempts fail
 *   (for example when the last-name input or the submit button is not found).
 */
async function submitAuthorSearch(
  page: {
    evaluate: (js: string) => Promise<unknown>;
    wait: (seconds: number) => Promise<unknown>;
  },
  query: { firstName: string; lastName: string },
): Promise<void> {
  let lastError: unknown;
  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      await page.evaluate(`(async () => {
        const queryParts = ${JSON.stringify(query)};
        const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));
        const normalize = (text) => String(text || '').replace(/\\s+/g, ' ').trim().toLowerCase();
        const isVisible = (el) => {
          const style = window.getComputedStyle(el);
          const rect = el.getBoundingClientRect();
          return style.display !== 'none'
            && style.visibility !== 'hidden'
            && rect.width > 0
            && rect.height > 0;
        };
        const setNativeValue = (input, value) => {
          if (!input) return false;
          const proto = Object.getPrototypeOf(input);
          const descriptor = Object.getOwnPropertyDescriptor(proto, 'value')
            || Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, 'value');
          descriptor?.set?.call(input, value);
          input.dispatchEvent(new Event('input', { bubbles: true }));
          input.dispatchEvent(new Event('change', { bubbles: true }));
          return true;
        };
        const findInput = (name, fallbackLabel) => {
          return document.querySelector('input[name="' + name + '"]')
            || Array.from(document.querySelectorAll('input[type="text"], input'))
              .find((el) => isVisible(el) && normalize(el.getAttribute('aria-label')) === normalize(fallbackLabel));
        };
        const selectSuggestion = async (value) => {
          if (!value) return true;
          const needle = normalize(value).toUpperCase();
          await sleep(350);
          const options = Array.from(document.querySelectorAll('[role="option"], mat-option'))
            .filter((el) => isVisible(el));
          const option = options.find((el) => normalize(el.textContent).toUpperCase() === needle)
            || options.find((el) => normalize(el.textContent).toUpperCase().startsWith(needle))
            || options.find((el) => normalize(el.textContent).toUpperCase().includes(needle));
          option?.click?.();
          await sleep(150);
          return Boolean(option);
        };
        const clickSearch = () => {
          const button = document.querySelector('button.search[type="submit"], button.search')
            || Array.from(document.querySelectorAll('button'))
              .find((el) => isVisible(el)
                && (
                  String(el.getAttribute('aria-label') || '').trim().toLowerCase() === 'search'
                  || normalize(el.textContent) === 'search'
                ));
          button?.click?.();
          return Boolean(button);
        };

        const lastNameInput = findInput('lastName', 'Last Name');
        const firstNameInput = findInput('firstName', 'First Name');
        if (!lastNameInput) throw new Error('Author search last-name input not found');

        setNativeValue(lastNameInput, queryParts.lastName);
        await selectSuggestion(queryParts.lastName);

        if (queryParts.firstName && firstNameInput) {
          setNativeValue(firstNameInput, queryParts.firstName);
          await selectSuggestion(queryParts.firstName);
        }

        if (!clickSearch()) {
          throw new Error('Author search submit button not found');
        }

        return true;
      })()`);
      return;
    } catch (error) {
      lastError = error;
      // Pause only between attempts; after the last one the error is rethrown.
      if (attempt < 2) {
        await page.wait(2);
      }
    }
  }

  throw lastError;
}
/**
 * Polls the page until it looks like an author-search results page.
 *
 * A page counts as "landed" when its URL contains `/wos/author/summary/` or
 * `/wos/author/record/`, or when its visible text contains the researcher
 * results hint or the phrase "Search results". The page is sampled up to 12
 * times with a one-second wait after each unsuccessful sample.
 *
 * @param page Minimal page contract: `evaluate` runs a script in the page, `wait` pauses for N seconds.
 * @returns The last sampled `{ href, text }`. The caller must re-check it,
 *   because the final sample is returned even when no condition matched.
 */
async function waitForAuthorSearchLanding(
  page: {
    evaluate: (js: string) => Promise<unknown>;
    wait: (seconds: number) => Promise<unknown>;
  },
): Promise<{ href: string; text: string }> {
  let lastState = { href: '', text: '' };

  for (let attempt = 0; attempt < 12; attempt++) {
    const snapshot = (await page.evaluate(`(() => ({
      href: String(location.href || ''),
      text: String(document.body.innerText || '').replace(/\\s+/g, ' ').trim().slice(0, 4000),
    }))()`)) as { href?: unknown; text?: unknown } | null | undefined;

    // Coerce defensively so a missing or odd evaluate result cannot crash the checks below.
    lastState = {
      href: String(snapshot?.href ?? ''),
      text: String(snapshot?.text ?? ''),
    };

    if (
      /\/wos\/author\/(summary|record)\//.test(lastState.href)
      // Compare against the constant itself. The previous single-quoted
      // '${AUTHOR_RESULTS_HINT}' was a literal string and never matched.
      || lastState.text.includes(AUTHOR_RESULTS_HINT)
      || /Search results/i.test(lastState.text)
    ) {
      return lastState;
    }

    await page.wait(1);
  }

  return lastState;
}
/**
 * Applies the requested refine filters on the author-search results page, one
 * facet group at a time.
 *
 * For every non-empty group an in-page script looks up the checkboxes whose
 * `name` attribute equals the group's checkbox name, ticks the ones matching
 * the requested values and clicks the nearest "Refine" button. When any award
 * filter is requested, the grants group (`GRANTSAWARDED`, checkbox name `FB`)
 * is applied before the award-year (`AY`) and award-category (`AC`) groups.
 *
 * @param page Minimal page contract: `evaluate` runs a script in the page, `wait` pauses for N seconds.
 * @param filters Normalised filters; list-valued facets must be arrays.
 * @throws ArgumentError when a requested value is absent from the current
 *   refine options, except for the `AY` and `AC` groups, which are skipped.
 */
async function applyAuthorSearchFilters(
  page: {
    evaluate: (js: string) => Promise<unknown>;
    wait: (seconds: number) => Promise<unknown>;
  },
  filters: AuthorSearchFilters,
): Promise<void> {
  type FilterGroup = { name: string; checkboxName: string; values: string[] };

  const groups: FilterGroup[] = [];
  const addGroup = (name: string, values: string[], checkboxName: string = name): void => {
    if (values.length) {
      groups.push({ name, checkboxName, values });
    }
  };

  addGroup('CLM', filters.claimedStatus ? [filters.claimedStatus] : []);
  addGroup('AU', filters.authors);
  addGroup('OG', filters.affiliations);
  addGroup('CU', filters.countries);
  addGroup('WC', filters.categories);
  // The grants group goes first among the award filters.
  addGroup('GRANTSAWARDED', (filters.awardYears.length || filters.awardCategories.length) ? ['YES'] : [], 'FB');
  addGroup('AY', filters.awardYears);
  addGroup('AC', filters.awardCategories);

  for (const group of groups) {
    const outcome = await page.evaluate(`(() => {
      const config = ${JSON.stringify({
        ...filters,
        currentGroup: group,
      })};
      const normalize = (text) => String(text || '').replace(/\\s+/g, ' ').trim().toLowerCase();
      const canonicalGroupValues = {
        CLM: {
          claimed: 'Claimed profiles',
          unclaimed: 'Unclaimed profiles',
        },
        GRANTSAWARDED: {
          yes: 'Includes awarded grants',
          true: 'Includes awarded grants',
        },
      };
      const checkboxName = config.currentGroup.checkboxName || config.currentGroup.name;
      const requested = Array.isArray(config.currentGroup.values) ? config.currentGroup.values : [];
      const checkboxes = Array.from(document.querySelectorAll('input[type="checkbox"][name="' + checkboxName + '"]'));
      const labelOf = (checkbox) => {
        const aria = String(checkbox.getAttribute('aria-label') || '').trim();
        return aria.split('. ')[0].trim() || aria.split('.')[0].trim();
      };
      const valueOf = (checkbox) => {
        const raw = String(checkbox.value || '').trim();
        const idx = raw.indexOf('.');
        return idx >= 0 ? raw.slice(idx + 1) : raw;
      };
      const desiredLabels = requested.map((value) => {
        const mapped = canonicalGroupValues[checkboxName]?.[String(value).trim().toLowerCase()];
        return mapped || value;
      });
      const matches = (checkbox, target) => {
        const label = normalize(labelOf(checkbox));
        const value = normalize(valueOf(checkbox));
        const needle = normalize(target);
        return label === needle
          || value === needle
          || label.includes(needle)
          || value.includes(needle);
      };
      const findCheckbox = (target) => {
        return checkboxes.find((checkbox) => matches(checkbox, target)) || null;
      };
      const isRefineButton = (button) => {
        const aria = normalize(button.getAttribute('aria-label'));
        const text = normalize(button.textContent);
        return aria.includes('refine button') || text === 'refine';
      };
      const findRefineButton = (checkbox) => {
        let node = checkbox?.parentElement || null;
        while (node && node !== document.body) {
          const button = Array.from(node.querySelectorAll('button')).find((candidate) => isRefineButton(candidate));
          if (button) return button;
          node = node.parentElement;
        }
        return Array.from(document.querySelectorAll('button')).find((candidate) => isRefineButton(candidate)) || null;
      };

      const missing = [];
      let refineButton = null;
      for (const desiredLabel of desiredLabels) {
        const checkbox = findCheckbox(desiredLabel);
        if (!checkbox) {
          missing.push(desiredLabel);
          continue;
        }
        refineButton ||= findRefineButton(checkbox);
        if (!checkbox.checked) {
          checkbox.click();
        }
      }

      if (!missing.length && refineButton) {
        refineButton.click();
      }

      return { missing, applied: desiredLabels.filter(label => !missing.includes(label)) };
    })()`);

    // The evaluate result is untyped, so read `missing` defensively.
    const reported = (outcome as { missing?: unknown } | null | undefined)?.missing;
    const missing: unknown[] = Array.isArray(reported) ? reported : [];

    if (missing.length) {
      // Award year/category facets are best-effort: skip the group without failing.
      if (['AY', 'AC'].includes(group.name)) {
        continue;
      }
      throw new ArgumentError(`Web of Science researcher filter not found in current refine options: ${missing.join(', ')}`);
    }

    // Pause after a successful refine before handling the next group.
    await page.wait(4);
  }
}
/** Shape of one researcher entry as produced by the in-page scraping script. */
type ScrapedAuthorRow = {
  name?: string;
  details?: string;
  affiliations?: string[];
  location?: string;
  researcher_id?: string;
  published_names?: string[];
  top_journals?: string[];
  url?: string;
};

/**
 * Scrapes researcher entries from the current author-search results page.
 *
 * The in-page script collects every link whose `href` contains
 * `/wos/author/record/`, de-duplicates by absolute URL, and reads the
 * surrounding card (when one is found) for affiliations, location,
 * ResearcherID, published names and top journals. Links without a name are
 * dropped; a link without an enclosing card still yields a row with empty details.
 *
 * @param page Minimal page contract: `evaluate` runs a script in the page.
 * @returns Rows in document order.
 */
async function scrapeAuthorResults(page: {
  evaluate: (js: string) => Promise<unknown>;
}): Promise<ScrapedAuthorRow[]> {
  return page.evaluate(`(() => {
    const normalize = (text) => String(text || '').replace(/\\s+/g, ' ').trim();
    const links = Array.from(document.querySelectorAll('a[href*="/wos/author/record/"]'));
    const seen = new Set();
    const results = [];
    for (const link of links) {
      const href = link.getAttribute('href') || '';
      const absolute = href.startsWith('http') ? href : new URL(href, location.origin).toString();
      if (seen.has(absolute)) continue;
      seen.add(absolute);
      const card = link.closest('app-author-record, mat-card, article, li, [role="listitem"], .mat-mdc-card, .card, div');
      const name = normalize(link.textContent);
      const text = normalize(card?.textContent || '');
      const details = text
        .replace(name, '')
        .slice(0, 280);
      const infoLines = Array.from(card?.querySelectorAll?.('p.font-size-14:not(.meta-item)') || [])
        .map((node) => normalize(node.textContent))
        .filter(Boolean);
      const affiliations = infoLines
        .filter((line) => line && !/, [A-Z]{2}, [A-Z]{3,}$/.test(line) && !/^[A-Z .'-]+, [A-Z]{2}, [A-Z]{3,}$/.test(line));
      const place = infoLines.find((line) => /, [A-Z]{2}, [A-Z]{3,}$/.test(line) || /^[A-Z .'-]+, [A-Z]{2}, [A-Z]{3,}$/.test(line)) || '';
      const metaLines = Array.from(card?.querySelectorAll?.('p.meta-item') || [])
        .map((node) => normalize(node.textContent));
      const researcherIdText = metaLines
        .find((line) => line.startsWith('Web of Science ResearcherID')) || '';
      const researcherId = researcherIdText.replace(/^Web of Science ResearcherID/i, '').trim();
      const publishedNames = Array.from(card?.querySelectorAll?.('.published-name span') || [])
        .map((node) => normalize(node.textContent))
        .filter(Boolean);
      const topJournalsText = metaLines
        .find((line) => line.startsWith('Top Journals')) || '';
      const topJournals = topJournalsText
        .replace(/^Top Journals/i, '')
        .split(',')
        .map((item) => normalize(item))
        .filter(Boolean);
      if (name) {
        results.push({
          name,
          details,
          affiliations,
          location: place,
          researcher_id: researcherId,
          published_names: publishedNames,
          top_journals: topJournals,
          url: absolute,
        });
      }
    }
    return results;
  })()`) as Promise<ScrapedAuthorRow[]>;
}
// Registers `opencli webofscience author-search`: submit the researcher name
// through the author-search form, optionally refine the results, then scrape
// the visible researcher entries.
cli({
  site: 'webofscience',
  name: 'author-search',
  description: 'Search Web of Science researcher profiles',
  domain: 'webofscience.clarivate.cn',
  strategy: Strategy.UI,
  browser: true,
  navigateBefore: false,
  args: [
    { name: 'query', positional: true, required: true, help: 'Researcher name, e.g. Yann LeCun or LeCun, Yann' },
    { name: 'claimed-status', required: false, help: 'Refine by claimed or unclaimed profiles from the current result page', choices: ['claimed', 'unclaimed'] },
    { name: 'author', required: false, help: 'Comma- or semicolon-separated author facet values from the current result page' },
    { name: 'affiliation', required: false, help: 'Comma- or semicolon-separated affiliation facet values from the current result page' },
    { name: 'country', required: false, help: 'Comma- or semicolon-separated country/region facet values from the current result page' },
    { name: 'category', required: false, help: 'Comma- or semicolon-separated Web of Science category facet values from the current result page' },
    { name: 'award-year', required: false, help: 'Comma- or semicolon-separated award year facet values from the current result page' },
    { name: 'award-category', required: false, help: 'Comma- or semicolon-separated award category facet values from the current result page' },
    { name: 'limit', type: 'int', default: 10, help: 'Max results' },
  ],
  columns: ['rank', 'name', 'affiliations', 'location', 'researcher_id', 'url'],
  func: async (page, kwargs) => {
    const query = String(kwargs.query ?? '').trim();
    if (!query) {
      throw new ArgumentError('Search query is required');
    }

    const filters = normalizeAuthorSearchFilters(kwargs);
    // Fall back to 10 for non-numeric or zero input, then clamp into 1..50.
    const requestedLimit = Number(kwargs.limit ?? 10) || 10;
    const limit = Math.max(1, Math.min(50, requestedLimit));
    const nameParts = splitAuthorQuery(query);

    await page.goto(AUTHOR_SEARCH_URL, { settleMs: 4000 });
    await page.wait(2);
    await submitAuthorSearch(page, { firstName: nameParts.firstName, lastName: nameParts.lastName });

    const landing = await waitForAuthorSearchLanding(page);
    const reachedResults = /\/wos\/author\/(summary|record)\//.test(landing.href)
      || landing.text.includes(AUTHOR_RESULTS_HINT);
    if (!reachedResults) {
      throw new CommandExecutionError(
        'Web of Science researcher search did not reach a results page',
        'The author search form may still be waiting for autocomplete confirmation or passive verification.',
      );
    }

    if (hasAuthorSearchFilters(filters)) {
      // Refine facets exist only on the summary page, not on a single record.
      const openedSingleRecord = /\/wos\/author\/record\//.test(landing.href);
      if (openedSingleRecord) {
        throw new CommandExecutionError(
          'Web of Science opened a single researcher record before refine filters could be applied',
          'Broaden the query or remove the refine filters.',
        );
      }

      await applyAuthorSearchFilters(page, filters);
    }

    const scraped = await scrapeAuthorResults(page);
    const candidates = Array.isArray(scraped) ? scraped : [];
    const rows = candidates
      .slice(0, limit)
      .map((entry, position) => ({
        rank: position + 1,
        name: entry.name ?? '',
        details: entry.details ?? '',
        affiliations: Array.isArray(entry.affiliations) ? entry.affiliations : [],
        location: entry.location ?? '',
        researcher_id: entry.researcher_id ?? '',
        published_names: Array.isArray(entry.published_names) ? entry.published_names : [],
        top_journals: Array.isArray(entry.top_journals) ? entry.top_journals : [],
        url: entry.url ?? '',
      }))
      .filter(row => row.name);

    if (rows.length === 0) {
      throw new EmptyResultError('webofscience author-search', 'Try a different researcher name or verify your Web of Science access in Chrome');
    }

    return rows;
  },
});
/**
 * Builds an `IPage` test double for the basic-search suite.
 *
 * `evaluate` resolves with the supplied values in order, one per call; every
 * other page method is a mock that resolves with a fixed placeholder value.
 */
function createPageMock(evaluateResults: any[]): IPage {
  const resolving = (value?: unknown) => vi.fn().mockResolvedValue(value);

  const evaluate = vi.fn();
  evaluateResults.forEach((queued) => {
    evaluate.mockResolvedValueOnce(queued);
  });

  return {
    goto: resolving(undefined),
    evaluate,
    snapshot: resolving(undefined),
    click: resolving(undefined),
    typeText: resolving(undefined),
    pressKey: resolving(undefined),
    scrollTo: resolving(undefined),
    getFormState: resolving({ forms: [], orphanFields: [] }),
    wait: resolving(undefined),
    waitForCapture: resolving(undefined),
    tabs: resolving([]),
    closeTab: resolving(undefined),
    newTab: resolving(undefined),
    selectTab: resolving(undefined),
    networkRequests: resolving([]),
    consoleMessages: resolving([]),
    scroll: resolving(undefined),
    autoScroll: resolving(undefined),
    installInterceptor: resolving(undefined),
    getInterceptedRequests: resolving([]),
    getCookies: resolving([]),
    screenshot: resolving(''),
  };
}
// Test suite for the `webofscience/basic-search` command and the shared
// helpers it uses. The page is always a mock: each entry passed to
// createPageMock is the resolved value of one successive `page.evaluate` call.
describe('webofscience basic-search', () => {
  it('describes common field choices and the default field in command help', () => {
    const cmd = getRegistry().get('webofscience/basic-search');
    const fieldArg = cmd?.args.find(arg => arg.name === 'field');

    expect(fieldArg?.help).toContain('Default: topic');
    expect(fieldArg?.help).toContain('topic');
    expect(fieldArg?.help).toContain('title');
    expect(fieldArg?.help).toContain('author');
    expect(fieldArg?.help).toContain('doi');
  });

  it('normalizes basic-search field aliases to official WOS tags', () => {
    // An omitted field falls back to the topic field.
    expect(normalizeBasicSearchField(undefined)).toMatchObject({
      key: 'topic',
      label: 'Topic',
      tag: 'TS',
    });
    expect(normalizeBasicSearchField('title')).toMatchObject({
      key: 'title',
      label: 'Title',
      tag: 'TI',
    });
    expect(normalizeBasicSearchField('all-fields')).toMatchObject({
      key: 'all_fields',
      label: 'All Fields',
      tag: 'ALL',
    });
    expect(normalizeBasicSearchField('web-of-science-categories')).toMatchObject({
      key: 'web_of_science_categories',
      label: 'Web of Science Categories',
      tag: 'WC',
    });
  });

  it('reports supported field examples when an unsupported field is passed', () => {
    expect(() => normalizeBasicSearchField('headline')).toThrow(
      'Unsupported Web of Science basic-search field: headline. Try one of: topic, title, author, doi, web-of-science-categories',
    );
  });

  it('builds rowText for basic-search fields using the mapped WOS tag', () => {
    expect(buildBasicSearchRowText('machine learning', 'topic')).toBe('TS=(machine learning)');
    expect(buildBasicSearchRowText('machine learning', 'title')).toBe('TI=(machine learning)');
    expect(buildBasicSearchRowText('10.1016/j.patter.2024.101046', 'doi')).toBe('DO=(10.1016/j.patter.2024.101046)');
    expect(buildBasicSearchRowText('Yann LeCun', 'author')).toBe('AU=(Yann LeCun)');
  });

  it('does not mistake history buttons for the actual search submit control', () => {
    expect(isWosSubmitControl({
      text: 'search Search',
      type: 'submit',
      ariaLabel: null,
    })).toBe(true);

    expect(isWosSubmitControl({
      text: 'View your search history',
      type: 'button',
      ariaLabel: null,
    })).toBe(false);
  });

  it('uses the basic-search route and maps structured records', async () => {
    const cmd = getRegistry().get('webofscience/basic-search');
    expect(cmd?.func).toBeTypeOf('function');

    // evaluate #1 resolves with the session info, evaluate #2 with the search events.
    const page = createPageMock([
      { sid: 'SIDBASIC', href: 'https://webofscience.clarivate.cn/wos/alldb/summary/test/relevance/1' },
      [
        {
          key: 'records',
          payload: {
            1: {
              ut: 'WOS:101',
              doi: '10.1000/basic',
              titles: {
                item: { en: [{ title: 'Basic search result' }] },
                source: { en: [{ title: 'BASIC JOURNAL' }] },
              },
              names: {
                author: {
                  en: [{ wos_standard: 'Basic, A' }],
                },
              },
              pub_info: { pubyear: '2025' },
              citation_related: { counts: { WOSCC: 5 } },
            },
          },
        },
      ],
    ]);

    const result = await cmd!.func!(page, { query: 'basic', database: 'alldb', limit: 1, field: 'title' });

    expect(page.goto).toHaveBeenCalledWith(
      'https://webofscience.clarivate.cn/wos/alldb/basic-search',
      { settleMs: 4000 },
    );
    expect(page.typeText).toHaveBeenCalledWith('#search-option-0', 'basic');
    // The second evaluate call carries the search payload built for the title field.
    const searchJs = vi.mocked(page.evaluate).mock.calls[1]?.[0];
    expect(searchJs).toContain('"rowText":"TI=(basic)"');
    expect(result).toEqual([
      {
        rank: 1,
        title: 'Basic search result',
        authors: 'Basic, A',
        year: '2025',
        source: 'BASIC JOURNAL',
        citations: 5,
        doi: '10.1000/basic',
        url: 'https://webofscience.clarivate.cn/wos/alldb/full-record/WOS:101',
      },
    ]);
  });

  it('prefers the stable basic-search textbox selector before generic discovery', async () => {
    const cmd = getRegistry().get('webofscience/basic-search');
    expect(cmd?.func).toBeTypeOf('function');

    const page = createPageMock([
      { sid: 'SIDPREFERRED', href: 'https://webofscience.clarivate.cn/wos/woscc/summary/test/relevance/1' },
      [
        {
          key: 'records',
          payload: {
            1: {
              ut: 'WOS:104',
              titles: {
                item: { en: [{ title: 'Preferred selector result' }] },
              },
            },
          },
        },
      ],
    ]);

    const result = await cmd!.func!(page, { query: 'preferred field', limit: 1 }) as Array<{ title: string }>;

    expect(page.typeText).toHaveBeenCalledWith('#search-option-0', 'preferred field');
    // With the preferred selector working, the first evaluate is already the session lookup.
    expect(vi.mocked(page.evaluate).mock.calls[0]?.[0]).toContain("performance.getEntriesByType('resource')");
    expect(result[0]).toMatchObject({ title: 'Preferred selector result' });
  });

  it('falls back to the visible basic-search submit button when the smart-search button is unavailable', async () => {
    const cmd = getRegistry().get('webofscience/basic-search');
    expect(cmd?.func).toBeTypeOf('function');

    // evaluate #1 resolves with the ref assigned to the discovered submit button.
    const page = createPageMock([
      'opencli-search-submit',
      { sid: 'SIDBUTTON', href: 'https://webofscience.clarivate.cn/wos/woscc/summary/test/relevance/1' },
      [
        {
          key: 'records',
          payload: {
            1: {
              ut: 'WOS:102',
              titles: {
                item: { en: [{ title: 'Button submit result' }] },
              },
            },
          },
        },
      ],
    ]);
    // The first click attempt fails, which forces the fallback discovery path.
    vi.mocked(page.click).mockRejectedValueOnce(new Error('Element not found'));

    const result = await cmd!.func!(page, { query: 'button path', limit: 1 }) as Array<{ title: string }>;

    const submitDiscoveryJs = vi.mocked(page.evaluate).mock.calls[0]?.[0];
    expect(submitDiscoveryJs).toContain("const submitRef = 'opencli-search-submit'");
    expect(submitDiscoveryJs).toContain("target.setAttribute('data-ref', submitRef)");
    expect(page.click).toHaveBeenNthCalledWith(2, 'opencli-search-submit');
    expect(page.pressKey).not.toHaveBeenCalled();
    expect(result[0]).toMatchObject({ title: 'Button submit result' });
  });

  it('retries input discovery when the basic-search field renders late', async () => {
    const cmd = getRegistry().get('webofscience/basic-search');
    expect(cmd?.func).toBeTypeOf('function');

    // evaluate #1 finds no input (null); evaluate #2 returns the ref of the discovered input.
    const page = createPageMock([
      null,
      'opencli-search-input',
      { sid: 'SIDLATE', href: 'https://webofscience.clarivate.cn/wos/woscc/summary/test/relevance/1' },
      [
        {
          key: 'records',
          payload: {
            1: {
              ut: 'WOS:103',
              titles: {
                item: { en: [{ title: 'Late field result' }] },
              },
            },
          },
        },
      ],
    ]);
    // Typing into the preferred selector fails once, which triggers generic discovery.
    vi.mocked(page.typeText).mockRejectedValueOnce(new Error('Not ready'));

    const result = await cmd!.func!(page, { query: 'late field', limit: 1 }) as Array<{ title: string }>;

    expect(vi.mocked(page.evaluate).mock.calls[0]?.[0]).toContain('document.querySelectorAll(\'input, textarea\')');
    expect(vi.mocked(page.evaluate).mock.calls[1]?.[0]).toContain('document.querySelectorAll(\'input, textarea\')');
    expect(result[0]).toMatchObject({ title: 'Late field result' });
  });

  it('throws EmptyResultError when no records are returned', async () => {
    const cmd = getRegistry().get('webofscience/basic-search');
    expect(cmd?.func).toBeTypeOf('function');

    const page = createPageMock([
      { sid: 'SIDEMPTY', href: 'https://webofscience.clarivate.cn/wos/woscc/basic-search' },
      [{ key: 'records', payload: {} }],
    ]);

    await expect(cmd!.func!(page, { query: 'none' })).rejects.toThrow(EmptyResultError);
  });
});
// Selector of the Basic Search query textbox that the session helper types into.
const BASIC_SEARCH_INPUT_SELECTOR = '#search-option-0';

// Registers `opencli webofscience basic-search`: open the Basic Search page to
// obtain a search session, then run the field-tagged query through the
// runQuerySearch endpoint from inside the page.
cli({
  site: 'webofscience',
  name: 'basic-search',
  description: 'Search Web of Science via the Basic Search page',
  domain: 'webofscience.clarivate.cn',
  strategy: Strategy.UI,
  browser: true,
  navigateBefore: false,
  args: [
    { name: 'query', positional: true, required: true, help: 'Search query text, e.g. machine learning' },
    {
      name: 'field',
      required: false,
      help: 'Field to search in. Default: topic. Common: topic, title, author, doi, WOS categories',
      choices: [
        'all-fields',
        'topic',
        'title',
        'author',
        'publication-titles',
        'year-published',
        'affiliation',
        'funding-agency',
        'publisher',
        'publication-date',
        'abstract',
        'accession-number',
        'address',
        'author-identifiers',
        'author-keywords',
        'conference',
        'document-type',
        'doi',
        'editor',
        'grant-number',
        'group-author',
        'keyword-plus',
        'language',
        'pubmed-id',
        'web-of-science-categories',
      ],
    },
    { name: 'database', required: false, help: 'Database to search. Defaults to woscc.', choices: ['woscc', 'alldb'] },
    { name: 'limit', type: 'int', default: 10, help: 'Max results (max 50)' },
  ],
  columns: ['rank', 'title', 'authors', 'year', 'source', 'citations', 'doi', 'url'],
  func: async (page, kwargs) => {
    const query = String(kwargs.query ?? '').trim();
    if (query === '') {
      throw new ArgumentError('Search query is required');
    }

    // Validate every argument before touching the browser.
    const database = normalizeDatabase(kwargs.database);
    const limit = clampLimit(kwargs.limit);
    const field = normalizeBasicSearchField(kwargs.field);

    const searchPageUrl = basicSearchUrl(database);
    const sid = await ensureSearchSessionAtUrl(page, searchPageUrl, query, BASIC_SEARCH_INPUT_SELECTOR);

    const rowText = buildBasicSearchRowText(query, field.key);
    const payload = buildSearchPayload(query, limit, database, rowText);

    // The request runs inside the page so it carries the browser session.
    const events = await page.evaluate(`(async () => {
      const payload = ${JSON.stringify(payload)};
      const res = await fetch('/api/wosnx/core/runQuerySearch?SID=' + encodeURIComponent(${JSON.stringify(sid)}), {
        method: 'POST',
        credentials: 'include',
        headers: { 'content-type': 'application/json' },
        body: JSON.stringify(payload)
      });
      return res.json();
    })()`);

    const rows = extractRecords(events)
      .slice(0, limit)
      .map((entry, position) => ({
        rank: position + 1,
        title: firstTitle(entry, 'item'),
        authors: formatAuthors(entry),
        year: entry.pub_info?.pubyear ?? '',
        source: firstTitle(entry, 'source'),
        citations: entry.citation_related?.counts?.WOSCC ?? 0,
        doi: entry.doi ?? '',
        url: entry.ut ? fullRecordUrl(database, entry.ut) : '',
      }))
      .filter(row => row.title);

    if (rows.length === 0) {
      throw new EmptyResultError('webofscience basic-search', 'Try a different keyword or verify your Web of Science access in Chrome');
    }

    return rows;
  },
});
fullRecordUrl(database, record.ut) : '', + })) + .filter(record => record.title); + + if (!records.length) { + throw new EmptyResultError('webofscience basic-search', 'Try a different keyword or verify your Web of Science access in Chrome'); + } + + return records; + }, +}); diff --git a/src/clis/webofscience/citing-articles.test.ts b/src/clis/webofscience/citing-articles.test.ts new file mode 100644 index 00000000..9a7ce525 --- /dev/null +++ b/src/clis/webofscience/citing-articles.test.ts @@ -0,0 +1,108 @@ +import { describe, expect, it, vi } from 'vitest'; +import type { IPage } from '../../types.js'; +import { EmptyResultError } from '../../errors.js'; +import { getRegistry } from '../../registry.js'; +import './citing-articles.js'; + +function createPageMock(evaluateResults: any[]): IPage { + const evaluate = vi.fn(); + for (const result of evaluateResults) { + evaluate.mockResolvedValueOnce(result); + } + + return { + goto: vi.fn().mockResolvedValue(undefined), + evaluate, + snapshot: vi.fn().mockResolvedValue(undefined), + click: vi.fn().mockResolvedValue(undefined), + typeText: vi.fn().mockResolvedValue(undefined), + pressKey: vi.fn().mockResolvedValue(undefined), + scrollTo: vi.fn().mockResolvedValue(undefined), + getFormState: vi.fn().mockResolvedValue({ forms: [], orphanFields: [] }), + wait: vi.fn().mockResolvedValue(undefined), + waitForCapture: vi.fn().mockResolvedValue(undefined), + tabs: vi.fn().mockResolvedValue([]), + closeTab: vi.fn().mockResolvedValue(undefined), + newTab: vi.fn().mockResolvedValue(undefined), + selectTab: vi.fn().mockResolvedValue(undefined), + networkRequests: vi.fn().mockResolvedValue([]), + consoleMessages: vi.fn().mockResolvedValue([]), + scroll: vi.fn().mockResolvedValue(undefined), + autoScroll: vi.fn().mockResolvedValue(undefined), + installInterceptor: vi.fn().mockResolvedValue(undefined), + getInterceptedRequests: vi.fn().mockResolvedValue([]), + getCookies: vi.fn().mockResolvedValue([]), + screenshot: 
// Suite for the `webofscience/citing-articles` command: help text, the happy
// path through the records stream endpoint, and the empty-result error.
describe('webofscience citing-articles', () => {
  const COMMAND_KEY = 'webofscience/citing-articles';
  const TARGET_UT = 'WOS:001335131500001';

  /** Looks the command up in the registry (it is registered by the side-effect import). */
  const lookupCommand = () => getRegistry().get(COMMAND_KEY);

  it('describes citing lookup identifiers and database inference in command help', () => {
    const args = lookupCommand()?.args;
    const idHelp = args?.find(arg => arg.name === 'id')?.help;
    const databaseHelp = args?.find(arg => arg.name === 'database')?.help;

    for (const fragment of ['WOS:', 'DOI', 'full-record URL']) {
      expect(idHelp).toContain(fragment);
    }
    expect(databaseHelp).toContain('Defaults to the database in the URL');
  });

  it('loads a citing summary via the records stream endpoint', async () => {
    const cmd = lookupCommand();
    expect(cmd?.func).toBeTypeOf('function');

    // First evaluate() result answers the navigation script, the second one
    // the stream fetch.
    const streamLines = [
      '{"id":0,"key":"searchInfo","payload":{"QueryID":"QIDCITING","RecordsFound":64}}',
      '{"api":"runQueryGetRecordsStream","id":1,"key":"records","payload":{"1":{"ut":"WOS:002","doi":"10.1000/citing.1","titles":{"item":{"en":[{"title":"Citing article one"}]},"source":{"en":[{"title":"NATURE"}]}},"names":{"author":{"en":[{"wos_standard":"Smith, J"}]}},"pub_info":{"pubyear":"2026"},"citation_related":{"counts":{"WOSCC":12}}}}}',
    ];
    const page = createPageMock([
      true,
      { streamText: streamLines.join('\n'), debug: {} },
    ]);

    const result = await cmd!.func!(page, { id: TARGET_UT, limit: 1 });

    expect(page.goto).toHaveBeenCalledWith(
      'https://webofscience.clarivate.cn/wos/woscc/full-record/WOS:001335131500001',
      { settleMs: 5000 },
    );

    const evaluateCalls = vi.mocked(page.evaluate).mock.calls;

    const navigateJs = evaluateCalls[0]?.[0];
    expect(navigateJs).toContain('location.href');
    expect(navigateJs).toContain('citing-summary/WOS:001335131500001');

    const fetchJs = evaluateCalls[1]?.[0];
    expect(fetchJs).toContain(`localStorage.getItem('wos_search_' + qid)`);
    expect(fetchJs).toContain(`searchState?.mode || "citing_article"`);
    expect(fetchJs).toContain(`/api/wosnx/core/runQueryGetRecordsStream?SID=`);

    const expectedRow = {
      rank: 1,
      title: 'Citing article one',
      authors: 'Smith, J',
      year: '2026',
      source: 'NATURE',
      citations: 12,
      doi: '10.1000/citing.1',
      url: 'https://webofscience.clarivate.cn/wos/woscc/full-record/WOS:002',
    };
    expect(result).toEqual([expectedRow]);
  });

  it('throws EmptyResultError when the citing summary response has no records', async () => {
    const cmd = lookupCommand();
    expect(cmd?.func).toBeTypeOf('function');

    const emptyStream = () => ({ streamText: '', debug: {} });
    const page = createPageMock([true, emptyStream(), emptyStream()]);

    await expect(cmd!.func!(page, { id: TARGET_UT })).rejects.toThrow(EmptyResultError);
  });
});
/**
 * Registers `webofscience citing-articles`.
 *
 * The handler resolves the given identifier to a UT via `resolveUt`, opens the
 * full-record page, redirects the page to the citing-summary URL and then
 * reads the summary records through `fetchCurrentSummaryStreamRecords`.
 *
 * @throws ArgumentError when the identifier is empty or cannot be parsed.
 * @throws EmptyResultError when no streamed record carries a title.
 */
cli({
  site: 'webofscience',
  name: 'citing-articles',
  description: 'List articles citing a Web of Science record',
  domain: 'webofscience.clarivate.cn',
  strategy: Strategy.UI,
  browser: true,
  navigateBefore: false,
  args: [
    {
      name: 'id',
      positional: true,
      required: true,
      help: 'Web of Science UT, DOI, or full-record URL, e.g. WOS:001335131500001 or 10.1016/j.patter.2024.101046',
    },
    {
      name: 'database',
      required: false,
      help: 'Database to use. Defaults to the database in the URL, otherwise woscc.',
      choices: ['woscc', 'alldb'],
    },
    { name: 'limit', type: 'int', default: 10, help: 'Max results (max 50)' },
  ],
  columns: ['rank', 'title', 'authors', 'year', 'source', 'citations', 'doi', 'url'],
  func: async (page, kwargs) => {
    const recordRef = String(kwargs.id ?? '').trim();
    if (recordRef === '') {
      throw new ArgumentError('Record identifier is required');
    }

    const parsed = parseRecordIdentifier(recordRef);
    if (!parsed) {
      throw new ArgumentError('Record identifier must be a Web of Science UT, DOI, or full-record URL, e.g. WOS:001335131500001 or 10.1016/j.patter.2024.101046');
    }

    // A database embedded in a full-record URL is the fallback when
    // --database is not given.
    const database = normalizeDatabase(kwargs.database, parsed.database ?? 'woscc');
    const maxRows = clampLimit(kwargs.limit);
    const targetUt = await resolveUt(page, recordRef, database);
    const summaryUrl = citingSummaryUrl(database, targetUt);

    // Load the full record first, then move the same page to its citing summary.
    await page.goto(fullRecordUrl(database, targetUt), { settleMs: 5000 });
    await page.wait(4);
    await page.evaluate(`(() => { location.href = ${JSON.stringify(summaryUrl)}; return true; })()`);

    const streamed = await fetchCurrentSummaryStreamRecords(page, database, maxRows, 'citing_article');

    // Rank is the position within the truncated list; untitled rows are
    // dropped afterwards without renumbering.
    const rows = streamed
      .slice(0, maxRows)
      .flatMap((entry, position) => {
        const row = {
          rank: position + 1,
          title: firstTitle(entry, 'item'),
          authors: formatAuthors(entry),
          year: entry.pub_info?.pubyear ?? '',
          source: firstTitle(entry, 'source'),
          citations: entry.citation_related?.counts?.WOSCC ?? 0,
          doi: entry.doi ?? '',
          url: entry.ut ? fullRecordUrl(database, entry.ut) : '',
        };
        return row.title ? [row] : [];
      });

    if (rows.length === 0) {
      throw new EmptyResultError('webofscience citing-articles', 'Try opening the citing summary in Chrome once, then run again.');
    }

    return rows;
  },
});
vi.fn().mockResolvedValue(undefined), + click: vi.fn().mockResolvedValue(undefined), + typeText: vi.fn().mockResolvedValue(undefined), + pressKey: vi.fn().mockResolvedValue(undefined), + scrollTo: vi.fn().mockResolvedValue(undefined), + getFormState: vi.fn().mockResolvedValue({ forms: [], orphanFields: [] }), + wait: vi.fn().mockResolvedValue(undefined), + waitForCapture: vi.fn().mockResolvedValue(undefined), + tabs: vi.fn().mockResolvedValue([]), + closeTab: vi.fn().mockResolvedValue(undefined), + newTab: vi.fn().mockResolvedValue(undefined), + selectTab: vi.fn().mockResolvedValue(undefined), + networkRequests: vi.fn().mockResolvedValue([]), + consoleMessages: vi.fn().mockResolvedValue([]), + scroll: vi.fn().mockResolvedValue(undefined), + autoScroll: vi.fn().mockResolvedValue(undefined), + installInterceptor: vi.fn().mockResolvedValue(undefined), + getInterceptedRequests: vi.fn().mockResolvedValue([]), + getCookies: vi.fn().mockResolvedValue([]), + screenshot: vi.fn().mockResolvedValue(''), + }; +} + +describe('webofscience record', () => { + it('describes supported record identifiers and database inference in command help', () => { + const cmd = getRegistry().get('webofscience/record'); + const idArg = cmd?.args.find(arg => arg.name === 'id'); + const databaseArg = cmd?.args.find(arg => arg.name === 'database'); + + expect(idArg?.help).toContain('WOS:'); + expect(idArg?.help).toContain('DOI'); + expect(idArg?.help).toContain('full-record URL'); + expect(databaseArg?.help).toContain('Defaults to the database in the URL'); + }); + + it('extracts structured metadata from full-record page text blocks', () => { + const body = `Keywords +Keywords PlusNEURAL-NETWORKSSELECTION +Author Information +Corresponding Address +Lones, Michael A. 
+(corresponding author) +arrow_drop_down +Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland +E-mail Addresses +m.lones@hw.ac.uk +Addresses +arrow_drop_down +1 Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland +E-mail Addresses +m.lones@hw.ac.uk +Categories/ Classification +Research AreasComputer Science +Citation Topics +6 Social Sciences +Web of Science Categories +Computer Science, Artificial IntelligenceComputer Science, Information SystemsComputer Science, Interdisciplinary Applications +add +See more data fields +Journal information +PATTERNS +Research Areas +Computer Science +Web of Science Categories +Computer Science, Artificial IntelligenceComputer Science, Information SystemsComputer Science, Interdisciplinary Applications Language English Accession Number WOS:001335131500001 PubMed ID 39569205 +7.4`; + + expect(extractSupplementMetadataFromText(body)).toMatchObject({ + corresponding_address: 'Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland', + author_addresses: '1 Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland', + email_addresses: 'm.lones@hw.ac.uk', + research_areas: 'Computer Science', + wos_categories: 'Computer Science, Artificial Intelligence; Computer Science, Information Systems; Computer Science, Interdisciplinary Applications', + }); + }); + + it('extracts author-level affiliation references from full-record page text', () => { + const body = `Author Information +By +Lones, Michael A.1,2 +Doe, Jane3 +Addresses +1 Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland +2 National Robotarium, Edinburgh, Scotland +3 Example University, School of Computing, Boston, MA, USA +E-mail Addresses +m.lones@hw.ac.uk +jane@example.edu`; + + expect(extractSupplementMetadataFromText(body)).toMatchObject({ + authors_structured: JSON.stringify([ + { + name: 'Lones, Michael A.', + address_refs: ['1', '2'], + addresses: [ + 'Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland', + 'National Robotarium, Edinburgh, 
Scotland', + ], + }, + { + name: 'Doe, Jane', + address_refs: ['3'], + addresses: [ + 'Example University, School of Computing, Boston, MA, USA', + ], + }, + ]), + }); + }); + + it('strips trailing metadata labels from inline wos categories text', () => { + const body = `Web of Science Categories +Computer Science, Artificial IntelligenceComputer Science, Information SystemsComputer Science, Interdisciplinary Applications Language English Accession Number WOS:001335131500001 PubMed ID 39569205`; + + expect(extractSupplementMetadataFromText(body)).toMatchObject({ + wos_categories: 'Computer Science, Artificial Intelligence; Computer Science, Information Systems; Computer Science, Interdisciplinary Applications', + }); + }); + + it('extracts inline metadata and keyword sections from full-record text when API fields are missing', () => { + const body = `Keywords +Author Keywords +machine learning +best practices +Keywords Plus +NEURAL NETWORKS +SELECTION +Author Information +By +Lones, Michael A. 
+Corresponding Address +Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland +E-mail Addresses +m.lones@hw.ac.uk +Addresses +1 Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland +Categories/ Classification +Research Areas +Computer Science +Web of Science Categories +Computer Science, Artificial Intelligence +Computer Science, Information Systems +Language +English +Accession Number +WOS:001335131500001 +PubMed ID +39569205 +ISSN +2666-3899 +IDS Number +J1Z8Y +Journal information +Current Publisher +CELL PRESS +50 HAMPSHIRE ST, FLOOR 5, CAMBRIDGE, MA 02139 +Journal Impact Factor`; + + expect(extractSupplementMetadataFromText(body)).toMatchObject({ + author_keywords: 'machine learning; best practices', + keywords_plus: 'NEURAL NETWORKS; SELECTION', + language: 'English', + pubmed_id: '39569205', + issn: '2666-3899', + ids_number: 'J1Z8Y', + current_publisher: 'CELL PRESS; 50 HAMPSHIRE ST, FLOOR 5, CAMBRIDGE, MA 02139', + authors_structured: JSON.stringify([ + { + name: 'Lones, Michael A.', + address_refs: [], + addresses: [], + }, + ]), + }); + }); + + it('fetches a full record by UT using the ALLDB database when provided', async () => { + const cmd = getRegistry().get('webofscience/record'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + { sid: 'SID555', href: 'https://webofscience.clarivate.cn/wos/alldb/summary/test/relevance/1' }, + [ + { + key: 'searchInfo', + payload: { + QueryID: 'QID555', + RecordsFound: 1, + }, + }, + { + key: 'records', + payload: { + 1: { + ut: 'WOS:001335131500001', + doi: '10.1016/j.patter.2024.101046', + coll: 'WOSCC', + titles: { + item: { en: [{ title: 'Avoiding common machine learning pitfalls' }] }, + source: { en: [{ title: 'PATTERNS' }] }, + }, + names: { + author: { + en: [ + { wos_standard: 'Lones, M A' }, + { wos_standard: 'Doe, J' }, + ], + }, + }, + pub_info: { pubyear: '2024' }, + citation_related: { counts: { WOSCC: 64, ALLDB: 81 } }, + }, + }, + }, + ], + [ + { + key: 
'full-record', + payload: { + ut: 'WOS:001335131500001', + doi: '10.1016/j.patter.2024.101046', + coll: 'WOSCC', + titles: { + item: { en: [{ title: 'Avoiding common machine learning pitfalls' }] }, + source: { en: [{ title: 'PATTERNS' }] }, + }, + names: { + author: { + en: [ + { wos_standard: 'Lones, M A' }, + { wos_standard: 'Doe, J' }, + ], + }, + }, + pub_info: { + pubyear: '2024', + sortdate: '2024-09-01', + }, + abstract: { + basic: { + en: { + abstract: '

A concise abstract for testing.

', + }, + }, + }, + keywords: { + author_keywords: { + en: [{ keyword: 'machine learning' }, { keyword: 'best practices' }], + }, + keywords_plus: { + en: [{ keyword: 'pitfalls' }], + }, + }, + citation_related: { + counts: { + WOSCC: 64, + ALLDB: 81, + }, + }, + }, + }, + ], + { + metadata: { + document_type: 'Review', + article_number: '101046', + published: 'OCT 11 2024', + early_access: 'OCT 2024', + indexed: '2024-10-25', + language: 'English', + pubmed_id: '39569205', + issn: '2666-3899', + ids_number: 'J1Z8Y', + corresponding_address: 'Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland', + author_addresses: '1 Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland', + email_addresses: 'm.lones@hw.ac.uk', + research_areas: 'Computer Science', + wos_categories: 'Computer Science, Artificial Intelligence; Computer Science, Information Systems; Computer Science, Interdisciplinary Applications', + authors_structured: JSON.stringify([ + { + name: 'Lones, Michael A.', + address_refs: ['1'], + addresses: ['Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland'], + }, + ]), + current_publisher: 'CELL PRESS50 HAMPSHIRE ST, FLOOR 5, CAMBRIDGE, MA 02139', + cited_references: '71', + }, + fullTextLinks: [ + { + label: 'Context Sensitive Links', + url: 'https://webofscience.clarivate.cn/api/gateway?foo=1', + }, + { + label: 'Free Submitted Article From Repository', + url: 'https://pmc.ncbi.nlm.nih.gov/articles/PMC11573893/pdf/main.pdf', + }, + ], + }, + ]); + + const result = await cmd!.func!(page, { id: 'WOS:001335131500001', database: 'alldb' }); + + expect(page.goto).toHaveBeenNthCalledWith(1, + 'https://webofscience.clarivate.cn/wos/alldb/smart-search', + { settleMs: 4000 }, + ); + expect(page.goto).toHaveBeenNthCalledWith(2, + 'https://webofscience.clarivate.cn/wos/alldb/full-record/WOS:001335131500001', + { settleMs: 4000 }, + ); + + const searchJs = vi.mocked(page.evaluate).mock.calls[1]?.[0]; + 
expect(searchJs).toContain('"rowText":"UT=(WOS:001335131500001)"'); + expect(searchJs).toContain('"product":"ALLDB"'); + + const fullRecordJs = vi.mocked(page.evaluate).mock.calls[2]?.[0]; + expect(fullRecordJs).toContain('/api/wosnx/core/getFullRecordByQueryId?SID='); + expect(fullRecordJs).toContain('"qid":"QID555"'); + expect(fullRecordJs).toContain('"id":1'); + expect(fullRecordJs).toContain('"product":"ALLDB"'); + expect(fullRecordJs).toContain('"searchMode":"general_semantic"'); + + expect(result).toEqual([ + { field: 'title', value: 'Avoiding common machine learning pitfalls' }, + { field: 'authors', value: 'Lones, M A; Doe, J' }, + { field: 'year', value: '2024' }, + { field: 'source', value: 'PATTERNS' }, + { field: 'doi', value: '10.1016/j.patter.2024.101046' }, + { field: 'ut', value: 'WOS:001335131500001' }, + { field: 'abstract', value: 'A concise abstract for testing.' }, + { field: 'document_type', value: 'Review' }, + { field: 'article_number', value: '101046' }, + { field: 'published', value: 'OCT 11 2024' }, + { field: 'early_access', value: 'OCT 2024' }, + { field: 'indexed', value: '2024-10-25' }, + { field: 'language', value: 'English' }, + { field: 'pubmed_id', value: '39569205' }, + { field: 'issn', value: '2666-3899' }, + { field: 'ids_number', value: 'J1Z8Y' }, + { field: 'corresponding_address', value: 'Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland' }, + { field: 'author_addresses', value: '1 Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland' }, + { field: 'email_addresses', value: 'm.lones@hw.ac.uk' }, + { field: 'research_areas', value: 'Computer Science' }, + { field: 'wos_categories', value: 'Computer Science, Artificial Intelligence; Computer Science, Information Systems; Computer Science, Interdisciplinary Applications' }, + { field: 'authors_structured', value: JSON.stringify([ + { + name: 'Lones, Michael A.', + address_refs: ['1'], + addresses: ['Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland'], + 
}, + ]) }, + { field: 'current_publisher', value: 'CELL PRESS50 HAMPSHIRE ST, FLOOR 5, CAMBRIDGE, MA 02139' }, + { field: 'author_keywords', value: 'machine learning; best practices' }, + { field: 'keywords_plus', value: 'pitfalls' }, + { field: 'citations_woscc', value: '64' }, + { field: 'citations_alldb', value: '81' }, + { field: 'cited_references', value: '71' }, + { field: 'full_text_links', value: 'Context Sensitive Links; Free Submitted Article From Repository' }, + { field: 'full_text_urls', value: 'https://webofscience.clarivate.cn/api/gateway?foo=1; https://pmc.ncbi.nlm.nih.gov/articles/PMC11573893/pdf/main.pdf' }, + { field: 'url', value: 'https://webofscience.clarivate.cn/wos/alldb/full-record/WOS:001335131500001' }, + ]); + }); + + it('retries supplement scraping when the first full-record page scrape is empty', async () => { + const cmd = getRegistry().get('webofscience/record'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + { sid: 'SIDRETRY', href: 'https://webofscience.clarivate.cn/wos/woscc/summary/test/relevance/1' }, + [ + { + key: 'searchInfo', + payload: { + QueryID: 'QIDRETRY', + RecordsFound: 1, + }, + }, + { + key: 'records', + payload: { + 1: { + ut: 'WOS:RETRY1', + titles: { + item: { en: [{ title: 'Retry supplement result' }] }, + }, + citation_related: { counts: { WOSCC: 1 } }, + }, + }, + }, + ], + [ + { + key: 'full-record', + payload: { + ut: 'WOS:RETRY1', + titles: { + item: { en: [{ title: 'Retry supplement result' }] }, + }, + citation_related: { counts: { WOSCC: 1 } }, + }, + }, + ], + {}, + { + bodyText: `Document Type +Review +Abstract +Current Publisher +Retry Publisher +Journal Impact Factor`, + fullTextLinks: [], + }, + ]); + + const result = await cmd!.func!(page, { id: 'WOS:RETRY1' }) as Array<{ field: string; value: string }>; + + expect(page.goto).toHaveBeenNthCalledWith( + 2, + 'https://webofscience.clarivate.cn/wos/woscc/full-record/WOS:RETRY1', + { settleMs: 4000 }, + ); + 
expect(page.goto).toHaveBeenNthCalledWith( + 3, + 'https://webofscience.clarivate.cn/wos/woscc/full-record/WOS:RETRY1', + { settleMs: 4000 }, + ); + expect(result).toContainEqual({ field: 'document_type', value: 'Review' }); + expect(result).toContainEqual({ field: 'current_publisher', value: 'Retry Publisher' }); + }); + + it('accepts a full-record URL and infers the database from the path', async () => { + const cmd = getRegistry().get('webofscience/record'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + { sid: 'SID777', href: 'https://webofscience.clarivate.cn/wos/alldb/summary/test/relevance/1' }, + [ + { + key: 'searchInfo', + payload: { + QueryID: 'QID777', + RecordsFound: 1, + }, + }, + { + key: 'records', + payload: { + 1: { + ut: 'WOS:009999999999999', + coll: 'WOSCC', + titles: { + item: { en: [{ title: 'URL input record' }] }, + }, + }, + }, + }, + ], + [ + { + key: 'full-record', + payload: { + ut: 'WOS:009999999999999', + titles: { + item: { en: [{ title: 'URL input record' }] }, + }, + }, + }, + ], + ]); + + const result = await cmd!.func!(page, { + id: 'https://webofscience.clarivate.cn/wos/alldb/full-record/WOS:009999999999999', + }) as Array<{ field: string; value: string }>; + + expect(page.goto).toHaveBeenCalledWith( + 'https://webofscience.clarivate.cn/wos/alldb/smart-search', + { settleMs: 4000 }, + ); + expect(result[0]).toEqual({ field: 'title', value: 'URL input record' }); + }); + + it('throws for an unsupported record identifier', async () => { + const cmd = getRegistry().get('webofscience/record'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([]); + await expect(cmd!.func!(page, { id: 'not-a-record' })).rejects.toThrow(ArgumentError); + }); + + it('throws EmptyResultError when the exact record cannot be found', async () => { + const cmd = getRegistry().get('webofscience/record'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + { sid: 'SID404', 
href: 'https://webofscience.clarivate.cn/wos/woscc/summary/test/relevance/1' }, + [ + { + key: 'searchInfo', + payload: { + QueryID: 'QID404', + RecordsFound: 0, + }, + }, + { + key: 'records', + payload: {}, + }, + ], + ]); + + await expect(cmd!.func!(page, { id: 'WOS:001404' })).rejects.toThrow(EmptyResultError); + }); + + it('falls back to Enter when the submit button is unavailable', async () => { + const cmd = getRegistry().get('webofscience/record'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + null, + { sid: 'SIDENTER', href: 'https://webofscience.clarivate.cn/wos/woscc/summary/test/relevance/1' }, + [ + { + key: 'searchInfo', + payload: { + QueryID: 'QIDENTER', + RecordsFound: 1, + }, + }, + { + key: 'records', + payload: { + 1: { + ut: 'WOS:003', + titles: { + item: { en: [{ title: 'Enter fallback record' }] }, + }, + }, + }, + }, + ], + [ + { + key: 'full-record', + payload: { + ut: 'WOS:003', + titles: { + item: { en: [{ title: 'Enter fallback record' }] }, + }, + }, + }, + ], + ]); + vi.mocked(page.click).mockRejectedValueOnce(new Error('Element not found')); + + const result = await cmd!.func!(page, { id: 'WOS:003' }); + + expect(page.pressKey).toHaveBeenCalledWith('Enter'); + expect(result).toBeTruthy(); + }); + + it('falls back to the matched search record when full-record fetch fails', async () => { + const cmd = getRegistry().get('webofscience/record'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + { sid: 'SIDFB', href: 'https://webofscience.clarivate.cn/wos/woscc/summary/test/relevance/1' }, + [ + { + key: 'searchInfo', + payload: { + QueryID: 'QIDFB', + RecordsFound: 1, + }, + }, + { + key: 'records', + payload: { + 1: { + ut: 'WOS:004', + doi: '10.1000/fallback', + titles: { + item: { en: [{ title: 'Fallback summary record' }] }, + source: { en: [{ title: 'SUMMARY SOURCE' }] }, + }, + names: { + author: { + en: [{ wos_standard: 'Fallback, A' }], + }, + }, + pub_info: { pubyear: 
'2023' }, + citation_related: { + counts: { + WOSCC: 9, + }, + }, + }, + }, + }, + ], + ]); + vi.mocked(page.evaluate).mockRejectedValueOnce(new Error('Unexpected token <')); + + const result = await cmd!.func!(page, { id: 'WOS:004' }); + + expect(result).toEqual([ + { field: 'title', value: 'Fallback summary record' }, + { field: 'authors', value: 'Fallback, A' }, + { field: 'year', value: '2023' }, + { field: 'source', value: 'SUMMARY SOURCE' }, + { field: 'doi', value: '10.1000/fallback' }, + { field: 'ut', value: 'WOS:004' }, + { field: 'citations_woscc', value: '9' }, + { field: 'url', value: 'https://webofscience.clarivate.cn/wos/woscc/full-record/WOS:004' }, + ]); + }); + + it('falls back to page metadata for keyword and identifier fields when the API payload omits them', async () => { + const cmd = getRegistry().get('webofscience/record'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + { sid: 'SIDMETA', href: 'https://webofscience.clarivate.cn/wos/woscc/summary/test/relevance/1' }, + [ + { + key: 'searchInfo', + payload: { + QueryID: 'QIDMETA', + RecordsFound: 1, + }, + }, + { + key: 'records', + payload: { + 1: { + ut: 'WOS:001335131500001', + titles: { + item: { en: [{ title: 'Metadata fallback record' }] }, + }, + citation_related: { counts: { WOSCC: 64, ALLDB: 67 } }, + }, + }, + }, + ], + [ + { + key: 'full-record', + payload: { + ut: 'WOS:001335131500001', + titles: { + item: { en: [{ title: 'Metadata fallback record' }] }, + }, + citation_related: { counts: { WOSCC: 64, ALLDB: 67 } }, + }, + }, + ], + { + bodyText: `Keywords +Author Keywords +machine learning +best practices +Keywords Plus +NEURAL NETWORKS +SELECTION +Author Information +By +Lones, Michael A. 
+Corresponding Address +Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland +E-mail Addresses +m.lones@hw.ac.uk +Addresses +1 Heriot Watt Univ, Sch Math & Comp Sci, Edinburgh, Scotland +Categories/ Classification +Research Areas +Computer Science +Web of Science Categories +Computer Science, Artificial Intelligence +Computer Science, Information Systems +Language +English +Accession Number +WOS:001335131500001 +PubMed ID +39569205 +ISSN +2666-3899 +IDS Number +J1Z8Y +Journal information +Current Publisher +CELL PRESS +50 HAMPSHIRE ST, FLOOR 5, CAMBRIDGE, MA 02139 +Journal Impact Factor`, + fullTextLinks: [], + }, + ]); + + const result = await cmd!.func!(page, { id: 'WOS:001335131500001' }) as Array<{ field: string; value: string }>; + + expect(result).toContainEqual({ field: 'author_keywords', value: 'machine learning; best practices' }); + expect(result).toContainEqual({ field: 'keywords_plus', value: 'NEURAL NETWORKS; SELECTION' }); + expect(result).toContainEqual({ field: 'language', value: 'English' }); + expect(result).toContainEqual({ field: 'pubmed_id', value: '39569205' }); + expect(result).toContainEqual({ field: 'issn', value: '2666-3899' }); + expect(result).toContainEqual({ field: 'ids_number', value: 'J1Z8Y' }); + expect(result).toContainEqual({ + field: 'current_publisher', + value: 'CELL PRESS; 50 HAMPSHIRE ST, FLOOR 5, CAMBRIDGE, MA 02139', + }); + expect(result).toContainEqual({ + field: 'authors_structured', + value: JSON.stringify([ + { + name: 'Lones, Michael A.', + address_refs: [], + addresses: [], + }, + ]), + }); + }); +}); diff --git a/src/clis/webofscience/record.ts b/src/clis/webofscience/record.ts new file mode 100644 index 00000000..e8fa3771 --- /dev/null +++ b/src/clis/webofscience/record.ts @@ -0,0 +1,617 @@ +import { cli, Strategy } from '../../registry.js'; +import { ArgumentError, CommandExecutionError, EmptyResultError } from '../../errors.js'; +import { + buildExactQuery, + buildFullRecordPayload, + buildSearchPayload, + 
ensureSearchSession, + extractAbstract, + extractFullRecord, + extractKeywordGroup, + extractQueryId, + extractRecords, + findMatchingRecord, + firstTitle, + formatAuthors, + fullRecordUrl, + normalizeDatabase, + parseRecordIdentifier, + toProduct, +} from './shared.js'; + +type RecordPageSupplement = { + metadata?: Record; + fullTextLinks?: Array<{ label?: string; url?: string }>; +}; + +const UI_NOISE_LINES = new Set([ + 'arrow_drop_down', + 'arrow_back', + 'arrow_forward', + 'chevron_right', + 'add', +]); + +const SECTION_LABELS = new Set([ + 'Keywords', + 'Author Information', + 'Corresponding Address', + 'E-mail Addresses', + 'Addresses', + 'Categories/ Classification', + 'Research Areas', + 'Citation Topics', + 'Web of Science Categories', + 'Journal information', + 'View Journal Impact', + 'ISSN', + 'Current Publisher', + 'Journal Impact Factor', + 'Journal Citation Reports TM', + 'Citation Network', +]); + +function normalizeTextValue(value: string): string { + return value + .replace(/\u00a0/g, ' ') + .replace(/\s+/g, ' ') + .trim(); +} + +function getTextLines(body: string): string[] { + return body + .replace(/\u00a0/g, ' ') + .split('\n') + .map(line => line.trim()) + .filter(Boolean); +} + +function isSectionBoundary(line: string, extraLabels: string[] = []): boolean { + if (SECTION_LABELS.has(line)) return true; + if (extraLabels.includes(line)) return true; + if (/^See more/i.test(line)) return true; + if (/^How does this document/i.test(line)) return true; + return false; +} + +function extractSectionLines(body: string, label: string, endLabels: string[] = []): string[] { + const lines = getTextLines(body); + const startIndex = lines.findIndex(line => line === label); + if (startIndex < 0) return []; + + const values: string[] = []; + for (let index = startIndex + 1; index < lines.length; index++) { + const line = lines[index]; + if (UI_NOISE_LINES.has(line)) continue; + if (isSectionBoundary(line, endLabels)) break; + values.push(line); + } + 
/**
 * Reads the value that belongs to `label` from flattened page text.
 *
 * Two layouts are handled, checked line by line in document order:
 *  - the label sits alone on its line: the value is every following line up
 *    to the next section boundary (delegated to `extractSectionLines`),
 *    joined with single spaces;
 *  - the label and value share a line (`"ISSN 2666-3899"` or even
 *    `"ISSN2666-3899"`): the value is whatever follows the label.
 *
 * Lines produced by `getTextLines` are trimmed and non-empty, so a line that
 * starts with `label` without being equal to it always has a non-empty
 * remainder. The former "scan the following lines" fallback could therefore
 * never run and has been removed.
 *
 * @param body - Full page text.
 * @param label - Field label to look for (exact, case-sensitive).
 * @param endLabels - Extra labels that terminate a multi-line section.
 * @returns The normalized value, or `''` when the label is absent or empty.
 */
function extractInlineOrSectionValue(body: string, label: string, endLabels: string[] = []): string {
  for (const line of getTextLines(body)) {
    if (line === label) {
      // Stand-alone label: the first exact match decides, even if its section is empty.
      return normalizeTextValue(extractSectionLines(body, label, endLabels).join(' '));
    }
    if (line.startsWith(label)) {
      const inline = normalizeTextValue(line.slice(label.length));
      if (inline) return inline;
    }
  }
  return '';
}
/**
 * Converts the text of the "Web of Science Categories" field into a
 * `'; '`-separated list.
 *
 * @param value - Raw category text; it is whitespace-normalized first.
 * @returns The de-duplicated categories joined with `'; '`, or `''` for blank input.
 */
function extractCategoryList(value: string): string {
  const normalized = normalizeTextValue(value);
  if (!normalized) return '';

  // Preferred path: pull out every "<Capitalised words>, <Capitalised words>" pair.
  // The second half is matched lazily and must be followed either by the start of
  // another such pair or by the end of the string.
  // NOTE(review): this looks aimed at categories that were glued together without a
  // separator (e.g. "...IntelligenceComputer Science, ..."); confirm against real pages.
  const matches = normalized.match(/[A-Z][A-Za-z&/\-]+(?:\s+[A-Z][A-Za-z&/\-]+)*,\s+[A-Z][A-Za-z&/\-]+(?:\s+[A-Z][A-Za-z&/\-]+)*?(?=(?:[A-Z][A-Za-z&/\-]+(?:\s+[A-Z][A-Za-z&/\-]+)*,\s+[A-Z])|$)/g);
  if (matches?.length) {
    return uniqueValues(matches.map(normalizeTextValue)).join('; ');
  }

  // No pair matched: fall back to the generic delimiter heuristics.
  return normalizeDelimitedList(normalized);
}
/**
 * Builds the structured author list from the "By" section of a full-record page.
 *
 * Every line after the stand-alone `By` line is examined until a section
 * boundary (`Addresses`, `E-mail Addresses`, `Keywords`, `Source`, `Abstract`
 * or any label known to `isSectionBoundary`) is reached. A line is read either as
 * `"<name><refs>"`, where the trailing `1,2` digits are address references, or,
 * failing that, through `cleanAuthorLine`.
 *
 * Names from the digit-suffix form now pass the same plausibility check that
 * `cleanAuthorLine` applies (must contain a comma, must not contain UI words).
 * Previously any line ending in a digit, such as `"Volume 5"`, was emitted
 * as an author.
 *
 * @param body - Full page text.
 * @returns One entry per author; `addresses` holds the numbered entries of the
 *   "Addresses" section that the author's `address_refs` resolve to.
 */
function extractStructuredAuthors(body: string): Array<{
  name: string;
  address_refs: string[];
  addresses: string[];
}> {
  const lines = getTextLines(body);
  const byIndex = lines.indexOf('By');
  if (byIndex < 0) return [];

  // Numbered address lines look like "1 Heriot Watt Univ, ...".
  const addressMap = new Map<string, string>();
  for (const line of extractSectionLines(body, 'Addresses', [
    'E-mail Addresses',
    'Categories/ Classification',
  ])) {
    const numbered = line.match(/^(\d+)\s+(.+)$/);
    if (numbered) addressMap.set(numbered[1], normalizeTextValue(numbered[2]));
  }

  const looksLikeAuthorName = (name: string): boolean =>
    /,/.test(name) && !/\b(view|provided|source|arrow|journal|impact|publisher)\b/i.test(name);

  const stopLabels = ['Addresses', 'E-mail Addresses', 'Keywords', 'Source', 'Abstract'];
  const authors: Array<{ name: string; address_refs: string[]; addresses: string[] }> = [];

  for (const line of lines.slice(byIndex + 1)) {
    if (isSectionBoundary(line, stopLabels)) break;

    let parsed: { name: string; refs: string[] } | null;
    const suffixed = line.match(/^(.+?)(\d+(?:,\d+)*)$/);
    if (suffixed) {
      const name = normalizeTextValue(suffixed[1]);
      const refs = suffixed[2].split(',').map(item => item.trim()).filter(Boolean);
      // A digit-suffixed line never falls through to cleanAuthorLine.
      parsed = name && refs.length && looksLikeAuthorName(name) ? { name, refs } : null;
    } else {
      parsed = cleanAuthorLine(line);
    }
    if (!parsed) continue;

    const { name, refs } = parsed;
    authors.push({
      name,
      address_refs: refs,
      addresses: refs.map(ref => addressMap.get(ref) || '').filter(Boolean),
    });
  }

  return authors;
}
/**
 * Derives supplementary record metadata from the visible text of a Web of
 * Science full-record page.
 *
 * Extraction runs in three layers:
 *  1. label-to-label regular expressions over the whole text;
 *  2. for `language`, `pubmed_id`, `issn` and `ids_number`, a line-based
 *     fallback when the regex found nothing;
 *  3. section-based extraction for addresses, e-mails, research areas,
 *     categories, keywords, structured authors and the current publisher.
 *
 * Only non-empty values are stored. `authors_structured` is a JSON string of
 * `{ name, address_refs, addresses }` objects. It is now omitted when no
 * plausible author survives filtering; previously the literal string `"[]"`
 * was stored and surfaced as a row.
 *
 * @param body - `document.body.innerText` of the record page (may be empty).
 * @returns A flat map from field key to string value.
 */
export function extractSupplementMetadataFromText(body: string): Record<string, string> {
  const text = String(body || '').replace(/\u00a0/g, ' ');
  const metadata: Record<string, string> = {};
  const extract = (pattern: RegExp): string => normalizeTextValue(text.match(pattern)?.[1] || '');
  // Collapses doubled separators and stray whitespace in '; '-joined lists.
  const tidyList = (value: string): string => value
    .replace(/;\s*;/g, '; ')
    .replace(/\s+/g, ' ')
    .trim();

  // Layer 1: each pattern captures the text between a label and the label expected next.
  const regexFields = {
    document_type: /Document Type\s+(.+?)\s+Abstract/s,
    article_number: /Article Number\s+(.+?)\s+Published/s,
    published: /Published\s+(.+?)\s+(?:Early Access|Indexed)/s,
    early_access: /Early Access\s+(.+?)\s+Indexed/s,
    indexed: /Indexed\s+(.+?)\s+Document Type/s,
    language: /Language\s+(.+?)\s+Accession Number/s,
    pubmed_id: /PubMed ID\s+(.+?)\s+ISSN/s,
    issn: /PubMed ID\s+.+?\s+ISSN\s+(.+?)\s+IDS Number/s,
    ids_number: /IDS Number\s+(.+?)\s+(?:add\s+See more data fields|Journal information)/s,
    current_publisher: /Current Publisher\s+(.+?)\s+Journal Impact Factor/s,
  } satisfies Record<string, RegExp>;

  for (const [key, pattern] of Object.entries(regexFields)) {
    const value = extract(pattern);
    if (value) metadata[key] = value;
  }

  // Layer 2: line-based fallback for fields whose neighbouring labels vary between records.
  const fallbackFields: Array<[string, string, string[]]> = [
    ['language', 'Language', ['Accession Number', 'PubMed ID', 'ISSN']],
    ['pubmed_id', 'PubMed ID', ['ISSN', 'IDS Number', 'Journal information']],
    ['issn', 'ISSN', ['IDS Number', 'Journal information', 'Current Publisher']],
    ['ids_number', 'IDS Number', ['Journal information', 'Current Publisher']],
  ];

  for (const [key, label, endLabels] of fallbackFields) {
    if (!metadata[key]) {
      const value = extractInlineOrSectionValue(text, label, endLabels);
      if (value) metadata[key] = value;
    }
  }

  const citedReferences = text.match(/(\d+)\s+Cited References/)?.[1];
  if (citedReferences) metadata.cited_references = citedReferences;

  // Layer 3: section-based fields.
  const correspondingLines = uniqueValues(
    extractSectionLines(text, 'Corresponding Address', [
      'E-mail Addresses',
      'Addresses',
      'Categories/ Classification',
    ]).filter(line => !/\(corresponding author\)/i.test(line)),
  );
  // The last remaining line of the section is taken as the address.
  const correspondingAddress = correspondingLines[correspondingLines.length - 1] ?? '';
  if (correspondingAddress) metadata.corresponding_address = correspondingAddress;

  const addressSection = extractSectionLines(text, 'Addresses', [
    'E-mail Addresses',
    'Categories/ Classification',
  ]);
  const authorAddresses = uniqueValues(addressSection).join('; ');
  if (authorAddresses) metadata.author_addresses = authorAddresses;

  // E-mail addresses are collected from the whole page, not just one section.
  const emails = uniqueValues(Array.from(text.matchAll(/[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}/gi), match => match[0]));
  if (emails.length) metadata.email_addresses = emails.join('; ');

  const researchAreas = extractInlineOrSectionValue(text, 'Research Areas', [
    'Citation Topics',
    'Web of Science Categories',
    'Journal information',
  ]);
  if (researchAreas) metadata.research_areas = researchAreas;

  const wosCategories = extractCategoryList(stripTrailingMetadataLabels(extractInlineOrSectionValue(text, 'Web of Science Categories', [
    'See more data fields',
    'Journal information',
    'Journal Impact Factor',
    'Citation Network',
  ])));
  if (wosCategories) metadata.wos_categories = wosCategories;

  const authorKeywords = extractSectionValueList(text, 'Author Keywords', [
    'Keywords Plus',
    'Author Information',
    'Corresponding Address',
  ]).join('; ');
  if (authorKeywords) metadata.author_keywords = authorKeywords;

  const keywordsPlus = extractSectionValueList(text, 'Keywords Plus', [
    'Author Information',
    'Corresponding Address',
    'Addresses',
  ]).join('; ');
  if (keywordsPlus) metadata.keywords_plus = keywordsPlus;

  // Keep only entries whose name looks like "Last, First" and is not UI chrome.
  const authorsStructured = extractStructuredAuthors(text).filter((item) => {
    const name = normalizeTextValue(String(item?.name || ''));
    return Boolean(name) && /,/.test(name) && !/\b(view|provided|source|arrow|journal|impact|publisher)\b/i.test(name);
  });
  if (authorsStructured.length) metadata.authors_structured = JSON.stringify(authorsStructured);

  // The section-based publisher (name plus address lines) takes precedence over the regex value.
  const publisherEndLabels = [
    'Journal Impact Factor',
    'Journal Citation Reports TM',
    'Citation Network',
  ];
  const currentPublisherLines = extractSectionLines(text, 'Current Publisher', publisherEndLabels);
  const currentPublisher = (uniqueValues(currentPublisherLines).join('; ')
    || extractInlineOrSectionValue(text, 'Current Publisher', publisherEndLabels))
    // Separates a publisher name glued directly to a street number ("PRESS50 ...").
    .replace(/([A-Z])(\d)/g, '$1; $2');
  if (currentPublisher) metadata.current_publisher = currentPublisher;

  if (metadata.wos_categories) {
    metadata.wos_categories = tidyList(metadata.wos_categories);
  }

  if (metadata.current_publisher) {
    metadata.current_publisher = tidyList(metadata.current_publisher);
  }

  return metadata;
}
+ const body = String(document.body.innerText || '').replace(/\\u00a0/g, ' '); + + const links = Array.from(document.querySelectorAll('a')) + .map((el) => ({ + label: normalize(el.textContent || el.getAttribute('aria-label') || ''), + url: String(el.href || '').trim(), + })) + .filter((item) => item.url); + + const filtered = []; + const seen = new Set(); + for (const item of links) { + const hay = (item.label + ' ' + item.url).toLowerCase(); + if (hay.includes('google scholar')) continue; + if (hay.includes('journal citation reports')) continue; + if (hay.includes('journal citation indicator')) continue; + if (hay.includes('accessibility')) continue; + if (hay.includes('/wos/pqdt/')) continue; + const isFullText = hay.includes('context sensitive') + || hay.includes('free full text') + || hay.includes('view full text') + || hay.includes('full text on proquest') + || hay.includes('repository') + || hay.includes('submitted article') + || hay.includes('getftr') + || /\\.pdf($|\\?)/i.test(item.url) + || (hay.includes('proquest') && hay.includes('full text')); + if (!isFullText) continue; + const key = item.url; + if (seen.has(key)) continue; + seen.add(key); + filtered.push({ + label: item.label || 'Full Text Link', + url: item.url, + }); + } + + return { bodyText: body, fullTextLinks: filtered }; + })()`); + + if (!supplement || typeof supplement !== 'object') { + return {}; + } + + const bodyText = typeof (supplement as { bodyText?: unknown }).bodyText === 'string' + ? (supplement as { bodyText: string }).bodyText + : ''; + + const legacyMetadata = typeof (supplement as { metadata?: unknown }).metadata === 'object' + && (supplement as { metadata?: unknown }).metadata !== null + ? (supplement as { metadata: Record }).metadata + : undefined; + + return { + metadata: bodyText ? extractSupplementMetadataFromText(bodyText) : legacyMetadata, + fullTextLinks: Array.isArray((supplement as { fullTextLinks?: unknown }).fullTextLinks) + ? 
/**
 * Tells whether a page scrape yielded anything worth keeping.
 *
 * @param supplement - Result of scraping the full-record page.
 * @returns `true` when at least one metadata key or one full-text link is present.
 */
function hasSupplementData(supplement: RecordPageSupplement): boolean {
  const { metadata, fullTextLinks } = supplement;
  if (metadata && Object.keys(metadata).length > 0) return true;
  return (fullTextLinks ?? []).length > 0;
}
'woscc'); + const sid = await ensureSearchSession(page, database, rawId); + const exactQuery = buildExactQuery(identifier); + const searchPayload = buildSearchPayload(rawId, 5, database, exactQuery); + + const searchEvents = await page.evaluate(`(async () => { + const payload = ${JSON.stringify(searchPayload)}; + const res = await fetch('/api/wosnx/core/runQuerySearch?SID=' + encodeURIComponent(${JSON.stringify(sid)}), { + method: 'POST', + credentials: 'include', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify(payload) + }); + return res.json(); + })()`); + + const queryId = extractQueryId(searchEvents); + const records = extractRecords(searchEvents); + const match = findMatchingRecord(records, identifier); + + if (!queryId || !match?.record) { + throw new EmptyResultError('webofscience record', 'Try using a Web of Science UT, DOI, or verify your Web of Science access in Chrome'); + } + + const product = toProduct(database); + const fullRecordPayload = buildFullRecordPayload({ + qid: queryId, + docNumber: match.docNumber, + product, + coll: match.record.coll ?? product, + searchMode: 'general_semantic', + }); + + let record = match.record; + try { + const fullRecordEvents = await page.evaluate(`(async () => { + const payload = ${JSON.stringify(fullRecordPayload)}; + const res = await fetch('/api/wosnx/core/getFullRecordByQueryId?SID=' + encodeURIComponent(${JSON.stringify(sid)}), { + method: 'POST', + credentials: 'include', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify(payload) + }); + return res.json(); + })()`); + + const fullRecord = extractFullRecord(fullRecordEvents); + if (fullRecord) { + record = fullRecord; + } + } catch { + // Fall back to the exact-match search record. The full-record endpoint + // can return HTML when the site decides to render a page flow instead. + } + + const recordUrl = record.ut ? 
fullRecordUrl(database, record.ut) : ''; + let supplement: RecordPageSupplement = {}; + if (recordUrl) { + for (let attempt = 0; attempt < 2; attempt++) { + try { + supplement = await scrapeRecordPageSupplement(page, recordUrl); + if (hasSupplementData(supplement)) break; + } catch { + // DOM enrichment is best-effort; keep the structured API result. + } + } + } + + const fullTextLinks = (supplement.fullTextLinks ?? []) + .map(link => (link.label || '').trim()) + .filter(Boolean) + .join('; '); + const fullTextUrls = (supplement.fullTextLinks ?? []) + .map(link => (link.url || '').trim()) + .filter(Boolean) + .join('; '); + const metadata = supplement.metadata ?? {}; + const authorKeywords = extractKeywordGroup(record, 'author_keywords') || metadata.author_keywords || ''; + const keywordsPlus = extractKeywordGroup(record, 'keywords_plus') || metadata.keywords_plus || ''; + + const rows = [ + { field: 'title', value: firstTitle(record, 'item') }, + { field: 'authors', value: formatAuthors(record) }, + { field: 'year', value: record.pub_info?.pubyear ?? '' }, + { field: 'source', value: firstTitle(record, 'source') }, + { field: 'doi', value: record.doi ?? '' }, + { field: 'ut', value: record.ut ?? match.record.ut ?? '' }, + { field: 'abstract', value: extractAbstract(record) }, + { field: 'document_type', value: metadata.document_type ?? '' }, + { field: 'article_number', value: metadata.article_number ?? '' }, + { field: 'published', value: metadata.published ?? '' }, + { field: 'early_access', value: metadata.early_access ?? '' }, + { field: 'indexed', value: metadata.indexed ?? '' }, + { field: 'language', value: metadata.language ?? '' }, + { field: 'pubmed_id', value: metadata.pubmed_id ?? '' }, + { field: 'issn', value: metadata.issn ?? '' }, + { field: 'ids_number', value: metadata.ids_number ?? '' }, + { field: 'corresponding_address', value: metadata.corresponding_address ?? '' }, + { field: 'author_addresses', value: metadata.author_addresses ?? 
'' }, + { field: 'email_addresses', value: metadata.email_addresses ?? '' }, + { field: 'research_areas', value: metadata.research_areas ?? '' }, + { field: 'wos_categories', value: metadata.wos_categories ?? '' }, + { field: 'authors_structured', value: metadata.authors_structured ?? '' }, + { field: 'current_publisher', value: metadata.current_publisher ?? '' }, + { field: 'author_keywords', value: authorKeywords }, + { field: 'keywords_plus', value: keywordsPlus }, + { field: 'citations_woscc', value: String(record.citation_related?.counts?.WOSCC ?? '') }, + { field: 'citations_alldb', value: String(record.citation_related?.counts?.ALLDB ?? '') }, + { field: 'cited_references', value: metadata.cited_references ?? '' }, + { field: 'full_text_links', value: fullTextLinks }, + { field: 'full_text_urls', value: fullTextUrls }, + { field: 'url', value: recordUrl }, + ].filter(row => row.value !== ''); + + if (!rows.length) { + throw new CommandExecutionError( + 'Web of Science record response was empty', + 'Try running the command again or opening the record once in Chrome.', + ); + } + + return rows; + }, +}); diff --git a/src/clis/webofscience/references.test.ts b/src/clis/webofscience/references.test.ts new file mode 100644 index 00000000..0241d370 --- /dev/null +++ b/src/clis/webofscience/references.test.ts @@ -0,0 +1,119 @@ +import { describe, expect, it, vi } from 'vitest'; +import type { IPage } from '../../types.js'; +import { EmptyResultError } from '../../errors.js'; +import { getRegistry } from '../../registry.js'; +import { parseWosEventStream } from './shared.js'; +import './references.js'; + +function createPageMock(evaluateResults: any[]): IPage { + const evaluate = vi.fn(); + for (const result of evaluateResults) { + evaluate.mockResolvedValueOnce(result); + } + + return { + goto: vi.fn().mockResolvedValue(undefined), + evaluate, + snapshot: vi.fn().mockResolvedValue(undefined), + click: vi.fn().mockResolvedValue(undefined), + typeText: 
vi.fn().mockResolvedValue(undefined), + pressKey: vi.fn().mockResolvedValue(undefined), + scrollTo: vi.fn().mockResolvedValue(undefined), + getFormState: vi.fn().mockResolvedValue({ forms: [], orphanFields: [] }), + wait: vi.fn().mockResolvedValue(undefined), + waitForCapture: vi.fn().mockResolvedValue(undefined), + tabs: vi.fn().mockResolvedValue([]), + closeTab: vi.fn().mockResolvedValue(undefined), + newTab: vi.fn().mockResolvedValue(undefined), + selectTab: vi.fn().mockResolvedValue(undefined), + networkRequests: vi.fn().mockResolvedValue([]), + consoleMessages: vi.fn().mockResolvedValue([]), + scroll: vi.fn().mockResolvedValue(undefined), + autoScroll: vi.fn().mockResolvedValue(undefined), + installInterceptor: vi.fn().mockResolvedValue(undefined), + getInterceptedRequests: vi.fn().mockResolvedValue([]), + getCookies: vi.fn().mockResolvedValue([]), + screenshot: vi.fn().mockResolvedValue(''), + }; +} + +describe('webofscience references', () => { + it('describes reference lookup identifiers and database inference in command help', () => { + const cmd = getRegistry().get('webofscience/references'); + const idArg = cmd?.args.find(arg => arg.name === 'id'); + const databaseArg = cmd?.args.find(arg => arg.name === 'database'); + + expect(idArg?.help).toContain('WOS:'); + expect(idArg?.help).toContain('DOI'); + expect(idArg?.help).toContain('full-record URL'); + expect(databaseArg?.help).toContain('Defaults to the database in the URL'); + }); + + it('parses summary stream payloads that arrive as a JSON array', () => { + expect(parseWosEventStream(JSON.stringify([ + { key: 'searchInfo', payload: { QueryID: 'QIDREFS', RecordsFound: 2 } }, + { key: 'records', payload: { 1: { ut: 'WOS:001' }, 2: { ut: 'WOS:002' } } }, + ]))).toEqual([ + { key: 'searchInfo', payload: { QueryID: 'QIDREFS', RecordsFound: 2 } }, + { key: 'records', payload: { 1: { ut: 'WOS:001' }, 2: { ut: 'WOS:002' } } }, + ]); + }); + + it('loads a cited references summary via the records stream 
endpoint', async () => { + const cmd = getRegistry().get('webofscience/references'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + true, + { + streamText: [ + '{"id":0,"key":"searchInfo","payload":{"QueryID":"QIDREFS","RecordsFound":71}}', + '{"api":"runQueryGetRecordsStream","id":1,"key":"records","payload":{"1":{"ut":"123456789","doi":"10.1000/ref.1","titles":{"source":{"en":[{"title":"SCIENCE"}]}},"names":{"author":{"en":[{"wos_standard":"Doe, J"}]}},"pub_info":{"pubyear":"2021"},"citation_related":{"counts":{"WOSCC":7}}}}}', + ].join('\n'), + debug: {}, + }, + ]); + + const result = await cmd!.func!(page, { id: 'https://webofscience.clarivate.cn/wos/alldb/full-record/WOS:001335131500001', limit: 1 }); + + expect(page.goto).toHaveBeenCalledWith( + 'https://webofscience.clarivate.cn/wos/alldb/full-record/WOS:001335131500001', + { settleMs: 5000 }, + ); + + const navigateJs = vi.mocked(page.evaluate).mock.calls[0]?.[0]; + expect(navigateJs).toContain('location.href'); + expect(navigateJs).toContain('cited-references-summary/WOS:001335131500001'); + + const fetchJs = vi.mocked(page.evaluate).mock.calls[1]?.[0]; + expect(fetchJs).toContain(`localStorage.getItem('wos_search_' + qid)`); + expect(fetchJs).toContain(`searchState?.mode || "cited_references"`); + expect(fetchJs).toContain(`/api/wosnx/core/runQueryGetRecordsStream?SID=`); + + expect(result).toEqual([ + { + rank: 1, + title: 'SCIENCE', + authors: 'Doe, J', + year: '2021', + source: 'SCIENCE', + citations: 7, + doi: '10.1000/ref.1', + url: '', + }, + ]); + }); + + it('throws EmptyResultError when the cited references summary has no records', async () => { + const cmd = getRegistry().get('webofscience/references'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + true, + { streamText: '', debug: {} }, + { streamText: '', debug: {} }, + ]); + + await expect(cmd!.func!(page, { id: 'WOS:001335131500001' })).rejects.toThrow(EmptyResultError); + }); 
/**
 * Returns the full-record URL for a cited reference, or `''` when the
 * reference has no `WOS:`-prefixed accession number (checked case-insensitively).
 *
 * @param database - Database segment used in the URL.
 * @param record - Cited-reference record.
 */
function referenceUrl(database: 'woscc' | 'alldb', record: WosRecord): string {
  const accession = String(record.ut || '');
  if (!/^WOS:/i.test(accession)) return '';
  return fullRecordUrl(database, accession);
}
WOS:001335131500001 or 10.1016/j.patter.2024.101046'); + } + if (identifier.kind === 'ut') return identifier.value; + + const sid = await ensureSearchSession(page, database, rawId); + const events = await page.evaluate(`(async () => { + const payload = ${JSON.stringify(buildSearchPayload(rawId, 5, database, buildExactQuery(identifier)))}; + const res = await fetch('/api/wosnx/core/runQuerySearch?SID=' + encodeURIComponent(${JSON.stringify(sid)}), { + method: 'POST', + credentials: 'include', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify(payload), + }); + return res.json(); + })()`); + const match = findMatchingRecord(extractRecords(events), identifier); + if (!match?.record?.ut) { + throw new EmptyResultError('webofscience references', 'Try using a Web of Science UT or full-record URL.'); + } + return match.record.ut; +} + +cli({ + site: 'webofscience', + name: 'references', + description: 'List cited references for a Web of Science record', + domain: 'webofscience.clarivate.cn', + strategy: Strategy.UI, + browser: true, + navigateBefore: false, + args: [ + { name: 'id', positional: true, required: true, help: 'Web of Science UT, DOI, or full-record URL, e.g. WOS:001335131500001 or 10.1016/j.patter.2024.101046' }, + { name: 'database', required: false, help: 'Database to use. Defaults to the database in the URL, otherwise woscc.', choices: ['woscc', 'alldb'] }, + { name: 'limit', type: 'int', default: 10, help: 'Max results (max 50)' }, + ], + columns: ['rank', 'title', 'authors', 'year', 'source', 'citations', 'doi', 'url'], + func: async (page, kwargs) => { + const rawId = String(kwargs.id ?? '').trim(); + if (!rawId) throw new ArgumentError('Record identifier is required'); + + const identifier = parseRecordIdentifier(rawId); + if (!identifier) { + throw new ArgumentError('Record identifier must be a Web of Science UT, DOI, or full-record URL, e.g. 
WOS:001335131500001 or 10.1016/j.patter.2024.101046'); + } + + const database = normalizeDatabase(kwargs.database, identifier.database ?? 'woscc'); + const limit = clampLimit(kwargs.limit); + const ut = await resolveUt(page, rawId, database); + const summaryUrl = citedReferencesSummaryUrl(database, ut); + await page.goto(fullRecordUrl(database, ut), { settleMs: 5000 }); + await page.wait(4); + await page.evaluate(`(() => { location.href = ${JSON.stringify(summaryUrl)}; return true; })()`); + const records = await fetchCurrentSummaryStreamRecords(page, database, limit, 'cited_references'); + + const rows = records + .slice(0, limit) + .map((record, index) => ({ + rank: index + 1, + title: referenceTitle(record), + authors: formatAuthors(record), + year: record.pub_info?.pubyear ?? '', + source: firstTitle(record, 'source'), + citations: record.citation_related?.counts?.WOSCC ?? 0, + doi: record.doi ?? '', + url: referenceUrl(database, record), + })) + .filter(row => row.title); + + if (!rows.length) { + throw new EmptyResultError('webofscience references', 'Try opening the cited references summary in Chrome once, then run again.'); + } + + return rows; + }, +}); diff --git a/src/clis/webofscience/shared.ts b/src/clis/webofscience/shared.ts new file mode 100644 index 00000000..24418635 --- /dev/null +++ b/src/clis/webofscience/shared.ts @@ -0,0 +1,942 @@ +import { ArgumentError, CommandExecutionError } from '../../errors.js'; + +export const SEARCH_INPUT_SELECTOR = '#composeQuerySmartSearch'; +export const SUBMIT_BUTTON_SELECTOR = "button[aria-label='Submit your question']"; +export const MAX_LIMIT = 50; + +export type WosDatabase = 'woscc' | 'alldb'; +export type BasicSearchFieldKey = + | 'all_fields' + | 'topic' + | 'title' + | 'author' + | 'publication_titles' + | 'year_published' + | 'affiliation' + | 'funding_agency' + | 'publisher' + | 'publication_date' + | 'abstract' + | 'accession_number' + | 'address' + | 'author_identifiers' + | 'author_keywords' + | 
/**
 * Normalizes a user-supplied result limit to a whole number in `1..MAX_LIMIT`.
 *
 * The value is floored before it is validated, so a fraction between 0 and 1
 * falls back to the default. Previously such a value passed the `> 0` check
 * and was then floored to 0, giving a limit of zero.
 *
 * @param value - Raw limit; `null`/`undefined` mean "use the default".
 * @returns `10` when the value is missing, non-numeric, non-finite or floors
 *   to less than 1; otherwise the floored value capped at `MAX_LIMIT`.
 */
export function clampLimit(value: unknown): number {
  const parsed = Math.floor(Number(value ?? 10));
  if (!Number.isFinite(parsed) || parsed <= 0) return 10;
  return Math.min(parsed, MAX_LIMIT);
}
'ALLDB' : 'WOSCC'; +} + +export function smartSearchUrl(database: WosDatabase): string { + return `https://webofscience.clarivate.cn/wos/${database}/smart-search`; +} + +export function basicSearchUrl(database: WosDatabase): string { + return `https://webofscience.clarivate.cn/wos/${database}/basic-search`; +} + +export function fullRecordUrl(database: WosDatabase, ut: string): string { + return `https://webofscience.clarivate.cn/wos/${database}/full-record/${ut}`; +} + +export function citingSummaryUrl(database: WosDatabase, ut: string): string { + return `https://webofscience.clarivate.cn/wos/${database}/citing-summary/${ut}?from=${database}&type=colluid&siloSearchWarning=false`; +} + +export function citedReferencesSummaryUrl(database: WosDatabase, ut: string): string { + return `https://webofscience.clarivate.cn/wos/${database}/cited-references-summary/${ut}?from=${database}&type=colluid`; +} + +export function authorRecordUrl(id: string): string { + return `https://webofscience.clarivate.cn/wos/author/record/${id}`; +} + +export function buildSearchPayload( + query: string, + limit: number, + database: WosDatabase, + rowText = `TS=(${query})`, +): Record { + const product = toProduct(database); + + return { + product, + searchMode: 'general_semantic', + viewType: 'search', + serviceMode: 'summary', + search: { + mode: 'general_semantic', + database: product, + disableEdit: false, + query: [{ rowText }], + display: { + key: 'nlp', + params: { input: query }, + }, + blending: 'blended', + count: 100, + }, + retrieve: { + count: limit, + history: true, + jcr: true, + sort: 'relevance', + analyzes: [ + 'TP.Value.6', + 'REVIEW.Value.6', + 'EARLY ACCESS.Value.6', + 'OA.Value.6', + 'DR.Value.6', + 'ECR.Value.6', + 'PY.Field_D.6', + 'FPY.Field_D.6', + 'DT.Value.6', + 'AU.Value.6', + 'DX2NG.Value.6', + 'PEERREVIEW.Value.6', + 'STK.Value.10', + ], + locale: 'en', + }, + eventMode: null, + }; +} + +const BASIC_SEARCH_FIELDS: BasicSearchFieldSpec[] = [ + { key: 
'all_fields', label: 'All Fields', tag: 'ALL', aliases: ['all-fields', 'all fields', 'all_fields', 'all'] }, + { key: 'topic', label: 'Topic', tag: 'TS', aliases: ['topic', 'ts'] }, + { key: 'title', label: 'Title', tag: 'TI', aliases: ['title', 'ti'] }, + { key: 'author', label: 'Author', tag: 'AU', aliases: ['author', 'au'] }, + { key: 'publication_titles', label: 'Publication Titles', tag: 'SO', aliases: ['publication-titles', 'publication titles', 'publication_titles', 'publication title', 'source', 'so'] }, + { key: 'year_published', label: 'Year Published', tag: 'PY', aliases: ['year-published', 'year published', 'year_published', 'year', 'py'] }, + { key: 'affiliation', label: 'Affiliation', tag: 'OG', aliases: ['affiliation', 'organization-enhanced', 'organization_enhanced', 'organization enhanced', 'og'] }, + { key: 'funding_agency', label: 'Funding Agency', tag: 'FO', aliases: ['funding-agency', 'funding agency', 'funding_agency', 'fo'] }, + { key: 'publisher', label: 'Publisher', tag: 'PUBL', aliases: ['publisher', 'publ'] }, + { key: 'publication_date', label: 'Publication Date', tag: 'DOP', aliases: ['publication-date', 'publication date', 'publication_date', 'date of publication', 'dop'] }, + { key: 'abstract', label: 'Abstract', tag: 'AB', aliases: ['abstract', 'ab'] }, + { key: 'accession_number', label: 'Accession Number', tag: 'UT', aliases: ['accession-number', 'accession number', 'accession_number', 'ut'] }, + { key: 'address', label: 'Address', tag: 'AD', aliases: ['address', 'ad'] }, + { key: 'author_identifiers', label: 'Author Identifiers', tag: 'AI', aliases: ['author-identifiers', 'author identifiers', 'author_identifiers', 'ai'] }, + { key: 'author_keywords', label: 'Author Keywords', tag: 'AK', aliases: ['author-keywords', 'author keywords', 'author_keywords', 'ak'] }, + { key: 'conference', label: 'Conference', tag: 'CF', aliases: ['conference', 'cf'] }, + { key: 'document_type', label: 'Document Type', tag: 'DT', aliases: 
/**
 * Resolves a user-supplied field name to its entry in `BASIC_SEARCH_FIELDS`.
 *
 * A missing or empty value selects the `topic` field. Otherwise the input is
 * trimmed and lower-cased, then compared against each field's aliases, its
 * key (with runs of spaces/hyphens folded to `_`), and its label.
 *
 * @param value - Raw `--field` argument.
 * @returns The matching field spec.
 * @throws ArgumentError when no field matches.
 */
export function normalizeBasicSearchField(value: unknown): BasicSearchFieldSpec {
  if (value == null || value === '') {
    return BASIC_SEARCH_FIELDS.find(field => field.key === 'topic')!;
  }

  const wanted = String(value).trim().toLowerCase();
  const wantedAsKey = wanted.replace(/[\s-]+/g, '_');

  for (const spec of BASIC_SEARCH_FIELDS) {
    const byAlias = spec.aliases.includes(wanted);
    const byKey = spec.key === wantedAsKey;
    const byLabel = spec.label.toLowerCase() === wanted;
    if (byAlias || byKey || byLabel) return spec;
  }

  throw new ArgumentError(
    `Unsupported Web of Science basic-search field: ${String(value)}. Try one of: ${BASIC_SEARCH_FIELD_HELP_EXAMPLES.join(', ')}`,
  );
}
/**
 * Decides whether a button-like control should be treated as the search submit control.
 *
 * The visible text and aria-label are lower-cased and joined into one label.
 * Controls whose label mentions history or saved searches are always rejected.
 * Otherwise a control qualifies when its `type` is `submit`, when the label
 * starts with the word "search", or when it contains "submit your question".
 *
 * @param input - Text, `type` attribute, and aria-label of the candidate control.
 * @returns `true` when the control looks like the search submit button.
 */
export function isWosSubmitControl(input: {
  text?: string | null;
  type?: string | null;
  ariaLabel?: string | null;
}): boolean {
  const clean = (raw?: string | null): string => String(raw || '').trim().toLowerCase();

  const isSubmitType = clean(input.type) === 'submit';
  const label = `${clean(input.text)} ${clean(input.ariaLabel)}`.trim();

  // Nothing to go on: no label and not an explicit submit control.
  if (!label && !isSubmitType) return false;

  const rejected = ['history', 'saved searches', 'search history'];
  if (rejected.some(term => label.includes(term))) return false;

  if (isSubmitType) return true;
  return /^search\b/.test(label) || label.includes('submit your question');
}
isVisible(el)); + const target = buttons.find((el) => { + const text = String(el.textContent || el.getAttribute('value') || '').trim(); + const type = String(el.getAttribute('type') || '').toLowerCase(); + const ariaLabel = String(el.getAttribute('aria-label') || '').trim(); + const hay = (text + ' ' + ariaLabel).toLowerCase(); + if (hay.includes('history')) return false; + if (hay.includes('saved searches')) return false; + if (hay.includes('search history')) return false; + return type === 'submit' + || /^search\b/.test(hay) + || hay.includes('submit your question'); + }); + if (!target) return null; + target.setAttribute('data-ref', submitRef); + return submitRef; + })()`); + return typeof ref === 'string' ? ref : null; +} + +async function typeIntoSearch( + page: { + wait: (seconds: number) => Promise; + typeText: (selector: string, text: string) => Promise; + evaluate: (js: string) => Promise; + }, + query: string, + preferredSelector?: string, +): Promise { + const discoveredRef = 'opencli-search-input'; + + if (preferredSelector) { + try { + await page.typeText(preferredSelector, query); + return; + } catch { + // Fall back to generic input discovery below. + } + } + + let selector: string | null = null; + for (let attempt = 0; attempt < 3; attempt++) { + selector = await page.evaluate(`(() => { + const isVisible = (el) => { + const style = window.getComputedStyle(el); + const rect = el.getBoundingClientRect(); + return style.display !== 'none' + && style.visibility !== 'hidden' + && rect.width > 0 + && rect.height > 0; + }; + for (const node of document.querySelectorAll('[data-ref="opencli-search-input"]')) { + node.removeAttribute('data-ref'); + } + const candidates = Array.from(document.querySelectorAll('input, textarea')) + .filter((el) => !el.disabled && !el.readOnly && isVisible(el)) + .sort((a, b) => { + const aScore = (a.matches('input[type="search"], input[type="text"], textarea') ? 10 : 0) + (a.placeholder ? 
/**
 * Renders a record's English author list as a single `; `-separated string.
 *
 * Each author is shown as `wos_standard` when that is non-blank, otherwise as
 * `Last, First`, otherwise as whichever of the two name parts is present.
 * Missing entries and authors with no usable name are skipped.
 *
 * @param record - Record whose `names.author.en` list is formatted.
 * @returns The joined author names, or `''` when there are none.
 */
export function formatAuthors(record: WosRecord): string {
  const authors = record.names?.author?.en ?? [];
  return authors
    .map(author => {
      // The payload can contain empty slots in the author list.
      if (!author) return '';
      // Trim like the name parts below; a blank standard name must not hide
      // usable first/last names or leave a whitespace-only entry in the output.
      const standard = author.wos_standard?.trim();
      if (standard) return standard;
      const last = author.last_name?.trim();
      const first = author.first_name?.trim();
      if (last && first) return `${last}, ${first}`;
      return last || first || '';
    })
    .filter(Boolean)
    .join('; ');
}
/**
 * Interprets user input as a Web of Science record identifier.
 *
 * Accepted forms:
 * - a `doi.org` (or subdomain, e.g. `dx.doi.org`) URL, whose path is the DOI;
 * - a `/wos/<woscc|alldb>/full-record/<UT>` URL, which also yields the database;
 * - a bare accession number such as `WOS:001335131500001` (upper-cased);
 * - a bare DOI beginning with `10.`.
 *
 * @param input - Raw identifier or URL.
 * @returns The parsed identifier, or `null` when the input matches none of the forms.
 */
export function parseRecordIdentifier(input: string): RecordIdentifier | null {
  const trimmed = input.trim();
  if (!trimmed) return null;

  try {
    const url = new URL(trimmed);
    // Anchor on a label boundary so only doi.org and its subdomains qualify;
    // an unanchored /doi\.org$/ also accepts look-alike hosts such as "notdoi.org".
    if (/(^|\.)doi\.org$/i.test(url.hostname)) {
      const doi = decodeURIComponent(url.pathname.replace(/^\/+/, ''));
      return doi ? { kind: 'doi', value: doi } : null;
    }

    const match = url.pathname.match(/\/wos\/(woscc|alldb)\/full-record\/([^/?#]+)/i);
    if (match) {
      return {
        kind: 'ut',
        value: decodeURIComponent(match[2]),
        database: normalizeDatabase(match[1]),
      };
    }
  } catch {
    // Not a URL (or it contained a malformed escape); continue parsing as a bare identifier.
  }

  if (/^WOS:[A-Z0-9]+$/i.test(trimmed)) {
    return { kind: 'ut', value: trimmed.toUpperCase() };
  }

  if (/^10\.\d{4,9}\/\S+$/i.test(trimmed)) {
    return { kind: 'doi', value: trimmed };
  }

  return null;
}
/**
 * Assembles the JSON request body that asks for one record in full view.
 *
 * @param params.qid - Query id of the search the record belongs to.
 * @param params.docNumber - 1-based position of the record; used for both `id` and `retrieve.first`.
 * @param params.product - Product code sent as `product`.
 * @param params.coll - Collection for `retrieve.coll`; defaults to `product` when omitted.
 * @param params.searchMode - Search mode; defaults to `general_semantic` when omitted.
 * @returns A plain object ready to be serialized with `JSON.stringify`.
 */
export function buildFullRecordPayload(params: {
  qid: string;
  docNumber: number;
  product: string;
  coll?: string;
  searchMode?: string;
}): Record<string, unknown> {
  // Defaults apply only when the property is undefined, matching destructuring defaults.
  const coll = params.coll === undefined ? params.product : params.coll;
  const searchMode = params.searchMode === undefined ? 'general_semantic' : params.searchMode;

  const retrieve = {
    first: params.docNumber,
    links: 'retrieve',
    sort: 'relevance',
    count: 1,
    view: 'full',
    coll,
    activity: true,
    analyzes: null,
    jcr: true,
    reviews: true,
    highlight: false,
    locale: 'en',
  };

  return {
    qid: params.qid,
    id: params.docNumber,
    retrieve,
    product: params.product,
    searchMode,
    serviceMode: 'summary',
    viewType: 'records',
    paginated: false,
  };
}
value : ''); + return text + .replace(/<[^>]+>/g, ' ') + .replace(/\s+/g, ' ') + .trim(); +} + +export function extractKeywordGroup(record: WosRecord, key: string): string { + return joinValues(record.keywords?.[key]?.en); +} + +export async function fetchSummaryRecords( + page: { + goto: (url: string, options?: Record) => Promise; + wait: (seconds: number) => Promise; + evaluate: (js: string) => Promise; + }, + url: string, + database: WosDatabase, + limit: number, + defaultMode: string, +): Promise { + async function fetchOnce(): Promise { + return page.evaluate(`(async () => { + const href = String(location.href || ''); + const summaryId = href.match(/\\/summary\\/([^/]+)/)?.[1] || ''; + const pageNumber = Number(href.match(/\\/summary\\/[^/]+\\/[^/]+\\/(\\d+)/)?.[1] || '1') || 1; + const sort = href.match(/\\/summary\\/[^/]+\\/([^/]+)\\/\\d+/)?.[1] || 'relevance'; + const sid = (() => { + try { return JSON.parse(String(localStorage.getItem('wos_sid') || '""')) || ''; } catch { return ''; } + })(); + if (!summaryId || !sid) return []; + + const rawState = localStorage.getItem('wos_search_' + summaryId); + const searchState = rawState ? 
JSON.parse(rawState) : null; + const product = ${JSON.stringify(toProduct(database))}; + const retrieveBase = { + count: ${limit}, + first: Math.max(1, ((pageNumber - 1) * ${limit}) + 1), + sort, + locale: 'en', + jcr: true, + history: true, + }; + const baseState = { + ...(searchState || { id: summaryId, mode: ${JSON.stringify(defaultMode)}, database: product }), + id: summaryId, + database: searchState?.database || product, + product, + serviceMode: 'summary', + }; + const candidates = [ + { + ...baseState, + retrieve: retrieveBase, + searchMode: searchState?.mode || ${JSON.stringify(defaultMode)}, + viewType: 'summary', + paginated: true, + }, + { + ...baseState, + retrieve: { ...retrieveBase, coll: searchState?.database || product, view: 'summary' }, + searchMode: 'GeneralSearch', + viewType: 'records', + paginated: true, + }, + { + ...baseState, + retrieve: { ...retrieveBase, coll: searchState?.database || product, activity: true }, + searchMode: 'GeneralSearch', + viewType: 'summary', + paginated: false, + }, + { + ...baseState, + retrieve: { ...retrieveBase, coll: searchState?.database || product, activity: true }, + searchMode: searchState?.mode || 'GeneralSearch', + viewType: 'records', + paginated: false, + }, + ]; + + for (const payload of candidates) { + const res = await fetch('/api/wosnx/core/runQuerySearch?SID=' + encodeURIComponent(sid), { + method: 'POST', + credentials: 'include', + headers: { + accept: 'application/json, text/plain, */*', + 'content-type': 'application/json', + }, + body: JSON.stringify(payload), + }); + const text = await res.text(); + if (!text || /^ line.trim()) + .filter(Boolean) + .flatMap((line) => { + try { + return [JSON.parse(line) as WosEvent]; + } catch { + return []; + } + }); +} + +export async function fetchCurrentSummaryStreamRecords( + page: { + wait: (seconds: number) => Promise; + evaluate: (js: string) => Promise; + }, + database: WosDatabase, + limit: number, + defaultMode: string, +): Promise { + async 
function fetchOnce(): Promise<{ streamText: string; debug: Record }> { + return page.evaluate(`(async () => { + const href = String(location.href || ''); + const qid = href.match(/\\/summary\\/([^/]+)/)?.[1] || ''; + const pageNumber = Number(href.match(/\\/summary\\/[^/]+\\/[^/]+\\/(\\d+)/)?.[1] || '1') || 1; + const sort = href.match(/\\/summary\\/[^/]+\\/([^/]+)\\/\\d+/)?.[1] || 'relevance'; + const sid = (() => { + try { return JSON.parse(String(localStorage.getItem('wos_sid') || '""')) || ''; } catch { return ''; } + })(); + const searchState = (() => { + if (!qid) return null; + try { return JSON.parse(String(localStorage.getItem('wos_search_' + qid) || 'null')); } catch { return null; } + })(); + if (!qid || !sid) { + return { + streamText: '', + debug: { + href, + qid, + pageNumber, + sort, + sid, + hasSearchState: !!searchState, + searchMode: searchState?.mode || ${JSON.stringify(defaultMode)}, + product: ${JSON.stringify(toProduct(database))}, + reason: 'missing-qid-or-sid', + }, + }; + } + + const payload = { + qid, + retrieve: { + first: Math.max(1, ((pageNumber - 1) * ${MAX_LIMIT}) + 1), + sort, + count: ${MAX_LIMIT}, + jcr: true, + highlight: false, + analyzes: [], + }, + product: ${JSON.stringify(toProduct(database))}, + searchMode: searchState?.mode || ${JSON.stringify(defaultMode)}, + viewType: 'records', + }; + + const res = await fetch('/api/wosnx/core/runQueryGetRecordsStream?SID=' + encodeURIComponent(sid), { + method: 'POST', + credentials: 'include', + headers: { + accept: 'application/json, text/plain, */*', + 'content-type': 'application/json', + }, + body: JSON.stringify(payload), + }); + + const streamText = await res.text(); + return { + streamText, + debug: { + href, + qid, + pageNumber, + sort, + sid, + hasSearchState: !!searchState, + searchMode: searchState?.mode || ${JSON.stringify(defaultMode)}, + product: ${JSON.stringify(toProduct(database))}, + responseOk: res.ok, + responseStatus: res.status, + textSnippet: String(streamText || 
'').slice(0, 500), + }, + }; + })()`); + } + await page.wait(6); + + let first = await fetchOnce(); + let records = extractRecords(parseWosEventStream(String(first?.streamText || ''))); + if (!records.length) { + await page.wait(4); + const second = await fetchOnce(); + records = extractRecords(parseWosEventStream(String(second?.streamText || ''))); + if (!records.length && process.env.OPENCLI_WOS_DEBUG_SUMMARY === '1') { + throw new CommandExecutionError(`Web of Science summary stream returned no records: ${JSON.stringify({ + first: first?.debug || {}, + second: second?.debug || {}, + })}`); + } + } + + return records; +} + +export async function fetchSummaryStreamRecords( + page: { + goto: (url: string, options?: Record) => Promise; + wait: (seconds: number) => Promise; + evaluate: (js: string) => Promise; + tabs?: () => Promise; + selectTab?: (index: number) => Promise; + }, + url: string, + database: WosDatabase, + limit: number, + defaultMode: string, +): Promise { + const targetPath = new URL(url).pathname; + const summaryMarker = '/summary/'; + await page.goto(url, { settleMs: 5000 }); + if (typeof page.tabs === 'function' && typeof page.selectTab === 'function') { + try { + const tabs = await page.tabs(); + const matching = Array.isArray(tabs) + ? tabs.find(tab => { + const href = String(tab?.url || ''); + return typeof tab?.index === 'number' && (href.includes(targetPath) || href.includes(summaryMarker)); + }) + : undefined; + if (matching && typeof matching.index === 'number') { + await page.selectTab(matching.index); + } + } catch { + // Best-effort: stay on current tab if tab discovery fails. + } + } + try { + const href = String(await page.evaluate(`(() => String(location.href || ''))()` ) || ''); + if (!href.includes(summaryMarker)) { + await page.evaluate(`(() => { location.href = ${JSON.stringify(url)}; return true; })()`); + await page.wait(6); + } + } catch { + // Ignore navigation verification failures and let fetch diagnostics handle it. 
+ } + return fetchCurrentSummaryStreamRecords(page, database, limit, defaultMode); +} + +export async function scrapeBodyTextAndLinks( + page: { + goto: (url: string, options?: Record) => Promise; + wait: (seconds: number) => Promise; + evaluate: (js: string) => Promise; + }, + url: string, +): Promise<{ bodyText: string; links: Array<{ label?: string; url?: string }> }> { + await page.goto(url, { settleMs: 5000 }); + const readOnce = () => page.evaluate(`(() => { + const normalize = (text) => String(text || '').replace(/\\u00a0/g, ' ').replace(/\\s+/g, ' ').trim(); + const links = Array.from(document.querySelectorAll('a')) + .map((el) => ({ + label: normalize(el.textContent || el.getAttribute('aria-label') || ''), + url: String(el.href || '').trim(), + })) + .filter((item) => item.url); + return { + bodyText: String(document.body.innerText || '').replace(/\\u00a0/g, ' '), + links, + }; + })()`); + + for (let attempt = 0; attempt < 3; attempt++) { + await page.wait(2 + attempt); + const result = await readOnce(); + const bodyText = typeof result?.bodyText === 'string' ? result.bodyText : ''; + const links = Array.isArray(result?.links) ? 
result.links : []; + if (bodyText.trim()) { + return { bodyText, links }; + } + } + + return { bodyText: '', links: [] }; +} diff --git a/src/clis/webofscience/smart-search.test.ts b/src/clis/webofscience/smart-search.test.ts new file mode 100644 index 00000000..47eac858 --- /dev/null +++ b/src/clis/webofscience/smart-search.test.ts @@ -0,0 +1,329 @@ +import { describe, expect, it, vi } from 'vitest'; +import type { IPage } from '../../types.js'; +import { EmptyResultError } from '../../errors.js'; +import { getRegistry } from '../../registry.js'; +import './smart-search.js'; + +function createPageMock(evaluateResults: any[]): IPage { + const evaluate = vi.fn(); + for (const result of evaluateResults) { + evaluate.mockResolvedValueOnce(result); + } + + return { + goto: vi.fn().mockResolvedValue(undefined), + evaluate, + snapshot: vi.fn().mockResolvedValue(undefined), + click: vi.fn().mockResolvedValue(undefined), + typeText: vi.fn().mockResolvedValue(undefined), + pressKey: vi.fn().mockResolvedValue(undefined), + scrollTo: vi.fn().mockResolvedValue(undefined), + getFormState: vi.fn().mockResolvedValue({ forms: [], orphanFields: [] }), + wait: vi.fn().mockResolvedValue(undefined), + waitForCapture: vi.fn().mockResolvedValue(undefined), + tabs: vi.fn().mockResolvedValue([]), + closeTab: vi.fn().mockResolvedValue(undefined), + newTab: vi.fn().mockResolvedValue(undefined), + selectTab: vi.fn().mockResolvedValue(undefined), + networkRequests: vi.fn().mockResolvedValue([]), + consoleMessages: vi.fn().mockResolvedValue([]), + scroll: vi.fn().mockResolvedValue(undefined), + autoScroll: vi.fn().mockResolvedValue(undefined), + installInterceptor: vi.fn().mockResolvedValue(undefined), + getInterceptedRequests: vi.fn().mockResolvedValue([]), + getCookies: vi.fn().mockResolvedValue([]), + screenshot: vi.fn().mockResolvedValue(''), + }; +} + +describe('webofscience smart-search', () => { + it('describes natural-language queries and the default database in command help', () => { 
+ const cmd = getRegistry().get('webofscience/smart-search'); + const queryArg = cmd?.args.find(arg => arg.name === 'query'); + const databaseArg = cmd?.args.find(arg => arg.name === 'database'); + + expect(queryArg?.help).toContain('e.g.'); + expect(queryArg?.help).toContain('machine learning'); + expect(databaseArg?.help).toContain('Defaults to woscc'); + }); + + it('retries once when SID is missing, then maps records from runQuerySearch', async () => { + const cmd = getRegistry().get('webofscience/smart-search'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + { sid: null, href: 'https://webofscience.clarivate.cn/wos/woscc/smart-search' }, + { sid: 'SID123', href: 'https://webofscience.clarivate.cn/wos/woscc/summary/test/relevance/1' }, + [ + { + key: 'searchInfo', + payload: { + QueryID: 'QID123', + RecordsFound: 685661, + }, + }, + { + key: 'records', + payload: { + 1: { + ut: 'WOS:001335131500001', + doi: '10.1016/j.patter.2024.101046', + titles: { + item: { en: [{ title: 'Avoiding common machine learning pitfalls' }] }, + source: { en: [{ title: 'PATTERNS' }] }, + }, + names: { + author: { + en: [ + { first_name: 'Michael A.', last_name: 'Lones' }, + { wos_standard: 'Doe, J' }, + ], + }, + }, + pub_info: { pubyear: '2024' }, + citation_related: { counts: { WOSCC: 64 } }, + }, + 2: { + ut: 'WOS:001527924800002', + doi: '', + titles: { + item: { en: [{ title: 'Another machine learning paper' }] }, + source: { en: [{ title: 'JOURNAL OF TESTS' }] }, + }, + names: { + author: { + en: [{ wos_standard: 'Smith, A' }], + }, + }, + pub_info: { pubyear: '2025' }, + citation_related: { counts: { WOSCC: 7 } }, + }, + }, + }, + ], + ]); + + const result = await cmd!.func!(page, { query: 'machine learning', limit: 2 }); + + expect(page.goto).toHaveBeenCalledWith( + 'https://webofscience.clarivate.cn/wos/woscc/smart-search', + { settleMs: 4000 }, + ); + expect(page.typeText).toHaveBeenCalledWith('#composeQuerySmartSearch', 'machine 
learning'); + expect(page.click).toHaveBeenCalledTimes(2); + expect(result).toEqual([ + { + rank: 1, + title: 'Avoiding common machine learning pitfalls', + authors: 'Lones, Michael A.; Doe, J', + year: '2024', + source: 'PATTERNS', + citations: 64, + doi: '10.1016/j.patter.2024.101046', + url: 'https://webofscience.clarivate.cn/wos/woscc/full-record/WOS:001335131500001', + }, + { + rank: 2, + title: 'Another machine learning paper', + authors: 'Smith, A', + year: '2025', + source: 'JOURNAL OF TESTS', + citations: 7, + doi: '', + url: 'https://webofscience.clarivate.cn/wos/woscc/full-record/WOS:001527924800002', + }, + ]); + }); + + it('throws EmptyResultError when the records payload is empty', async () => { + const cmd = getRegistry().get('webofscience/smart-search'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + { sid: 'SID123', href: 'https://webofscience.clarivate.cn/wos/woscc/summary/test/relevance/1' }, + [ + { + key: 'searchInfo', + payload: { + QueryID: 'QID123', + RecordsFound: 0, + }, + }, + { + key: 'records', + payload: {}, + }, + ], + ]); + + await expect(cmd!.func!(page, { query: 'nohits', limit: 5 })).rejects.toThrow(EmptyResultError); + expect(page.click).toHaveBeenCalledTimes(1); + }); + + it('uses the ALLDB smart-search route and payload when database=alldb', async () => { + const cmd = getRegistry().get('webofscience/smart-search'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + { sid: 'SID999', href: 'https://webofscience.clarivate.cn/wos/alldb/summary/test/relevance/1' }, + [ + { + key: 'searchInfo', + payload: { + QueryID: 'QID999', + RecordsFound: 1, + }, + }, + { + key: 'records', + payload: { + 1: { + ut: 'WOS:009999999999999', + doi: '10.1000/alldb.1', + titles: { + item: { en: [{ title: 'All databases record' }] }, + source: { en: [{ title: 'MULTIDATABASE JOURNAL' }] }, + }, + names: { + author: { + en: [{ wos_standard: 'Zhang, S' }], + }, + }, + pub_info: { pubyear: 
'2026' }, + citation_related: { counts: { WOSCC: 3 } }, + }, + }, + }, + ], + ]); + + const result = await cmd!.func!(page, { query: 'quantum', database: 'alldb', limit: 1 }); + + expect(page.goto).toHaveBeenCalledWith( + 'https://webofscience.clarivate.cn/wos/alldb/smart-search', + { settleMs: 4000 }, + ); + + const runQuerySearchJs = vi.mocked(page.evaluate).mock.calls[1]?.[0]; + expect(runQuerySearchJs).toContain('/api/wosnx/core/runQuerySearch?SID='); + expect(runQuerySearchJs).toContain('"product":"ALLDB"'); + expect(runQuerySearchJs).toContain('"database":"ALLDB"'); + + expect(result).toEqual([ + { + rank: 1, + title: 'All databases record', + authors: 'Zhang, S', + year: '2026', + source: 'MULTIDATABASE JOURNAL', + citations: 3, + doi: '10.1000/alldb.1', + url: 'https://webofscience.clarivate.cn/wos/alldb/full-record/WOS:009999999999999', + }, + ]); + }); + + it('skips null authors when formatting records', async () => { + const cmd = getRegistry().get('webofscience/smart-search'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + { sid: 'SID321', href: 'https://webofscience.clarivate.cn/wos/woscc/summary/test/relevance/1' }, + [ + { + key: 'records', + payload: { + 1: { + ut: 'WOS:001', + titles: { + item: { en: [{ title: 'Null author test' }] }, + }, + names: { + author: { + en: [null, { wos_standard: 'Doe, J' }, null], + }, + }, + }, + }, + }, + ], + ]); + + const result = await cmd!.func!(page, { query: 'test', limit: 1 }); + + expect(result).toEqual([ + { + rank: 1, + title: 'Null author test', + authors: 'Doe, J', + year: '', + source: '', + citations: 0, + doi: '', + url: 'https://webofscience.clarivate.cn/wos/woscc/full-record/WOS:001', + }, + ]); + }); + + it('falls back to Enter when the submit button is unavailable', async () => { + const cmd = getRegistry().get('webofscience/smart-search'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + null, + { sid: 'SID654', href: 
'https://webofscience.clarivate.cn/wos/woscc/summary/test/relevance/1' }, + [ + { + key: 'records', + payload: { + 1: { + ut: 'WOS:002', + titles: { + item: { en: [{ title: 'Fallback submit test' }] }, + }, + }, + }, + }, + ], + ]); + /* Only the first click rejects (mockRejectedValueOnce); the assertions below expect an Enter key press and the record still being returned. */ vi.mocked(page.click).mockRejectedValueOnce(new Error('Element not found')); + + const result = await cmd!.func!(page, { query: 'fallback', limit: 1 }) as Array<{ title: string }>; + + expect(page.pressKey).toHaveBeenCalledWith('Enter'); + expect(result[0]).toMatchObject({ title: 'Fallback submit test' }); + }); + + /* Only the first typeText call rejects; the test expects exactly two typeText calls in total and the mocked record still being returned. */ it('retries typing when the search input is not ready on first attempt', async () => { + const cmd = getRegistry().get('webofscience/smart-search'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + 'opencli-search-input', + { sid: 'SID765', href: 'https://webofscience.clarivate.cn/wos/woscc/summary/test/relevance/1' }, + [ + { + key: 'records', + payload: { + 1: { + ut: 'WOS:005', + titles: { + item: { en: [{ title: 'Retry input test' }] }, + }, + }, + }, + }, + ], + ]); + vi.mocked(page.typeText).mockRejectedValueOnce(new Error('Element not found')); + + const result = await cmd!.func!(page, { query: 'retry', limit: 1 }) as Array<{ title: string }>; + + expect(page.typeText).toHaveBeenCalledTimes(2); + expect(result[0]).toMatchObject({ title: 'Retry input test' }); + }); + + /* Guards against the old command name: the registry must have no entry under the webofscience/search key. */ it('does not keep the legacy search command registered', () => { + expect(getRegistry().get('webofscience/search')).toBeUndefined(); + }); +}); diff --git a/src/clis/webofscience/smart-search.ts b/src/clis/webofscience/smart-search.ts new file mode 100644 index 00000000..71c0a3ea --- /dev/null +++ b/src/clis/webofscience/smart-search.ts @@ -0,0 +1,73 @@ +import { cli, Strategy } from '../../registry.js'; +import { ArgumentError, EmptyResultError } from '../../errors.js'; +import { + buildSearchPayload, + clampLimit, + ensureSearchSession, + extractRecords, + firstTitle, + formatAuthors, + fullRecordUrl, +
normalizeDatabase, + smartSearchUrl, +} from './shared.js'; + +/* Registers the webofscience smart-search command. The handler trims the query and throws ArgumentError when it is empty, obtains a session id through ensureSearchSession, then runs a script in the page that POSTs the search payload as JSON to /api/wosnx/core/runQuerySearch with credentials included and returns the parsed JSON response. Output rows are built from extractRecords(events), and EmptyResultError is thrown when no titled row remains. NOTE(review): normalizeDatabase, clampLimit, ensureSearchSession and buildSearchPayload come from shared.js, outside this view, so their exact behavior should be confirmed there. */ cli({ + site: 'webofscience', + name: 'smart-search', + description: 'Search Web of Science via the Smart Search page', + domain: 'webofscience.clarivate.cn', + strategy: Strategy.UI, + browser: true, + navigateBefore: false, + args: [ + { name: 'query', positional: true, required: true, help: 'Natural-language or fielded query, e.g. machine learning or TS=(machine learning)' }, + { name: 'database', required: false, help: 'Database to search. Defaults to woscc.', choices: ['woscc', 'alldb'] }, + { name: 'limit', type: 'int', default: 10, help: 'Max results (max 50)' }, + ], + columns: ['rank', 'title', 'authors', 'year', 'source', 'citations', 'doi', 'url'], + func: async (page, kwargs) => { + const query = String(kwargs.query ?? '').trim(); + if (!query) { + throw new ArgumentError('Search query is required'); + } + + const database = normalizeDatabase(kwargs.database); + const limit = clampLimit(kwargs.limit); + /* NOTE(review): ensureSearchSession receives the page, database and query and its result is used as the SID query parameter below; presumably it drives the search page to obtain that id - confirm in shared.js. */ const sid = await ensureSearchSession(page, database, query); + const payload = buildSearchPayload(query, limit, database); + + /* The script below is evaluated in the page, so the relative API path and the included credentials apply to the page origin; payload and sid are embedded into the script text with JSON.stringify. The response status is not checked before res.json() is called. */ const events = await page.evaluate(`(async () => { + const payload = ${JSON.stringify(payload)}; + const res = await fetch('/api/wosnx/core/runQuerySearch?SID=' + encodeURIComponent(${JSON.stringify(sid)}), { + method: 'POST', + credentials: 'include', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify(payload) + }); + return res.json(); + })()`); + + /* Order matters: records are cut to limit first, rank is the position inside that slice, and rows without a title are removed afterwards, so returned ranks can have gaps and fewer than limit rows can come back. */ const records = extractRecords(events) + .slice(0, limit) + .map((record, index) => ({ + rank: index + 1, + title: firstTitle(record, 'item'), + authors: formatAuthors(record), + year: record.pub_info?.pubyear ?? '', + source: firstTitle(record, 'source'), + /* NOTE(review): the WOSCC count is read for every database, including alldb - confirm that is intended. */ citations: record.citation_related?.counts?.WOSCC ?? 0, + doi: record.doi ?? '', + url: record.ut ?
fullRecordUrl(database, record.ut) : '', + })) + /* Keep only rows whose title is truthy. */ .filter(record => record.title); + + if (!records.length) { + throw new EmptyResultError('webofscience smart-search', 'Try a different keyword or verify your Web of Science access in Chrome'); + } + + return records; + }, +}); + +/* Re-exports the smartSearchUrl helper imported from shared.js. */ export { smartSearchUrl }; diff --git a/tests/e2e/management.test.ts b/tests/e2e/management.test.ts index 3e3ab19f..1aa215ec 100644 --- a/tests/e2e/management.test.ts +++ b/tests/e2e/management.test.ts @@ -31,9 +31,29 @@ describe('management commands E2E', () => { expect(stdout).toContain('hackernews'); expect(stdout).toContain('bilibili'); expect(stdout).toContain('twitter'); + expect(stdout).toContain('webofscience'); expect(stdout).toContain('commands across'); }); + /* Filters the JSON command list down to webofscience entries and requires all seven adapter commands to be present; arrayContaining does not constrain order and allows extra entries. */ it('list includes the Web of Science adapter commands', async () => { + const { stdout, code } = await runCli(['list', '-f', 'json']); + expect(code).toBe(0); + const data = parseJsonOutput(stdout); + const webofscience = data.filter((entry: any) => entry.site === 'webofscience'); + + expect(webofscience).toEqual( + expect.arrayContaining([ + expect.objectContaining({ site: 'webofscience', name: 'smart-search' }), + expect.objectContaining({ site: 'webofscience', name: 'basic-search' }), + expect.objectContaining({ site: 'webofscience', name: 'author-search' }), + expect.objectContaining({ site: 'webofscience', name: 'author-record' }), + expect.objectContaining({ site: 'webofscience', name: 'references' }), + expect.objectContaining({ site: 'webofscience', name: 'citing-articles' }), + expect.objectContaining({ site: 'webofscience', name: 'record' }), + ]), + ); + }); + it('list -f yaml produces valid yaml', async () => { const { stdout, code } = await runCli(['list', '-f', 'yaml']); expect(code).toBe(0); diff --git a/tests/smoke/api-health.test.ts b/tests/smoke/api-health.test.ts index 59bf7ffe..3b0625ea 100644 --- a/tests/smoke/api-health.test.ts +++ b/tests/smoke/api-health.test.ts @@ -124,6 +124,7 @@ describe('API health 
smoke tests', () => { 'coupang', 'xiaohongshu', 'yahoo-finance', + 'webofscience', ]) { expect(sites.has(expected)).toBe(true); }