diff --git a/.changeset/jsonl-ingestion.md b/.changeset/jsonl-ingestion.md new file mode 100644 index 0000000..b92ce8e --- /dev/null +++ b/.changeset/jsonl-ingestion.md @@ -0,0 +1,17 @@ +--- +"@jspsych/metadata": patch +"@jspsych/metadata-cli": patch +"frontend": patch +--- + +Accept JSON-Lines (JSONL) experiment data, not just a single JSON array. Several jsPsych labs — and JATOS exports — write data as newline-delimited JSON, with one JSON value per line (typically one participant's full trial array per line) rather than one big array. Previously `generate()` ran `JSON.parse` on the whole string, so every such file failed with `Unexpected non-whitespace character after JSON` and produced no metadata. + +A new exported `parseJsonData` helper handles both shapes: a well-formed single document is returned unchanged (no behaviour change for existing single-array callers), and only when whole-string parsing fails does it fall back to parsing line by line, flattening any per-line arrays into one observation stream. It is now used wherever JSON data files are parsed: + +- `generate()` (the library) for the main ingestion path. +- the CLI's data-file reader, join-key pre-pass, and CSV-conversion path. +- the frontend's join-key pre-flight and Psych-DS file builder. + +The `.jsonl` file extension is now also recognised as a JSON data file (these exports are conventionally named `.jsonl`). The CLI processes `.jsonl` exactly like `.json` — including filename-normalization, raw-original preservation, and CSV conversion — and the frontend normalises a `.jsonl` upload to the JSON path. + +Verified end to end against the raw `.jsonl` exports in `vucml/online_experiments`: all 15 files now generate metadata and pass the Psych-DS validator with zero errors (they failed at parse time before). 
diff --git a/.changeset/jsonl-source-record-id.md b/.changeset/jsonl-source-record-id.md new file mode 100644 index 0000000..abc77b3 --- /dev/null +++ b/.changeset/jsonl-source-record-id.md @@ -0,0 +1,15 @@ +--- +"@jspsych/metadata": patch +"@jspsych/metadata-cli": patch +"frontend": patch +--- + +Synthesize a `source_record_id` join key for multi-record JSON-Lines exports. Raw jsPsych exports carry no per-row identifier, so once JSONL is flattened (one record per line) `trial_index` repeats across records and can't uniquely key the extracted array/object sidecar CSVs — every record's trial 0 collapsed onto the same `(trial_index, element_index)` key, making the sidecars impossible to join back to a single parent trial. + +The synthesized column is named `source_record_id` rather than `participant_id` because a JSON-Lines line is only guaranteed to be one *source record* — usually, but not always, one participant. The honest name avoids overclaiming for exports where a line isn't a single subject. + +`parseJsonData` now takes an opt-in `{ tagSourceRecordId }` flag: in the JSON-Lines path it stamps each line's object rows with a 0-based `source_record_id` (a no-op on the single-array fast path), and reports via an optional `stats` out-param whether it actually synthesized the id. A row that already carries a `source_record_id` or a real `participant_id` is left untouched (the check is per row, not per line) — the experiment's own identifier already groups those rows. `generate()` enables this for JSON input and promotes the identifier to the leading join key, preferring the synthesized `source_record_id` and falling back to a real `participant_id` already present in the export (`['source_record_id', 'trial_index']` or `['participant_id', 'trial_index']`), so the sidecars join unambiguously. CSV inputs are unaffected. + +When — and only when — the id was actually synthesized (i.e. 
absent from the source), it is given an explicit description that makes its synthetic origin unmistakable ("Synthetic source-record identifier … NOT a real subject ID from the experiment …") so a downstream user can't mistake it for a real subject ID; this also avoids serializing an empty `{}` description (an object with no `@type`, which trips the validator's `OBJECT_TYPE_MISSING`). The CLI's join-key pre-analysis/prompt and the frontend's pre-flight mirror this promotion so multi-record JSONL is no longer falsely flagged as having a non-unique join key. + +Verified end to end against the raw `.jsonl` exports in `vucml/online_experiments` (`block_cat`): the combined 30-record export generates metadata, passes the Psych-DS validator (0 errors), synthesizes `source_record_id` 0–29, and writes sidecars whose `(source_record_id, trial_index, element_index)` keys are fully unique — including the doubly-nested `recall_responses` case. Notably `subjectId` collides across the two merged datasets (two records share `601`), which `source_record_id` correctly keeps distinct. diff --git a/packages/cli/src/data.ts b/packages/cli/src/data.ts index f10460f..0b557b7 100644 --- a/packages/cli/src/data.ts +++ b/packages/cli/src/data.ts @@ -1,9 +1,17 @@ import fs from "fs"; import path from "path"; -import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, objectsToCSV, isValidPsychDSDataFilename, buildPsychDSDataFiles, stripUnnamedColumns, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "@jspsych/metadata"; +import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, parseJsonData, objectsToCSV, isValidPsychDSDataFilename, buildPsychDSDataFiles, stripUnnamedColumns, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "@jspsych/metadata"; import { expandHomeDir, disambiguateFilename, fileStem } from "./utils"; import { PlannedFile } from "./rename"; +/** + * JSON-family data extensions. 
`.jsonl` (JSON-Lines) is treated exactly like `.json`: + * parseJsonData() accepts both a single array and one-JSON-value-per-line, so a `.jsonl` + * file flows through the same code path and generate('json') call as a `.json` file. + */ +export const isJsonDataExt = (ext: string): boolean => ext === '.json' || ext === '.jsonl'; +export const isDataExt = (ext: string): boolean => isJsonDataExt(ext) || ext === '.csv'; + /** * Thrown when the data a file produces doesn't match the output-name plan the user approved * (a column appears/disappears, or an approved name is already taken). Distinct from an @@ -93,37 +101,56 @@ async function collectDataFiles( */ export async function preAnalyzeDirectory( directoryPath: string, - initialKeys: string[] = ['trial_index'] -): Promise<{ parsedData: Array>; analysis: JoinKeyAnalysis; fileName: string } | null> { + initialKeys: string[] = ['trial_index'], + // Optional out-param: set to true if any JSON-Lines file gets a synthesized source_record_id. + // Surfaced this way (rather than via the return value) so the existing return contract — and + // its "no problem found → null" callers — stays unchanged. 
+ outStats?: { synthesizedSourceRecordId?: boolean } +): Promise<{ parsedData: Array>; analysis: JoinKeyAnalysis; fileName: string; keys: string[] } | null> { directoryPath = expandHomeDir(directoryPath); const collected = await collectDataFiles(directoryPath); if (!collected) return null; const { files: filePaths } = collected; - let worst: { parsedData: Array>; analysis: JoinKeyAnalysis; fileName: string } | null = null; + let worst: { parsedData: Array>; analysis: JoinKeyAnalysis; fileName: string; keys: string[] } | null = null; for (const { filePath, name } of filePaths) { if (name === 'dataset_description.json') continue; const ext = path.extname(name).toLowerCase(); - if (ext !== '.json' && ext !== '.csv') continue; + if (!isDataExt(ext)) continue; try { const content = await fs.promises.readFile(filePath, 'utf8'); let parsedData: Array>; - if (ext === '.json') { - const raw = JSON.parse(content); + if (isJsonDataExt(ext)) { + // Tag a per-line source_record_id for JSON-Lines (a no-op for a single array) so the + // analysis below sees the same rows generate() will. + const stats: { synthesizedSourceRecordId?: boolean } = {}; + const raw = parseJsonData(content, { tagSourceRecordId: true }, stats); if (!Array.isArray(raw)) continue; + if (stats.synthesizedSourceRecordId && outStats) outStats.synthesizedSourceRecordId = true; parsedData = raw as Array>; } else { parsedData = (await parseCSV(content)) as Array>; } - const analysis = analyzeJoinKeys(parsedData, initialKeys); + // Mirror generate()'s join-key promotion so the prompt is built from the keys generate() + // will actually use: an identifier column — source_record_id synthesized from JSON-Lines, + // else a real participant_id already in the export — becomes the leading join key, and the + // uniqueness check accounts for it. + const idColumn = isJsonDataExt(ext) + ? 
(['source_record_id', 'participant_id'] as const).find((col) => + !initialKeys.includes(col) && + parsedData.some((row) => row && typeof row === 'object' && col in row)) + : undefined; + const keys = idColumn ? [idColumn, ...initialKeys] : initialKeys; + + const analysis = analyzeJoinKeys(parsedData, keys); if (!analysis.isUnique && (worst === null || analysis.duplicateCount > worst.analysis.duplicateCount)) { - worst = { parsedData, analysis, fileName: name }; + worst = { parsedData, analysis, fileName: name, keys }; } } catch { continue; @@ -247,7 +274,7 @@ export async function analyzeOutputColumns( for (const { filePath, name } of files) { const ext = path.extname(name).toLowerCase(); - if (ext !== '.json' && ext !== '.csv') continue; + if (!isDataExt(ext)) continue; try { const content = await fs.promises.readFile(filePath, 'utf8'); @@ -255,8 +282,8 @@ export async function analyzeOutputColumns( metadata.loadMetadata(content); continue; } - if (ext === '.json') { - if (!Array.isArray(JSON.parse(content))) continue; // non-array JSON is skipped by the writer too + if (isJsonDataExt(ext)) { + if (!Array.isArray(parseJsonData(content, { tagSourceRecordId: true }))) continue; // non-array JSON is skipped by the writer too await metadata.generate(content, {}, 'json', options); } else { await metadata.generate(content, {}, 'csv', options); @@ -317,6 +344,7 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil switch (fileExtension){ case '.json': + case '.jsonl': if (file === "dataset_description.json") metadata.loadMetadata(content); // need to remove this for the files that are being called with the CLI else await metadata.generate(content, {}, 'json', options); break; @@ -324,7 +352,7 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil await metadata.generate(content, {}, 'csv', options); break; default: - console.error(`"${file}" is not .csv or .json format.`); + console.error(`"${file}" is not .csv, 
.json, or .jsonl format.`); return false; } @@ -341,8 +369,10 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil // skipped before it reserves an output name — otherwise it would needlessly disambiguate // a later valid file that maps to the same base. let parsed: Array> | null = null; - if (fileExtension === '.json') { - const json = JSON.parse(content); + if (isJsonDataExt(fileExtension)) { + // Tag a per-line source_record_id for JSON-Lines (a no-op for a single array) so the main + // CSV carries the same join-key column generate() promotes for the sidecars. + const json = parseJsonData(content, { tagSourceRecordId: true }); if (!Array.isArray(json)) { console.error(`"${file}" is not a JSON array of jsPsych trials; skipping CSV conversion.`); return false; diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts index 4111211..c89491c 100644 --- a/packages/cli/src/index.ts +++ b/packages/cli/src/index.ts @@ -6,7 +6,7 @@ import { input, select, checkbox, Separator } from '@inquirer/prompts'; import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, isValidPsychDSDataFilename, toPsychDSValue } from "@jspsych/metadata"; import fs from 'fs'; import path from 'path'; -import { processDirectory, processOptions, saveTextToPath, loadMetadata, preAnalyzeDirectory, resolveJoinKeysNonInteractive, enumerateDataFiles, analyzeOutputColumns, OutputColumns } from "./data"; +import { processDirectory, processOptions, saveTextToPath, loadMetadata, preAnalyzeDirectory, resolveJoinKeysNonInteractive, enumerateDataFiles, analyzeOutputColumns, OutputColumns, isDataExt } from "./data"; import { validateDirectory, validateJson, validatePsychDS } from './validatefunctions'; import { createDirectoryWithStructure } from './handlefiles'; import { fileStem } from './utils'; @@ -539,7 +539,7 @@ async function resolveFilenameNormalization( for (const { filePath, name } of files) { if (name === 'dataset_description.json') continue; const 
ext = path.extname(name).toLowerCase(); - if (ext !== '.json' && ext !== '.csv') continue; + if (!isDataExt(ext)) continue; const stem = fileStem(name); if (!isValidPsychDSDataFilename(`${stem}_data.csv`)) { @@ -730,22 +730,34 @@ const main = async () => { const canPrompt = !isNonInteractive && !!process.stdin.isTTY && !!process.stdout.isTTY; const { bases: normalizedBases, plan: renamePlan } = await resolveFilenameNormalization(dataDir, canPrompt, outputColumns); - // Pre-flight: check whether default join key (trial_index) is unique. If not, prompt the user - // when we have a terminal; otherwise (fully-flagged headless run) resolve deterministically so - // the run never blocks on an interactive prompt it can't answer. + // Pre-flight: check whether the join key is unique. preAnalyzeDirectory mirrors generate()'s + // source_record_id promotion, so preResult.keys is the effective key set (e.g. + // ['source_record_id', 'trial_index'] for multi-record JSON-Lines) — use it as the basis for + // resolution. If not unique, prompt the user when we have a terminal; otherwise (headless run) + // resolve deterministically so the run never blocks on an interactive prompt it can't answer. const initialKeys = ['trial_index']; - const preResult = await preAnalyzeDirectory(dataDir, initialKeys); - let arrayJoinKeys = initialKeys; + const preStats: { synthesizedSourceRecordId?: boolean } = {}; + const preResult = await preAnalyzeDirectory(dataDir, initialKeys, preStats); + let arrayJoinKeys = preResult?.keys ?? 
initialKeys; if (preResult && !preResult.analysis.isUnique) { if (canPrompt) { - arrayJoinKeys = await promptJoinKeys(preResult.parsedData, preResult.analysis, initialKeys, preResult.fileName); + arrayJoinKeys = await promptJoinKeys(preResult.parsedData, preResult.analysis, preResult.keys, preResult.fileName); } else { - const resolved = resolveJoinKeysNonInteractive(preResult.analysis, initialKeys, preResult.fileName); + const resolved = resolveJoinKeysNonInteractive(preResult.analysis, preResult.keys, preResult.fileName); arrayJoinKeys = resolved.keys; (resolved.unresolved ? console.warn : console.log)(`${resolved.unresolved ? '⚠' : 'ℹ'} ${resolved.message}`); } } + // Tell the user when we add the synthetic identifier, so the extra column in their output + // isn't a surprise. Only fires for JSON-Lines input that carried no id of its own. + if (preStats.synthesizedSourceRecordId) { + console.log( + 'Detected JSON-Lines input; added synthetic source_record_id to preserve ' + + 'source-record boundaries for extracted nested data.' + ); + } + // The pre-flight prompt above already surfaced any join-key uniqueness issue to the // user, so suppress the library's per-file warning to avoid repeating it. await processDirectory(metadata, dataDir, verbose, `${project_path}/data`, { arrayJoinKeys, suppressJoinKeyWarning: true, normalizedBases, renamePlan: renamePlan ?? undefined }); diff --git a/packages/cli/tests/data.test.ts b/packages/cli/tests/data.test.ts index f0ffac1..580a760 100644 --- a/packages/cli/tests/data.test.ts +++ b/packages/cli/tests/data.test.ts @@ -113,6 +113,23 @@ describe("processDirectory", () => { expect(failed).toBe(0); }); + test("processes a JSON-Lines (.jsonl) file with one participant array per line", async () => { + // JATOS-style export: each line is a full participant array, not one big array. 
+ const p1 = JSON.stringify([{ trial_type: "html-keyboard-response", trial_index: 0, rt: 450 }]); + const p2 = JSON.stringify([{ trial_type: "html-keyboard-response", trial_index: 0, rt: 512 }]); + fs.writeFileSync(path.join(tmpDir, "raw.jsonl"), `${p1}\n${p2}\n`); + + const metadata = new JsPsychMetadata(); + const { total, failed } = await processDirectory(metadata, tmpDir); + + expect(total).toBe(1); + expect(failed).toBe(0); + // rows from both lines were ingested (rt spans both participants). + const rt = metadata.getVariable("rt") as any; + expect(rt.minValue).toBe(450); + expect(rt.maxValue).toBe(512); + }); + test("counts unsupported file types as failed", async () => { fs.writeFileSync(path.join(tmpDir, "notes.txt"), "just a text file"); @@ -216,6 +233,29 @@ describe("preAnalyzeDirectory", () => { expect(result!.analysis.isUnique).toBe(false); }); + test("reports a synthesized source_record_id via the out-param for JSON-Lines input", async () => { + // JSON-Lines (one array per line) with no id column → source_record_id is synthesized. 
+ fs.writeFileSync( + path.join(tmpDir, "jsonl.jsonl"), + `[{"trial_index":0},{"trial_index":1}]\n[{"trial_index":0}]` + ); + + const stats: { synthesizedSourceRecordId?: boolean } = {}; + await preAnalyzeDirectory(tmpDir, ["trial_index"], stats); + expect(stats.synthesizedSourceRecordId).toBe(true); + }); + + test("does not report a synthesized source_record_id for a single JSON array", async () => { + fs.writeFileSync( + path.join(tmpDir, "single.json"), + JSON.stringify([{ trial_index: 0 }, { trial_index: 1 }]) + ); + + const stats: { synthesizedSourceRecordId?: boolean } = {}; + await preAnalyzeDirectory(tmpDir, ["trial_index"], stats); + expect(stats.synthesizedSourceRecordId).toBeUndefined(); + }); + test("parses CSV data files as well as JSON", async () => { fs.writeFileSync(path.join(tmpDir, "dupes.csv"), "trial_index\n0\n0"); diff --git a/packages/frontend/src/pages/DataUpload.tsx b/packages/frontend/src/pages/DataUpload.tsx index 85439ec..070c925 100644 --- a/packages/frontend/src/pages/DataUpload.tsx +++ b/packages/frontend/src/pages/DataUpload.tsx @@ -1,6 +1,6 @@ import { useState, useRef, useEffect } from 'react'; import JSZip from 'jszip'; -import JsPsychMetadata, { analyzeJoinKeys, deriveFallbackBase, buildPsychDSDataFiles, isValidPsychDSDataFilename, parseCSV, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from '@jspsych/metadata'; +import JsPsychMetadata, { analyzeJoinKeys, deriveFallbackBase, buildPsychDSDataFiles, isValidPsychDSDataFilename, parseCSV, parseJsonData, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from '@jspsych/metadata'; import PageHeader from '../components/PageHeader'; import styles from './DataUpload.module.css'; @@ -173,7 +173,10 @@ const DataUpload: React.FC = ({ const textMap = new Map(); for (const file of files) { - const type = file.name.split('.').pop()?.toLowerCase() || ''; + const rawExt = file.name.split('.').pop()?.toLowerCase() || ''; + // Treat JSON-Lines as JSON: parseJsonData() accepts both a single array 
and one + // JSON value per line, so .jsonl flows through the same path as .json downstream. + const type = rawExt === 'jsonl' ? 'json' : rawExt; const content = await readFileAsText(file); textMap.set(file.webkitRelativePath || file.name, { content, type }); } @@ -187,9 +190,16 @@ const DataUpload: React.FC = ({ if (type !== 'json') continue; if (name === 'dataset_description.json' || name.endsWith('/dataset_description.json')) continue; try { - const parsed = JSON.parse(content); + // Tag a per-line source_record_id for JSON-Lines (a no-op for a single array). + const parsed = parseJsonData(content, { tagSourceRecordId: true }); if (!Array.isArray(parsed) || parsed.length === 0) continue; - const analysis = analyzeJoinKeys(parsed, ['trial_index']); + // Mirror generate()'s join-key promotion so a multi-record JSON-Lines file isn't wrongly + // flagged: trial_index alone repeats across records, but the identifier column (a + // synthesized source_record_id, else a real participant_id) makes (id, trial_index) unique. + const idColumn = (['source_record_id', 'participant_id'] as const).find((col) => + parsed.some((row: any) => row && typeof row === 'object' && col in row)); + const keys = idColumn ? [idColumn, 'trial_index'] : ['trial_index']; + const analysis = analyzeJoinKeys(parsed, keys); if (!analysis.isUnique) { setJoinKeyProblemFile(name); setJoinKeyCandidates(analysis.candidates); @@ -268,7 +278,9 @@ const DataUpload: React.FC = ({ let mainRows: Array> = []; let mainContent: string | undefined; if (type === 'json') { - const json = JSON.parse(content); + // Tag a per-line source_record_id for JSON-Lines (a no-op for a single array) so the + // main CSV carries the same join-key column generate() promotes for the sidecars. 
+ const json = parseJsonData(content, { tagSourceRecordId: true }); if (!Array.isArray(json)) { update(i, { status: 'skipped', detail: 'not a jsPsych trial array' }); continue; diff --git a/packages/frontend/tests/dataUploadConversion.test.ts b/packages/frontend/tests/dataUploadConversion.test.ts index 23f8771..b20adb5 100644 --- a/packages/frontend/tests/dataUploadConversion.test.ts +++ b/packages/frontend/tests/dataUploadConversion.test.ts @@ -1,4 +1,4 @@ -import { parseCSV, buildPsychDSDataFiles, deriveFallbackBase } from "@jspsych/metadata"; +import { parseCSV, parseJsonData, buildPsychDSDataFiles, deriveFallbackBase } from "@jspsych/metadata"; // Mirrors the CSV branch of DataUpload.runGenerate: parse the uploaded CSV into mainRows and // hand it to the shared builder. Guards the frontend wiring (the parseCSV call + builder usage) @@ -28,3 +28,26 @@ describe("frontend CSV → Psych-DS conversion (the runGenerate path)", () => { expect(built.find((f) => f.kind === "main")!.content).toBe(content); }); }); + +// Mirrors the JSON branch of DataUpload.runGenerate for a .jsonl upload: the file's `type` +// is normalised to 'json', then parseJsonData flattens the per-line participant arrays into +// mainRows before the shared builder serialises them to one converted data/*.csv. 
+describe("frontend JSON-Lines → Psych-DS conversion (the runGenerate path)", () => { + it("flattens a .jsonl export (one participant array per line) into one main CSV", () => { + const p1 = JSON.stringify([{ trial_type: "html-keyboard-response", rt: 450 }]); + const p2 = JSON.stringify([{ trial_type: "html-keyboard-response", rt: 512 }]); + const content = `${p1}\n${p2}\n`; + + const mainRows = parseJsonData(content) as Array>; + expect(mainRows).toHaveLength(2); + + const built = buildPsychDSDataFiles({ base: deriveFallbackBase("raw"), mainRows }); + const main = built.find((f) => f.kind === "main")!; + const lines = main.content.split(/\r?\n/).filter(Boolean); + expect(lines[0].split(",")).toEqual(["trial_type", "rt"]); + // Both participant lines became data rows. + expect(lines).toHaveLength(3); + expect(main.content).toContain("450"); + expect(main.content).toContain("512"); + }); +}); diff --git a/packages/metadata/src/index.ts b/packages/metadata/src/index.ts index 0d64edd..a1e17ad 100644 --- a/packages/metadata/src/index.ts +++ b/packages/metadata/src/index.ts @@ -1,6 +1,6 @@ import { AuthorFields, AuthorsMap } from "./AuthorsMap"; import { PluginCache } from "./PluginCache"; -import { saveTextToFile, parseCSV, tryParseJSON, analyzeJoinKeys, JoinKeyAnalysis, SYSTEM_COLUMNS, stripUnnamedColumns } from "./utils"; +import { saveTextToFile, parseCSV, tryParseJSON, parseJsonData, analyzeJoinKeys, JoinKeyAnalysis, SYSTEM_COLUMNS, stripUnnamedColumns } from "./utils"; import { VariableFields, VariablesMap } from "./VariablesMap"; /** @@ -438,8 +438,18 @@ export default class JsPsychMetadata { parsed_data = await parseCSV(data); } + let synthesizedSourceRecordId = false; if (ext === 'json') { - parsed_data = JSON.parse(data); + // Accepts both a single JSON array (standard jsPsych export) and JSON-Lines, + // where each line is its own JSON value (JATOS exports one participant array per line). 
+ // Tag JSON-Lines rows with a per-line source_record_id: raw jsPsych exports carry no + // per-row identifier, so in multi-record JSONL trial_index alone repeats across records + // and can't uniquely key the extracted sidecar CSVs. The stat records whether we actually + // invented the id (vs. the data already carrying one), so we only describe it as + // synthetic when it truly is. + const parseStats: { synthesizedSourceRecordId?: boolean } = {}; + parsed_data = parseJsonData(data, { tagSourceRecordId: true }, parseStats); + synthesizedSourceRecordId = parseStats.synthesizedSourceRecordId === true; } if (!Array.isArray(parsed_data)) { @@ -459,6 +469,23 @@ export default class JsPsychMetadata { ); } + // When JSON rows carry an identifier column, promote it to the leading join key (unless the + // caller already listed it). Prefer source_record_id (synthesized per line from JSON-Lines + // above) and otherwise fall back to a real participant_id already present in the export. Raw + // jsPsych exports otherwise have no per-row identifier, so trial_index alone repeats across + // records and can't uniquely key the extracted sidecar CSVs; (id, trial_index, …) restores a + // one-trial-per-key join. CSV inputs are left untouched, preserving existing behaviour for + // tabular sources. + const rows = parsed_data as Array>; + const hasColumn = (col: string) => + ext === 'json' && rows.some((row) => row && typeof row === 'object' && col in row); + const idColumn = hasColumn('source_record_id') ? 'source_record_id' + : hasColumn('participant_id') ? 'participant_id' + : undefined; + if (idColumn && !this.arrayJoinKeys.includes(idColumn)) { + this.arrayJoinKeys = [idColumn, ...this.arrayJoinKeys]; + } + // Callers that already surface join-key uniqueness to the user (e.g. the CLI's // interactive pre-analysis prompt) can suppress this warning to avoid repeating it // once per file. 
@@ -469,6 +496,22 @@ export default class JsPsychMetadata { await this.generateObservation(observation); } + // Only when WE synthesized source_record_id (it wasn't in the source) do we own its + // description. As an identifier/join-key column it isn't plugin-documented, so per-trial + // processing leaves it with only "unknown" plugin descriptions that getList() strips to an + // empty {} (an object with no @type → OBJECT_TYPE_MISSING). Give it one explicit + // description that makes its synthetic origin unmistakable, so a downstream user never + // mistakes it for a real subject ID. A pre-existing participant_id is left untouched — its + // meaning is the experiment's, not ours. Done before updateMetadata so a caller-supplied + // metadata override still wins. + if (synthesizedSourceRecordId && this.containsVariable('source_record_id')) { + const existing = this.getVariable('source_record_id') as VariableFields; + this.setVariable({ + ...existing, + description: { default: 'Synthetic source-record identifier (0-based), assigned one per source record (one JSON-Lines line, which is usually but not always one participant) because the raw data carried no identifier column. NOT a real subject ID from the experiment — it only orders/links records as they appeared in the source file, and serves as a join key connecting each trial to its extracted array/object rows.' }, + }); + } + await this.updateMetadata(metadata); } @@ -912,7 +955,7 @@ export default class JsPsychMetadata { // Declare the join-key columns this table carries that aren't known yet: element_index, plus // any ancestor element-index keys passed down from an enclosing array (qualified - // ".element_index"). Pre-existing keys (trial_index, participant_id, …) are already + // ".element_index"). Pre-existing keys (trial_index, source_record_id, …) are already // declared and are skipped. 
if (!this.containsVariable("element_index")) { this.setVariable({ @@ -1047,5 +1090,5 @@ export { AuthorFields, VariableFields } -export { analyzeJoinKeys, parseCSV, isValidPsychDSDataFilename, toPsychDSValue, deriveArrayFilename, objectsToCSV, disambiguateArrayFilename, deriveFallbackBase, buildPsychDSDataFiles, stripUnnamedColumns, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "./utils"; +export { analyzeJoinKeys, parseCSV, parseJsonData, isValidPsychDSDataFilename, toPsychDSValue, deriveArrayFilename, objectsToCSV, disambiguateArrayFilename, deriveFallbackBase, buildPsychDSDataFiles, stripUnnamedColumns, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "./utils"; export type { JoinKeyAnalysis, PsychDSDataFile, BuildPsychDSDataFilesArgs } from "./utils"; diff --git a/packages/metadata/src/utils.ts b/packages/metadata/src/utils.ts index 279cf15..5581395 100644 --- a/packages/metadata/src/utils.ts +++ b/packages/metadata/src/utils.ts @@ -78,6 +78,81 @@ export function tryParseJSON(value: string): any | null { } } +/** + * Parses experiment data that is either a single JSON document (the standard jsPsych + * export — one array of trials, possibly pretty-printed) or JSON-Lines: one JSON value + * per line, as JATOS and several labs export it (typically one participant's trial + * array per line). Returns a flat array of observations in both cases. + * + * A well-formed single document is returned as-is (arrays untouched, so existing + * single-array callers see no change). Only when whole-string parsing fails do we fall + * back to line-by-line parsing, flattening any per-line arrays into one observation + * stream. Throws a descriptive error when the input is neither valid JSON nor valid JSONL. + * + * When `tagSourceRecordId` is set, `stats.synthesizedSourceRecordId` is set to true iff a + * source_record_id was actually stamped onto at least one row (i.e. the data did not already + * carry a source_record_id or a real participant_id). 
Callers use this to describe the column + * honestly — a synthesized id marks the source record/line, not a real subject identifier, and + * must not be presented as one. + */ +export function parseJsonData( + content: string, + options: { tagSourceRecordId?: boolean } = {}, + stats?: { synthesizedSourceRecordId?: boolean } +): any { + // Fast path: a single, well-formed JSON document. Covers the standard single array + // (including pretty-printed/multi-line) with no behaviour change for existing callers. + // Note: tagSourceRecordId never applies here — a single document has no line boundaries + // to identify source records by, so its rows are returned untouched. + const whole = tryParseJSON(content); + if (whole !== null) return whole; + + // Fallback: JSON-Lines. Each non-empty line must be its own JSON value; per-line + // arrays are concatenated so a multi-participant export becomes one observation array. + const lines = content.split(/\r?\n/); + const out: any[] = []; + let parsedAny = false; + let recordIndex = 0; + for (let i = 0; i < lines.length; i++) { + const line = lines[i].trim(); + if (!line) continue; + let value; + try { + value = JSON.parse(line); + } catch { + throw new Error( + `Could not parse data as JSON or JSON-Lines: line ${i + 1} is not valid JSON.` + ); + } + parsedAny = true; + const observations = Array.isArray(value) ? value : [value]; + // In JSON-Lines each line is typically one participant's submission (JATOS-style export), + // but a line is only guaranteed to be one *source record* — the per-line boundary is the + // only identifier these raw jsPsych exports carry. So — when asked — stamp every object + // observation from this line with a 0-based source_record_id before that boundary is lost + // in the flattened stream. This lets nested array/object extraction form a unique + // (source_record_id, trial_index) join key. 
Rows that already carry a source_record_id or a + // real participant_id are left untouched (the experiment's own id already groups them); + // non-object lines (bare primitives) can't carry the tag. + if (options.tagSourceRecordId) { + for (const obs of observations) { + if (obs !== null && typeof obs === "object" && !Array.isArray(obs) && + !("source_record_id" in obs) && !("participant_id" in obs)) { + obs.source_record_id = recordIndex; + if (stats) stats.synthesizedSourceRecordId = true; + } + } + } + out.push(...observations); + recordIndex++; + } + + if (!parsedAny) { + throw new Error("Could not parse data: input is empty or not valid JSON/JSON-Lines."); + } + return out; +} + /** System columns excluded from join-key candidate detection; also used to initialise ignored_variables in JsPsychMetadata. */ export const SYSTEM_COLUMNS = new Set([ 'trial_type', 'trial_index', 'time_elapsed', 'extension_type', 'extension_version', diff --git a/packages/metadata/tests/jsonl-ingestion.test.ts b/packages/metadata/tests/jsonl-ingestion.test.ts new file mode 100644 index 0000000..59262a5 --- /dev/null +++ b/packages/metadata/tests/jsonl-ingestion.test.ts @@ -0,0 +1,176 @@ +import JsPsychMetadata from "../src/index"; +import { parseJsonData } from "../src/utils"; + +// JSON-Lines ingestion: several jsPsych labs (and JATOS) export experiment data as +// newline-delimited JSON — one JSON value per line, typically one participant's full +// trial array per line — rather than a single JSON array. generate() / parseJsonData +// must accept both forms and flatten JSONL into one observation stream. 
import JsPsychMetadata from "../src/index";
import { parseJsonData } from "../src/utils";

// JSON-Lines ingestion: several jsPsych labs (and JATOS) export experiment data as
// newline-delimited JSON — one JSON value per line, typically one participant's full
// trial array per line — rather than a single JSON array. generate() / parseJsonData
// must accept both forms and flatten JSONL into one observation stream.

describe("parseJsonData", () => {
  test("returns a standard single JSON array unchanged", () => {
    const rows = [{ a: 1 }, { a: 2 }];
    expect(parseJsonData(JSON.stringify(rows))).toEqual(rows);
  });

  test("parses a pretty-printed (multi-line) single array", () => {
    const rows = [{ a: 1 }, { a: 2 }];
    expect(parseJsonData(JSON.stringify(rows, null, 2))).toEqual(rows);
  });

  test("flattens JSON-Lines where each line is a participant array", () => {
    const p1 = [{ subject: 1, t: 0 }, { subject: 1, t: 1 }];
    const p2 = [{ subject: 2, t: 0 }];
    const jsonl = `${JSON.stringify(p1)}\n${JSON.stringify(p2)}\n`;
    expect(parseJsonData(jsonl)).toEqual([...p1, ...p2]);
  });

  test("handles JSON-Lines where each line is a single object", () => {
    const jsonl = `{"a":1}\n{"a":2}\n{"a":3}`;
    expect(parseJsonData(jsonl)).toEqual([{ a: 1 }, { a: 2 }, { a: 3 }]);
  });

  test("ignores blank lines (incl. CRLF) between records", () => {
    const jsonl = `[{"a":1}]\r\n\r\n[{"a":2}]\r\n`;
    expect(parseJsonData(jsonl)).toEqual([{ a: 1 }, { a: 2 }]);
  });

  test("throws a descriptive error for a malformed JSONL line", () => {
    const jsonl = `[{"a":1}]\nnot json\n[{"a":2}]`;
    expect(() => parseJsonData(jsonl)).toThrow(/line 2 is not valid JSON/);
  });

  test("throws for empty input", () => {
    expect(() => parseJsonData(" \n ")).toThrow(/empty or not valid/);
  });
});

describe("parseJsonData source_record_id tagging", () => {
  // Raw jsPsych exports carry no per-row identifier; for multi-record JSON-Lines the line
  // boundary is the only one available, so tagSourceRecordId stamps a 0-based source_record_id
  // per line (a line is usually one participant, but only guaranteed to be one source record).
  // This lets nested array/object extraction form a unique (source_record_id, trial_index) join key.

  test("tags a per-line source_record_id across JSON-Lines records", () => {
    const p1 = [{ trial_index: 0 }, { trial_index: 1 }];
    const p2 = [{ trial_index: 0 }];
    const jsonl = `${JSON.stringify(p1)}\n${JSON.stringify(p2)}`;
    expect(parseJsonData(jsonl, { tagSourceRecordId: true })).toEqual([
      { trial_index: 0, source_record_id: 0 },
      { trial_index: 1, source_record_id: 0 },
      { trial_index: 0, source_record_id: 1 },
    ]);
  });

  test("leaves a single JSON array untouched (no line boundaries to tag by)", () => {
    const rows = [{ trial_index: 0 }, { trial_index: 1 }];
    expect(parseJsonData(JSON.stringify(rows), { tagSourceRecordId: true })).toEqual(rows);
  });

  test("does not overwrite an existing source_record_id", () => {
    const jsonl = `[{"source_record_id":"R7","trial_index":0}]\n[{"trial_index":0}]`;
    expect(parseJsonData(jsonl, { tagSourceRecordId: true })).toEqual([
      { source_record_id: "R7", trial_index: 0 },
      { trial_index: 0, source_record_id: 1 },
    ]);
  });

  test("defers to a real participant_id: tags neither the row that has one", () => {
    // A line that already carries a real participant_id is left as-is (the experiment's id
    // already groups it); other lines still get a synthesized source_record_id.
    const jsonl = `[{"participant_id":"P7","trial_index":0}]\n[{"trial_index":0}]`;
    expect(parseJsonData(jsonl, { tagSourceRecordId: true })).toEqual([
      { participant_id: "P7", trial_index: 0 },
      { trial_index: 0, source_record_id: 1 },
    ]);
  });

  test("does not tag when the option is off (default)", () => {
    const jsonl = `[{"trial_index":0}]\n[{"trial_index":0}]`;
    expect(parseJsonData(jsonl)).toEqual([{ trial_index: 0 }, { trial_index: 0 }]);
  });

  test("reports a synthesized id through the stats out-param", () => {
    // Callers rely on this flag to decide whether the column must be described as synthetic.
    const stats: { synthesizedSourceRecordId?: boolean } = {};
    const jsonl = `[{"trial_index":0}]\n[{"trial_index":0}]`;
    parseJsonData(jsonl, { tagSourceRecordId: true }, stats);
    expect(stats.synthesizedSourceRecordId).toBe(true);
  });

  test("leaves the stats flag unset when every row already carries an id", () => {
    const stats: { synthesizedSourceRecordId?: boolean } = {};
    const jsonl = `[{"participant_id":"P1","trial_index":0}]\n[{"participant_id":"P2","trial_index":0}]`;
    parseJsonData(jsonl, { tagSourceRecordId: true }, stats);
    expect(stats.synthesizedSourceRecordId).toBeUndefined();
  });
});

describe("generate() ingests JSON-Lines end to end", () => {
  // Keep the real fetch so the mock installed below does not outlive each test.
  const realFetch = (global as any).fetch;

  beforeEach(() => {
    (global as any).fetch = jest.fn().mockResolvedValue({ text: () => Promise.resolve("") });
  });

  afterEach(() => {
    (global as any).fetch = realFetch;
  });

  test("builds variableMeasured from a multi-line (per-participant) JSONL export", async () => {
    const p1 = [
      { trial_type: "html-keyboard-response", trial_index: 0, rt: 500, subject: "a" },
      { trial_type: "html-keyboard-response", trial_index: 1, rt: 650, subject: "a" },
    ];
    const p2 = [
      { trial_type: "html-keyboard-response", trial_index: 0, rt: 720, subject: "b" },
    ];
    const jsonl = `${JSON.stringify(p1)}\n${JSON.stringify(p2)}`;

    const meta = new JsPsychMetadata();
    await meta.generate(jsonl, {}, "json");

    const names = meta.getMetadata().variableMeasured.map((v: any) => v.name);
    // A non-system column from the flattened rows is captured...
    expect(names).toContain("rt");
    expect(names).toContain("subject");
    // ...and rt's range spans rows drawn from both participant lines.
    const rt = meta.getVariable("rt") as any;
    expect(rt.minValue).toBe(500);
    expect(rt.maxValue).toBe(720);
  });

  test("synthesizes source_record_id so multi-record JSON-Lines sidecars join uniquely", async () => {
    // Both records restart trial_index at 0, so without a per-line identifier the two trial-0
    // view_history rows would collide on (trial_index, element_index).
    const p1 = [{ trial_type: "html-keyboard-response", trial_index: 0, view_history: [{ page: 0 }, { page: 1 }] }];
    const p2 = [{ trial_type: "html-keyboard-response", trial_index: 0, view_history: [{ page: 0 }] }];
    const jsonl = `${JSON.stringify(p1)}\n${JSON.stringify(p2)}`;

    const meta = new JsPsychMetadata();
    await meta.generate(jsonl, {}, "json");

    // source_record_id is promoted to the leading join key.
    expect(meta.getArrayJoinKeys()).toEqual(["source_record_id", "trial_index"]);

    // It serialises with a plain-text description (not an empty {} that would trip
    // Psych-DS's OBJECT_TYPE_MISSING) that makes its synthetic origin unmistakable, so a
    // downstream user can't mistake it for a real subject ID.
    const sid = meta.getMetadata().variableMeasured.find((v: any) => v.name === "source_record_id");
    expect(typeof sid.description).toBe("string");
    expect(sid.description.toLowerCase()).toContain("synthetic");
    expect(sid.description.toLowerCase()).toContain("not a real subject id");

    // Every extracted view_history row carries source_record_id, so the composite key is unique.
    const rows = meta.getExtractedArrays().get("view_history") as Array<Record<string, any>>;
    expect(rows.length).toBe(3);
    const keyset = new Set(rows.map((r) => `${r.source_record_id}|${r.trial_index}|${r.element_index}`));
    expect(keyset.size).toBe(rows.length);
    // Numeric comparator: the default sort compares as strings.
    expect(rows.map((r) => r.source_record_id).sort((a, b) => a - b)).toEqual([0, 0, 1]);
  });

  test("uses a real participant_id as the join key and does not synthesize a source_record_id", async () => {
    // Each line already carries its own participant_id — a real identifier. Promotion should
    // use it as the join key, no source_record_id should be synthesized, and nothing should be
    // relabeled with a "synthetic" description (that would misrepresent a genuine subject ID).
    const p1 = [{ trial_type: "html-keyboard-response", trial_index: 0, participant_id: "sub-007" }];
    const p2 = [{ trial_type: "html-keyboard-response", trial_index: 0, participant_id: "sub-008" }];
    const jsonl = `${JSON.stringify(p1)}\n${JSON.stringify(p2)}`;

    const meta = new JsPsychMetadata();
    await meta.generate(jsonl, {}, "json");

    expect(meta.getArrayJoinKeys()).toEqual(["participant_id", "trial_index"]);
    expect(meta.containsVariable("source_record_id")).toBe(false);
    const pid = meta.getMetadata().variableMeasured.find((v: any) => v.name === "participant_id");
    const desc = typeof pid.description === "string" ? pid.description : JSON.stringify(pid.description);
    expect(desc.toLowerCase()).not.toContain("synthetic");
  });

  test("does not promote an identifier for a single-array export that lacks one", async () => {
    const rows = [
      { trial_type: "html-keyboard-response", trial_index: 0, rt: 1 },
      { trial_type: "html-keyboard-response", trial_index: 1, rt: 2 },
    ];
    const meta = new JsPsychMetadata();
    await meta.generate(JSON.stringify(rows), {}, "json");
    expect(meta.getArrayJoinKeys()).toEqual(["trial_index"]);
  });
});