From 78f120eff102ac7e431a06f83af21bc01c3f86e8 Mon Sep 17 00:00:00 2001 From: Mandyx22 <1915537307@qq.com> Date: Wed, 17 Jun 2026 10:37:00 -0400 Subject: [PATCH 1/3] feat(metadata,cli,frontend): ingest JSON-Lines (JSONL) experiment data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several jsPsych labs and JATOS export data as newline-delimited JSON — one JSON value per line (typically one participant's trial array per line) — rather than a single JSON array. generate() ran JSON.parse on the whole string, so every such file failed with "Unexpected non-whitespace character after JSON" and produced no metadata. The CLI and frontend also filter data files by extension, so .jsonl files were skipped before reaching generate(). - metadata: new exported parseJsonData() helper accepts both a single JSON document (returned unchanged — no behaviour change for existing callers) and JSON-Lines, flattening per-line arrays into one observation stream. Wired into generate(). - cli: treat .jsonl as JSON everywhere (isDataExt/isJsonDataExt) — directory reader, join-key pre-pass, filename-normalization, and CSV conversion. - frontend: normalise .jsonl uploads to the JSON path; join-key pre-flight and file builder use parseJsonData. Verified against the raw .jsonl exports in vucml/online_experiments: all 15 files now generate metadata and pass the Psych-DS validator with zero errors. 
Co-Authored-By: Claude Opus 4.8 --- .changeset/jsonl-ingestion.md | 17 +++++ packages/cli/src/data.ts | 29 +++++--- packages/cli/src/index.ts | 4 +- packages/cli/tests/data.test.ts | 17 +++++ packages/frontend/src/pages/DataUpload.tsx | 11 ++- .../tests/dataUploadConversion.test.ts | 25 ++++++- packages/metadata/src/index.ts | 8 +- packages/metadata/src/utils.ts | 44 +++++++++++ .../metadata/tests/jsonl-ingestion.test.ts | 74 +++++++++++++++++++ 9 files changed, 209 insertions(+), 20 deletions(-) create mode 100644 .changeset/jsonl-ingestion.md create mode 100644 packages/metadata/tests/jsonl-ingestion.test.ts diff --git a/.changeset/jsonl-ingestion.md b/.changeset/jsonl-ingestion.md new file mode 100644 index 0000000..b92ce8e --- /dev/null +++ b/.changeset/jsonl-ingestion.md @@ -0,0 +1,17 @@ +--- +"@jspsych/metadata": patch +"@jspsych/metadata-cli": patch +"frontend": patch +--- + +Accept JSON-Lines (JSONL) experiment data, not just a single JSON array. Several jsPsych labs — and JATOS exports — write data as newline-delimited JSON, with one JSON value per line (typically one participant's full trial array per line) rather than one big array. Previously `generate()` ran `JSON.parse` on the whole string, so every such file failed with `Unexpected non-whitespace character after JSON` and produced no metadata. + +A new exported `parseJsonData` helper handles both shapes: a well-formed single document is returned unchanged (no behaviour change for existing single-array callers), and only when whole-string parsing fails does it fall back to parsing line by line, flattening any per-line arrays into one observation stream. It is now used wherever JSON data files are parsed: + +- `generate()` (the library) for the main ingestion path. +- the CLI's data-file reader, join-key pre-pass, and CSV-conversion path. +- the frontend's join-key pre-flight and Psych-DS file builder. 
+ +The `.jsonl` file extension is now also recognised as a JSON data file (these exports are conventionally named `.jsonl`). The CLI processes `.jsonl` exactly like `.json` — including filename-normalization, raw-original preservation, and CSV conversion — and the frontend normalises a `.jsonl` upload to the JSON path. + +Verified end to end against the raw `.jsonl` exports in `vucml/online_experiments`: all 15 files now generate metadata and pass the Psych-DS validator with zero errors (they failed at parse time before). diff --git a/packages/cli/src/data.ts b/packages/cli/src/data.ts index f10460f..fafd347 100644 --- a/packages/cli/src/data.ts +++ b/packages/cli/src/data.ts @@ -1,9 +1,17 @@ import fs from "fs"; import path from "path"; -import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, objectsToCSV, isValidPsychDSDataFilename, buildPsychDSDataFiles, stripUnnamedColumns, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "@jspsych/metadata"; +import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, parseJsonData, objectsToCSV, isValidPsychDSDataFilename, buildPsychDSDataFiles, stripUnnamedColumns, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "@jspsych/metadata"; import { expandHomeDir, disambiguateFilename, fileStem } from "./utils"; import { PlannedFile } from "./rename"; +/** + * JSON-family data extensions. `.jsonl` (JSON-Lines) is treated exactly like `.json`: + * parseJsonData() accepts both a single array and one-JSON-value-per-line, so a `.jsonl` + * file flows through the same code path and generate('json') call as a `.json` file. + */ +export const isJsonDataExt = (ext: string): boolean => ext === '.json' || ext === '.jsonl'; +export const isDataExt = (ext: string): boolean => isJsonDataExt(ext) || ext === '.csv'; + /** * Thrown when the data a file produces doesn't match the output-name plan the user approved * (a column appears/disappears, or an approved name is already taken). 
Distinct from an @@ -107,14 +115,14 @@ export async function preAnalyzeDirectory( if (name === 'dataset_description.json') continue; const ext = path.extname(name).toLowerCase(); - if (ext !== '.json' && ext !== '.csv') continue; + if (!isDataExt(ext)) continue; try { const content = await fs.promises.readFile(filePath, 'utf8'); let parsedData: Array>; - if (ext === '.json') { - const raw = JSON.parse(content); + if (isJsonDataExt(ext)) { + const raw = parseJsonData(content); // single array or JSON-Lines if (!Array.isArray(raw)) continue; parsedData = raw as Array>; } else { @@ -247,7 +255,7 @@ export async function analyzeOutputColumns( for (const { filePath, name } of files) { const ext = path.extname(name).toLowerCase(); - if (ext !== '.json' && ext !== '.csv') continue; + if (!isDataExt(ext)) continue; try { const content = await fs.promises.readFile(filePath, 'utf8'); @@ -255,8 +263,8 @@ export async function analyzeOutputColumns( metadata.loadMetadata(content); continue; } - if (ext === '.json') { - if (!Array.isArray(JSON.parse(content))) continue; // non-array JSON is skipped by the writer too + if (isJsonDataExt(ext)) { + if (!Array.isArray(parseJsonData(content))) continue; // non-array JSON is skipped by the writer too await metadata.generate(content, {}, 'json', options); } else { await metadata.generate(content, {}, 'csv', options); @@ -317,6 +325,7 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil switch (fileExtension){ case '.json': + case '.jsonl': if (file === "dataset_description.json") metadata.loadMetadata(content); // need to remove this for the files that are being called with the CLI else await metadata.generate(content, {}, 'json', options); break; @@ -324,7 +333,7 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil await metadata.generate(content, {}, 'csv', options); break; default: - console.error(`"${file}" is not .csv or .json format.`); + console.error(`"${file}" is 
not .csv, .json, or .jsonl format.`); return false; } @@ -341,8 +350,8 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil // skipped before it reserves an output name — otherwise it would needlessly disambiguate // a later valid file that maps to the same base. let parsed: Array> | null = null; - if (fileExtension === '.json') { - const json = JSON.parse(content); + if (isJsonDataExt(fileExtension)) { + const json = parseJsonData(content); // single array or JSON-Lines (flattened) if (!Array.isArray(json)) { console.error(`"${file}" is not a JSON array of jsPsych trials; skipping CSV conversion.`); return false; diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts index 4111211..b44a190 100644 --- a/packages/cli/src/index.ts +++ b/packages/cli/src/index.ts @@ -6,7 +6,7 @@ import { input, select, checkbox, Separator } from '@inquirer/prompts'; import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, isValidPsychDSDataFilename, toPsychDSValue } from "@jspsych/metadata"; import fs from 'fs'; import path from 'path'; -import { processDirectory, processOptions, saveTextToPath, loadMetadata, preAnalyzeDirectory, resolveJoinKeysNonInteractive, enumerateDataFiles, analyzeOutputColumns, OutputColumns } from "./data"; +import { processDirectory, processOptions, saveTextToPath, loadMetadata, preAnalyzeDirectory, resolveJoinKeysNonInteractive, enumerateDataFiles, analyzeOutputColumns, OutputColumns, isDataExt } from "./data"; import { validateDirectory, validateJson, validatePsychDS } from './validatefunctions'; import { createDirectoryWithStructure } from './handlefiles'; import { fileStem } from './utils'; @@ -539,7 +539,7 @@ async function resolveFilenameNormalization( for (const { filePath, name } of files) { if (name === 'dataset_description.json') continue; const ext = path.extname(name).toLowerCase(); - if (ext !== '.json' && ext !== '.csv') continue; + if (!isDataExt(ext)) continue; const stem = 
fileStem(name); if (!isValidPsychDSDataFilename(`${stem}_data.csv`)) { diff --git a/packages/cli/tests/data.test.ts b/packages/cli/tests/data.test.ts index f0ffac1..d89dcb4 100644 --- a/packages/cli/tests/data.test.ts +++ b/packages/cli/tests/data.test.ts @@ -113,6 +113,23 @@ describe("processDirectory", () => { expect(failed).toBe(0); }); + test("processes a JSON-Lines (.jsonl) file with one participant array per line", async () => { + // JATOS-style export: each line is a full participant array, not one big array. + const p1 = JSON.stringify([{ trial_type: "html-keyboard-response", trial_index: 0, rt: 450 }]); + const p2 = JSON.stringify([{ trial_type: "html-keyboard-response", trial_index: 0, rt: 512 }]); + fs.writeFileSync(path.join(tmpDir, "raw.jsonl"), `${p1}\n${p2}\n`); + + const metadata = new JsPsychMetadata(); + const { total, failed } = await processDirectory(metadata, tmpDir); + + expect(total).toBe(1); + expect(failed).toBe(0); + // rows from both lines were ingested (rt spans both participants). 
+ const rt = metadata.getVariable("rt") as any; + expect(rt.minValue).toBe(450); + expect(rt.maxValue).toBe(512); + }); + test("counts unsupported file types as failed", async () => { fs.writeFileSync(path.join(tmpDir, "notes.txt"), "just a text file"); diff --git a/packages/frontend/src/pages/DataUpload.tsx b/packages/frontend/src/pages/DataUpload.tsx index 85439ec..dde274a 100644 --- a/packages/frontend/src/pages/DataUpload.tsx +++ b/packages/frontend/src/pages/DataUpload.tsx @@ -1,6 +1,6 @@ import { useState, useRef, useEffect } from 'react'; import JSZip from 'jszip'; -import JsPsychMetadata, { analyzeJoinKeys, deriveFallbackBase, buildPsychDSDataFiles, isValidPsychDSDataFilename, parseCSV, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from '@jspsych/metadata'; +import JsPsychMetadata, { analyzeJoinKeys, deriveFallbackBase, buildPsychDSDataFiles, isValidPsychDSDataFilename, parseCSV, parseJsonData, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from '@jspsych/metadata'; import PageHeader from '../components/PageHeader'; import styles from './DataUpload.module.css'; @@ -173,7 +173,10 @@ const DataUpload: React.FC = ({ const textMap = new Map(); for (const file of files) { - const type = file.name.split('.').pop()?.toLowerCase() || ''; + const rawExt = file.name.split('.').pop()?.toLowerCase() || ''; + // Treat JSON-Lines as JSON: parseJsonData() accepts both a single array and one + // JSON value per line, so .jsonl flows through the same path as .json downstream. + const type = rawExt === 'jsonl' ? 
'json' : rawExt; const content = await readFileAsText(file); textMap.set(file.webkitRelativePath || file.name, { content, type }); } @@ -187,7 +190,7 @@ const DataUpload: React.FC = ({ if (type !== 'json') continue; if (name === 'dataset_description.json' || name.endsWith('/dataset_description.json')) continue; try { - const parsed = JSON.parse(content); + const parsed = parseJsonData(content); // single array or JSON-Lines (flattened) if (!Array.isArray(parsed) || parsed.length === 0) continue; const analysis = analyzeJoinKeys(parsed, ['trial_index']); if (!analysis.isUnique) { @@ -268,7 +271,7 @@ const DataUpload: React.FC = ({ let mainRows: Array> = []; let mainContent: string | undefined; if (type === 'json') { - const json = JSON.parse(content); + const json = parseJsonData(content); // single array or JSON-Lines (flattened) if (!Array.isArray(json)) { update(i, { status: 'skipped', detail: 'not a jsPsych trial array' }); continue; diff --git a/packages/frontend/tests/dataUploadConversion.test.ts b/packages/frontend/tests/dataUploadConversion.test.ts index 23f8771..b20adb5 100644 --- a/packages/frontend/tests/dataUploadConversion.test.ts +++ b/packages/frontend/tests/dataUploadConversion.test.ts @@ -1,4 +1,4 @@ -import { parseCSV, buildPsychDSDataFiles, deriveFallbackBase } from "@jspsych/metadata"; +import { parseCSV, parseJsonData, buildPsychDSDataFiles, deriveFallbackBase } from "@jspsych/metadata"; // Mirrors the CSV branch of DataUpload.runGenerate: parse the uploaded CSV into mainRows and // hand it to the shared builder. 
Guards the frontend wiring (the parseCSV call + builder usage) @@ -28,3 +28,26 @@ describe("frontend CSV → Psych-DS conversion (the runGenerate path)", () => { expect(built.find((f) => f.kind === "main")!.content).toBe(content); }); }); + +// Mirrors the JSON branch of DataUpload.runGenerate for a .jsonl upload: the file's `type` +// is normalised to 'json', then parseJsonData flattens the per-line participant arrays into +// mainRows before the shared builder serialises them to one converted data/*.csv. +describe("frontend JSON-Lines → Psych-DS conversion (the runGenerate path)", () => { + it("flattens a .jsonl export (one participant array per line) into one main CSV", () => { + const p1 = JSON.stringify([{ trial_type: "html-keyboard-response", rt: 450 }]); + const p2 = JSON.stringify([{ trial_type: "html-keyboard-response", rt: 512 }]); + const content = `${p1}\n${p2}\n`; + + const mainRows = parseJsonData(content) as Array<Record<string, any>>; + expect(mainRows).toHaveLength(2); + + const built = buildPsychDSDataFiles({ base: deriveFallbackBase("raw"), mainRows }); + const main = built.find((f) => f.kind === "main")!; + const lines = main.content.split(/\r?\n/).filter(Boolean); + expect(lines[0].split(",")).toEqual(["trial_type", "rt"]); + // Both participant lines became data rows. 
+ expect(lines).toHaveLength(3); + expect(main.content).toContain("450"); + expect(main.content).toContain("512"); + }); +}); diff --git a/packages/metadata/src/index.ts b/packages/metadata/src/index.ts index 0d64edd..8aea9a7 100644 --- a/packages/metadata/src/index.ts +++ b/packages/metadata/src/index.ts @@ -1,6 +1,6 @@ import { AuthorFields, AuthorsMap } from "./AuthorsMap"; import { PluginCache } from "./PluginCache"; -import { saveTextToFile, parseCSV, tryParseJSON, analyzeJoinKeys, JoinKeyAnalysis, SYSTEM_COLUMNS, stripUnnamedColumns } from "./utils"; +import { saveTextToFile, parseCSV, tryParseJSON, parseJsonData, analyzeJoinKeys, JoinKeyAnalysis, SYSTEM_COLUMNS, stripUnnamedColumns } from "./utils"; import { VariableFields, VariablesMap } from "./VariablesMap"; /** @@ -439,7 +439,9 @@ export default class JsPsychMetadata { } if (ext === 'json') { - parsed_data = JSON.parse(data); + // Accepts both a single JSON array (standard jsPsych export) and JSON-Lines, + // where each line is its own JSON value (JATOS exports one participant array per line). 
+ parsed_data = parseJsonData(data); } if (!Array.isArray(parsed_data)) { @@ -1047,5 +1049,5 @@ export { AuthorFields, VariableFields } -export { analyzeJoinKeys, parseCSV, isValidPsychDSDataFilename, toPsychDSValue, deriveArrayFilename, objectsToCSV, disambiguateArrayFilename, deriveFallbackBase, buildPsychDSDataFiles, stripUnnamedColumns, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "./utils"; +export { analyzeJoinKeys, parseCSV, parseJsonData, isValidPsychDSDataFilename, toPsychDSValue, deriveArrayFilename, objectsToCSV, disambiguateArrayFilename, deriveFallbackBase, buildPsychDSDataFiles, stripUnnamedColumns, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "./utils"; export type { JoinKeyAnalysis, PsychDSDataFile, BuildPsychDSDataFilesArgs } from "./utils"; diff --git a/packages/metadata/src/utils.ts b/packages/metadata/src/utils.ts index 279cf15..73f5f28 100644 --- a/packages/metadata/src/utils.ts +++ b/packages/metadata/src/utils.ts @@ -78,6 +78,50 @@ export function tryParseJSON(value: string): any | null { } } +/** + * Parses experiment data that is either a single JSON document (the standard jsPsych + * export — one array of trials, possibly pretty-printed) or JSON-Lines: one JSON value + * per line, as JATOS and several labs export it (typically one participant's trial + * array per line). Returns a flat array of observations in both cases. + * + * A well-formed single document is returned as-is (arrays untouched, so existing + * single-array callers see no change). Only when whole-string parsing fails do we fall + * back to line-by-line parsing, flattening any per-line arrays into one observation + * stream. Throws a descriptive error when the input is neither valid JSON nor valid JSONL. + */ +export function parseJsonData(content: string): any { + // Fast path: a single, well-formed JSON document. Covers the standard single array + // (including pretty-printed/multi-line) with no behaviour change for existing callers. 
+ const whole = tryParseJSON(content); + if (whole !== null) return whole; + + // Fallback: JSON-Lines. Each non-empty line must be its own JSON value; per-line + // arrays are concatenated so a multi-participant export becomes one observation array. + const lines = content.split(/\r?\n/); + const out: any[] = []; + let parsedAny = false; + for (let i = 0; i < lines.length; i++) { + const line = lines[i].trim(); + if (!line) continue; + let value; + try { + value = JSON.parse(line); + } catch { + throw new Error( + `Could not parse data as JSON or JSON-Lines: line ${i + 1} is not valid JSON.` + ); + } + parsedAny = true; + if (Array.isArray(value)) out.push(...value); + else out.push(value); + } + + if (!parsedAny) { + throw new Error("Could not parse data: input is empty or not valid JSON/JSON-Lines."); + } + return out; +} + /** System columns excluded from join-key candidate detection; also used to initialise ignored_variables in JsPsychMetadata. */ export const SYSTEM_COLUMNS = new Set([ 'trial_type', 'trial_index', 'time_elapsed', 'extension_type', 'extension_version', diff --git a/packages/metadata/tests/jsonl-ingestion.test.ts b/packages/metadata/tests/jsonl-ingestion.test.ts new file mode 100644 index 0000000..5650bd1 --- /dev/null +++ b/packages/metadata/tests/jsonl-ingestion.test.ts @@ -0,0 +1,74 @@ +import JsPsychMetadata from "../src/index"; +import { parseJsonData } from "../src/utils"; + +// JSON-Lines ingestion: several jsPsych labs (and JATOS) export experiment data as +// newline-delimited JSON — one JSON value per line, typically one participant's full +// trial array per line — rather than a single JSON array. generate() / parseJsonData +// must accept both forms and flatten JSONL into one observation stream. 
+ +describe("parseJsonData", () => { + test("returns a standard single JSON array unchanged", () => { + const rows = [{ a: 1 }, { a: 2 }]; + expect(parseJsonData(JSON.stringify(rows))).toEqual(rows); + }); + + test("parses a pretty-printed (multi-line) single array", () => { + const rows = [{ a: 1 }, { a: 2 }]; + expect(parseJsonData(JSON.stringify(rows, null, 2))).toEqual(rows); + }); + + test("flattens JSON-Lines where each line is a participant array", () => { + const p1 = [{ subject: 1, t: 0 }, { subject: 1, t: 1 }]; + const p2 = [{ subject: 2, t: 0 }]; + const jsonl = `${JSON.stringify(p1)}\n${JSON.stringify(p2)}\n`; + expect(parseJsonData(jsonl)).toEqual([...p1, ...p2]); + }); + + test("handles JSON-Lines where each line is a single object", () => { + const jsonl = `{"a":1}\n{"a":2}\n{"a":3}`; + expect(parseJsonData(jsonl)).toEqual([{ a: 1 }, { a: 2 }, { a: 3 }]); + }); + + test("ignores blank lines (incl. CRLF) between records", () => { + const jsonl = `[{"a":1}]\r\n\r\n[{"a":2}]\r\n`; + expect(parseJsonData(jsonl)).toEqual([{ a: 1 }, { a: 2 }]); + }); + + test("throws a descriptive error for a malformed JSONL line", () => { + const jsonl = `[{"a":1}]\nnot json\n[{"a":2}]`; + expect(() => parseJsonData(jsonl)).toThrow(/line 2 is not valid JSON/); + }); + + test("throws for empty input", () => { + expect(() => parseJsonData(" \n ")).toThrow(/empty or not valid/); + }); +}); + +describe("generate() ingests JSON-Lines end to end", () => { + beforeEach(() => { + (global as any).fetch = jest.fn().mockResolvedValue({ text: () => Promise.resolve("") }); + }); + + test("builds variableMeasured from a multi-line (per-participant) JSONL export", async () => { + const p1 = [ + { trial_type: "html-keyboard-response", trial_index: 0, rt: 500, subject: "a" }, + { trial_type: "html-keyboard-response", trial_index: 1, rt: 650, subject: "a" }, + ]; + const p2 = [ + { trial_type: "html-keyboard-response", trial_index: 0, rt: 720, subject: "b" }, + ]; + const jsonl = 
`${JSON.stringify(p1)}\n${JSON.stringify(p2)}`; + + const meta = new JsPsychMetadata(); + await meta.generate(jsonl, {}, "json"); + + const names = meta.getMetadata().variableMeasured.map((v: any) => v.name); + // A non-system column from the flattened rows is captured... + expect(names).toContain("rt"); + expect(names).toContain("subject"); + // ...and rt's range spans rows drawn from both participant lines. + const rt = meta.getVariable("rt") as any; + expect(rt.minValue).toBe(500); + expect(rt.maxValue).toBe(720); + }); +}); From 721af757e6a86067a62b5358df47265274e777d5 Mon Sep 17 00:00:00 2001 From: Mandyx22 <1915537307@qq.com> Date: Wed, 17 Jun 2026 14:05:53 -0400 Subject: [PATCH 2/3] feat(metadata,cli,frontend): synthesize participant_id join key for JSONL Multi-participant JSON-Lines exports carry no per-row participant id, so after flattening, trial_index repeats across participants and can't uniquely key the extracted array/object sidecars. parseJsonData now opt-in tags each line's rows with a 0-based participant_id (reporting whether it synthesized one), and generate() promotes it to the leading join key for JSON input so sidecars join unambiguously. When the id is actually synthesized it gets an explicit "not a real subject ID" description (also avoiding an empty {} that trips OBJECT_TYPE_MISSING); a pre-existing participant_id is left untouched. The CLI pre-analysis/prompt and frontend pre-flight mirror the promotion so multi-participant JSONL isn't falsely flagged as non-unique. 
Co-Authored-By: Claude Opus 4.8 --- .changeset/jsonl-participant-id.md | 13 +++ packages/cli/src/data.ts | 27 ++++-- packages/cli/src/index.ts | 14 +-- packages/frontend/src/pages/DataUpload.tsx | 15 ++- packages/metadata/src/index.ts | 39 +++++++- packages/metadata/src/utils.ts | 33 ++++++- .../metadata/tests/jsonl-ingestion.test.ts | 91 +++++++++++++++++++ 7 files changed, 212 insertions(+), 20 deletions(-) create mode 100644 .changeset/jsonl-participant-id.md diff --git a/.changeset/jsonl-participant-id.md b/.changeset/jsonl-participant-id.md new file mode 100644 index 0000000..c99dd40 --- /dev/null +++ b/.changeset/jsonl-participant-id.md @@ -0,0 +1,13 @@ +--- +"@jspsych/metadata": patch +"@jspsych/metadata-cli": patch +"frontend": patch +--- + +Synthesize a `participant_id` join key for multi-participant JSON-Lines exports. Raw jsPsych exports carry no per-row participant identifier, so once JSONL is flattened (one participant per line) `trial_index` repeats across participants and can't uniquely key the extracted array/object sidecar CSVs — every participant's trial 0 collapsed onto the same `(trial_index, element_index)` key, making the sidecars impossible to join back to a single parent trial. + +`parseJsonData` now takes an opt-in `{ tagParticipantId }` flag: in the JSON-Lines path it stamps each line's object rows with a 0-based `participant_id` (a no-op on the single-array fast path; never overwrites an existing value), and reports via an optional `stats` out-param whether it actually synthesized the id. `generate()` enables this for JSON input and promotes `participant_id` to the leading join key (`['participant_id', 'trial_index']`) whenever rows carry one, so the sidecars join unambiguously. CSV inputs are unaffected. + +When — and only when — the id was actually synthesized (i.e. 
absent from the source), it is given an explicit description that makes its synthetic origin unmistakable ("Synthetic participant identifier … NOT a real subject ID from the experiment …") so a downstream user can't mistake it for a real subject ID; this also avoids serializing an empty `{}` description (an object with no `@type`, which trips the validator's `OBJECT_TYPE_MISSING`). A `participant_id` already present in the data is left untouched. The CLI's join-key pre-analysis/prompt and the frontend's pre-flight mirror this promotion so multi-participant JSONL is no longer falsely flagged as having a non-unique join key. + +Verified end to end against the raw `.jsonl` exports in `githubpsyche/homophily`: all three files generate metadata, pass the Psych-DS validator, and write sidecars whose `(participant_id, trial_index, element_index)` keys are fully unique (e.g. `view_history` at 385/385 rows, vs. 7 colliding keys without `participant_id`). diff --git a/packages/cli/src/data.ts b/packages/cli/src/data.ts index fafd347..6546897 100644 --- a/packages/cli/src/data.ts +++ b/packages/cli/src/data.ts @@ -102,14 +102,14 @@ async function collectDataFiles( export async function preAnalyzeDirectory( directoryPath: string, initialKeys: string[] = ['trial_index'] -): Promise<{ parsedData: Array>; analysis: JoinKeyAnalysis; fileName: string } | null> { +): Promise<{ parsedData: Array>; analysis: JoinKeyAnalysis; fileName: string; keys: string[] } | null> { directoryPath = expandHomeDir(directoryPath); const collected = await collectDataFiles(directoryPath); if (!collected) return null; const { files: filePaths } = collected; - let worst: { parsedData: Array>; analysis: JoinKeyAnalysis; fileName: string } | null = null; + let worst: { parsedData: Array>; analysis: JoinKeyAnalysis; fileName: string; keys: string[] } | null = null; for (const { filePath, name } of filePaths) { if (name === 'dataset_description.json') continue; @@ -122,16 +122,27 @@ export async function 
preAnalyzeDirectory( let parsedData: Array>; if (isJsonDataExt(ext)) { - const raw = parseJsonData(content); // single array or JSON-Lines + // Tag a per-line participant_id for JSON-Lines (a no-op for a single array) so the + // analysis below sees the same rows generate() will. + const raw = parseJsonData(content, { tagParticipantId: true }); if (!Array.isArray(raw)) continue; parsedData = raw as Array>; } else { parsedData = (await parseCSV(content)) as Array>; } - const analysis = analyzeJoinKeys(parsedData, initialKeys); + // Mirror generate()'s join-key promotion so the prompt is built from the keys generate() + // will actually use: a participant_id synthesized from JSON-Lines (or present in the + // export) becomes the leading join key, and the uniqueness check accounts for it. + const keys = (isJsonDataExt(ext) && + !initialKeys.includes('participant_id') && + parsedData.some((row) => row && typeof row === 'object' && 'participant_id' in row)) + ? ['participant_id', ...initialKeys] + : initialKeys; + + const analysis = analyzeJoinKeys(parsedData, keys); if (!analysis.isUnique && (worst === null || analysis.duplicateCount > worst.analysis.duplicateCount)) { - worst = { parsedData, analysis, fileName: name }; + worst = { parsedData, analysis, fileName: name, keys }; } } catch { continue; @@ -264,7 +275,7 @@ export async function analyzeOutputColumns( continue; } if (isJsonDataExt(ext)) { - if (!Array.isArray(parseJsonData(content))) continue; // non-array JSON is skipped by the writer too + if (!Array.isArray(parseJsonData(content, { tagParticipantId: true }))) continue; // non-array JSON is skipped by the writer too await metadata.generate(content, {}, 'json', options); } else { await metadata.generate(content, {}, 'csv', options); @@ -351,7 +362,9 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil // a later valid file that maps to the same base. 
let parsed: Array> | null = null; if (isJsonDataExt(fileExtension)) { - const json = parseJsonData(content); // single array or JSON-Lines (flattened) + // Tag a per-line participant_id for JSON-Lines (a no-op for a single array) so the main + // CSV carries the same join-key column generate() promotes for the sidecars. + const json = parseJsonData(content, { tagParticipantId: true }); if (!Array.isArray(json)) { console.error(`"${file}" is not a JSON array of jsPsych trials; skipping CSV conversion.`); return false; diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts index b44a190..4c7bc54 100644 --- a/packages/cli/src/index.ts +++ b/packages/cli/src/index.ts @@ -730,17 +730,19 @@ const main = async () => { const canPrompt = !isNonInteractive && !!process.stdin.isTTY && !!process.stdout.isTTY; const { bases: normalizedBases, plan: renamePlan } = await resolveFilenameNormalization(dataDir, canPrompt, outputColumns); - // Pre-flight: check whether default join key (trial_index) is unique. If not, prompt the user - // when we have a terminal; otherwise (fully-flagged headless run) resolve deterministically so - // the run never blocks on an interactive prompt it can't answer. + // Pre-flight: check whether the join key is unique. preAnalyzeDirectory mirrors generate()'s + // participant_id promotion, so preResult.keys is the effective key set (e.g. + // ['participant_id', 'trial_index'] for multi-participant JSON-Lines) — use it as the basis for + // resolution. If not unique, prompt the user when we have a terminal; otherwise (headless run) + // resolve deterministically so the run never blocks on an interactive prompt it can't answer. const initialKeys = ['trial_index']; const preResult = await preAnalyzeDirectory(dataDir, initialKeys); - let arrayJoinKeys = initialKeys; + let arrayJoinKeys = preResult?.keys ?? 
initialKeys; if (preResult && !preResult.analysis.isUnique) { if (canPrompt) { - arrayJoinKeys = await promptJoinKeys(preResult.parsedData, preResult.analysis, initialKeys, preResult.fileName); + arrayJoinKeys = await promptJoinKeys(preResult.parsedData, preResult.analysis, preResult.keys, preResult.fileName); } else { - const resolved = resolveJoinKeysNonInteractive(preResult.analysis, initialKeys, preResult.fileName); + const resolved = resolveJoinKeysNonInteractive(preResult.analysis, preResult.keys, preResult.fileName); arrayJoinKeys = resolved.keys; (resolved.unresolved ? console.warn : console.log)(`${resolved.unresolved ? '⚠' : 'ℹ'} ${resolved.message}`); } diff --git a/packages/frontend/src/pages/DataUpload.tsx b/packages/frontend/src/pages/DataUpload.tsx index dde274a..c3ab049 100644 --- a/packages/frontend/src/pages/DataUpload.tsx +++ b/packages/frontend/src/pages/DataUpload.tsx @@ -190,9 +190,16 @@ const DataUpload: React.FC = ({ if (type !== 'json') continue; if (name === 'dataset_description.json' || name.endsWith('/dataset_description.json')) continue; try { - const parsed = parseJsonData(content); // single array or JSON-Lines (flattened) + // Tag a per-line participant_id for JSON-Lines (a no-op for a single array). + const parsed = parseJsonData(content, { tagParticipantId: true }); if (!Array.isArray(parsed) || parsed.length === 0) continue; - const analysis = analyzeJoinKeys(parsed, ['trial_index']); + // Mirror generate()'s join-key promotion so a multi-participant JSON-Lines file isn't + // wrongly flagged: trial_index alone repeats across participants, but a synthesized + // participant_id makes (participant_id, trial_index) unique. + const keys = parsed.some((row: any) => row && typeof row === 'object' && 'participant_id' in row) + ? 
['participant_id', 'trial_index'] + : ['trial_index']; + const analysis = analyzeJoinKeys(parsed, keys); if (!analysis.isUnique) { setJoinKeyProblemFile(name); setJoinKeyCandidates(analysis.candidates); @@ -271,7 +278,9 @@ const DataUpload: React.FC = ({ let mainRows: Array> = []; let mainContent: string | undefined; if (type === 'json') { - const json = parseJsonData(content); // single array or JSON-Lines (flattened) + // Tag a per-line participant_id for JSON-Lines (a no-op for a single array) so the + // main CSV carries the same join-key column generate() promotes for the sidecars. + const json = parseJsonData(content, { tagParticipantId: true }); if (!Array.isArray(json)) { update(i, { status: 'skipped', detail: 'not a jsPsych trial array' }); continue; diff --git a/packages/metadata/src/index.ts b/packages/metadata/src/index.ts index 8aea9a7..e9236b7 100644 --- a/packages/metadata/src/index.ts +++ b/packages/metadata/src/index.ts @@ -438,10 +438,18 @@ export default class JsPsychMetadata { parsed_data = await parseCSV(data); } + let synthesizedParticipantId = false; if (ext === 'json') { // Accepts both a single JSON array (standard jsPsych export) and JSON-Lines, // where each line is its own JSON value (JATOS exports one participant array per line). - parsed_data = parseJsonData(data); + // Tag JSON-Lines rows with a per-line participant_id: raw jsPsych exports carry no + // per-row participant identifier, so in multi-participant JSONL trial_index alone + // repeats across participants and can't uniquely key the extracted sidecar CSVs. + // The stat records whether we actually invented the id (vs. it already being present), + // so we only describe it as synthetic when it truly is. 
+ const parseStats: { synthesizedParticipantId?: boolean } = {}; + parsed_data = parseJsonData(data, { tagParticipantId: true }, parseStats); + synthesizedParticipantId = parseStats.synthesizedParticipantId === true; } if (!Array.isArray(parsed_data)) { @@ -461,6 +469,19 @@ export default class JsPsychMetadata { ); } + // When JSON rows carry a participant_id — synthesized per line from JSON-Lines above, or + // already present in the export — promote it to the leading join key (unless the caller + // already listed it). Raw jsPsych exports otherwise have no per-row participant identifier, + // so trial_index alone repeats across participants and can't uniquely key the extracted + // sidecar CSVs; (participant_id, trial_index, …) restores a one-trial-per-key join. CSV + // inputs are left untouched, preserving existing behaviour for tabular sources. + const hasParticipantId = ext === 'json' && + (parsed_data as Array>).some( + (row) => row && typeof row === 'object' && 'participant_id' in row); + if (hasParticipantId && !this.arrayJoinKeys.includes('participant_id')) { + this.arrayJoinKeys = ['participant_id', ...this.arrayJoinKeys]; + } + // Callers that already surface join-key uniqueness to the user (e.g. the CLI's // interactive pre-analysis prompt) can suppress this warning to avoid repeating it // once per file. @@ -471,6 +492,22 @@ export default class JsPsychMetadata { await this.generateObservation(observation); } + // Only when WE synthesized participant_id (it wasn't in the source) do we own its + // description. As an identifier/join-key column it isn't plugin-documented, so per-trial + // processing leaves it with only "unknown" plugin descriptions that getList() strips to an + // empty {} (an object with no @type → OBJECT_TYPE_MISSING). Give it one explicit + // description that makes its synthetic origin unmistakable, so a downstream user never + // mistakes it for a real subject ID. 
A pre-existing participant_id is left untouched — its + // meaning is the experiment's, not ours. Done before updateMetadata so a caller-supplied + // metadata override still wins. + if (synthesizedParticipantId && this.containsVariable('participant_id')) { + const existing = this.getVariable('participant_id') as VariableFields; + this.setVariable({ + ...existing, + description: { default: 'Synthetic participant identifier (0-based), assigned one per source record (one participant per JSON-Lines line) because the raw data carried no participant column. NOT a real subject ID from the experiment — it only orders/links records as they appeared in the source file, and serves as a join key connecting each trial to its extracted array/object rows.' }, + }); + } + await this.updateMetadata(metadata); } diff --git a/packages/metadata/src/utils.ts b/packages/metadata/src/utils.ts index 73f5f28..b77c8ea 100644 --- a/packages/metadata/src/utils.ts +++ b/packages/metadata/src/utils.ts @@ -88,10 +88,21 @@ export function tryParseJSON(value: string): any | null { * single-array callers see no change). Only when whole-string parsing fails do we fall * back to line-by-line parsing, flattening any per-line arrays into one observation * stream. Throws a descriptive error when the input is neither valid JSON nor valid JSONL. + * + * When `tagParticipantId` is set, `stats.synthesizedParticipantId` is set to true iff a + * participant_id was actually stamped onto at least one row (i.e. the data did not already + * carry one). Callers use this to describe the column honestly — a synthesized id is not a + * real subject identifier and must not be presented as one. */ -export function parseJsonData(content: string): any { +export function parseJsonData( + content: string, + options: { tagParticipantId?: boolean } = {}, + stats?: { synthesizedParticipantId?: boolean } +): any { // Fast path: a single, well-formed JSON document. 
Covers the standard single array // (including pretty-printed/multi-line) with no behaviour change for existing callers. + // Note: tagParticipantId never applies here — a single document has no line boundaries + // to identify participants by, so its rows are returned untouched. const whole = tryParseJSON(content); if (whole !== null) return whole; @@ -100,6 +111,7 @@ export function parseJsonData(content: string): any { const lines = content.split(/\r?\n/); const out: any[] = []; let parsedAny = false; + let participantIndex = 0; for (let i = 0; i < lines.length; i++) { const line = lines[i].trim(); if (!line) continue; @@ -112,8 +124,23 @@ export function parseJsonData(content: string): any { ); } parsedAny = true; - if (Array.isArray(value)) out.push(...value); - else out.push(value); + const observations = Array.isArray(value) ? value : [value]; + // In JSON-Lines each line is one participant's submission (JATOS-style export). The line + // boundary is the only participant identifier these raw jsPsych exports carry, so — when + // asked — stamp every object observation from this line with a 0-based participant_id + // before that boundary is lost in the flattened stream. This lets nested array/object + // extraction form a unique (participant_id, trial_index) join key. Existing participant_id + // values are left untouched; non-object lines (bare primitives) can't carry the tag. 
+ if (options.tagParticipantId) { + for (const obs of observations) { + if (obs !== null && typeof obs === "object" && !Array.isArray(obs) && !("participant_id" in obs)) { + obs.participant_id = participantIndex; + if (stats) stats.synthesizedParticipantId = true; + } + } + } + out.push(...observations); + participantIndex++; } if (!parsedAny) { diff --git a/packages/metadata/tests/jsonl-ingestion.test.ts b/packages/metadata/tests/jsonl-ingestion.test.ts index 5650bd1..772acd0 100644 --- a/packages/metadata/tests/jsonl-ingestion.test.ts +++ b/packages/metadata/tests/jsonl-ingestion.test.ts @@ -44,6 +44,41 @@ describe("parseJsonData", () => { }); }); +describe("parseJsonData participant_id tagging", () => { + // Raw jsPsych exports carry no per-row participant identifier; for multi-participant + // JSON-Lines the line boundary is the only one available, so tagParticipantId stamps a + // 0-based participant_id per line. This lets nested array/object extraction form a unique + // (participant_id, trial_index) join key. 
+ test("tags a per-line participant_id across JSON-Lines records", () => { + const p1 = [{ trial_index: 0 }, { trial_index: 1 }]; + const p2 = [{ trial_index: 0 }]; + const jsonl = `${JSON.stringify(p1)}\n${JSON.stringify(p2)}`; + expect(parseJsonData(jsonl, { tagParticipantId: true })).toEqual([ + { trial_index: 0, participant_id: 0 }, + { trial_index: 1, participant_id: 0 }, + { trial_index: 0, participant_id: 1 }, + ]); + }); + + test("leaves a single JSON array untouched (no line boundaries to tag by)", () => { + const rows = [{ trial_index: 0 }, { trial_index: 1 }]; + expect(parseJsonData(JSON.stringify(rows), { tagParticipantId: true })).toEqual(rows); + }); + + test("does not overwrite an existing participant_id", () => { + const jsonl = `[{"participant_id":"P7","trial_index":0}]\n[{"trial_index":0}]`; + expect(parseJsonData(jsonl, { tagParticipantId: true })).toEqual([ + { participant_id: "P7", trial_index: 0 }, + { trial_index: 0, participant_id: 1 }, + ]); + }); + + test("does not tag when the option is off (default)", () => { + const jsonl = `[{"trial_index":0}]\n[{"trial_index":0}]`; + expect(parseJsonData(jsonl)).toEqual([{ trial_index: 0 }, { trial_index: 0 }]); + }); +}); + describe("generate() ingests JSON-Lines end to end", () => { beforeEach(() => { (global as any).fetch = jest.fn().mockResolvedValue({ text: () => Promise.resolve("") }); @@ -71,4 +106,60 @@ describe("generate() ingests JSON-Lines end to end", () => { expect(rt.minValue).toBe(500); expect(rt.maxValue).toBe(720); }); + + test("synthesizes participant_id so multi-participant JSON-Lines sidecars join uniquely", async () => { + // Both participants restart trial_index at 0, so without a participant identifier the two + // trial-0 view_history rows would collide on (trial_index, element_index). 
+ const p1 = [{ trial_type: "html-keyboard-response", trial_index: 0, view_history: [{ page: 0 }, { page: 1 }] }]; + const p2 = [{ trial_type: "html-keyboard-response", trial_index: 0, view_history: [{ page: 0 }] }]; + const jsonl = `${JSON.stringify(p1)}\n${JSON.stringify(p2)}`; + + const meta = new JsPsychMetadata(); + await meta.generate(jsonl, {}, "json"); + + // participant_id is promoted to the leading join key. + expect(meta.getArrayJoinKeys()).toEqual(["participant_id", "trial_index"]); + + // It serialises with a plain-text description (not an empty {} that would trip + // Psych-DS's OBJECT_TYPE_MISSING) that makes its synthetic origin unmistakable, so a + // downstream user can't mistake it for a real subject ID. + const pid = meta.getMetadata().variableMeasured.find((v: any) => v.name === "participant_id"); + expect(typeof pid.description).toBe("string"); + expect(pid.description.toLowerCase()).toContain("synthetic"); + expect(pid.description.toLowerCase()).toContain("not a real subject id"); + + // Every extracted view_history row carries participant_id, so the composite key is unique. + const rows = meta.getExtractedArrays().get("view_history") as Array>; + expect(rows.length).toBe(3); + const keyset = new Set(rows.map((r) => `${r.participant_id}|${r.trial_index}|${r.element_index}`)); + expect(keyset.size).toBe(rows.length); + expect(rows.map((r) => r.participant_id).sort()).toEqual([0, 0, 1]); + }); + + test("does not relabel a real participant_id already present in the data", async () => { + // Each line already carries its own participant_id — a real identifier. Promotion should + // still use it as a join key, but we must not overwrite it with a "synthetic" description + // (that would misrepresent a genuine subject ID). 
+ const p1 = [{ trial_type: "html-keyboard-response", trial_index: 0, participant_id: "sub-007" }]; + const p2 = [{ trial_type: "html-keyboard-response", trial_index: 0, participant_id: "sub-008" }]; + const jsonl = `${JSON.stringify(p1)}\n${JSON.stringify(p2)}`; + + const meta = new JsPsychMetadata(); + await meta.generate(jsonl, {}, "json"); + + expect(meta.getArrayJoinKeys()).toEqual(["participant_id", "trial_index"]); + const pid = meta.getMetadata().variableMeasured.find((v: any) => v.name === "participant_id"); + const desc = typeof pid.description === "string" ? pid.description : JSON.stringify(pid.description); + expect(desc.toLowerCase()).not.toContain("synthetic"); + }); + + test("does not promote participant_id for a single-array export that lacks one", async () => { + const rows = [ + { trial_type: "html-keyboard-response", trial_index: 0, rt: 1 }, + { trial_type: "html-keyboard-response", trial_index: 1, rt: 2 }, + ]; + const meta = new JsPsychMetadata(); + await meta.generate(JSON.stringify(rows), {}, "json"); + expect(meta.getArrayJoinKeys()).toEqual(["trial_index"]); + }); }); From 5a905a9bdef2dc7bd78b6c94671773b32bc8336c Mon Sep 17 00:00:00 2001 From: Mandyx22 <1915537307@qq.com> Date: Wed, 17 Jun 2026 15:10:36 -0400 Subject: [PATCH 3/3] refactor(metadata,cli,frontend): rename synthesized join key to source_record_id The per-line join key synthesized for JSON-Lines input was named participant_id, which overclaims: a JSONL line is only guaranteed to be one source record, not necessarily one participant. Rename it to source_record_id across the library, CLI, and frontend. - parseJsonData: option tagParticipantId -> tagSourceRecordId, stat synthesizedParticipantId -> synthesizedSourceRecordId; stamps source_record_id. Synthesis now defers to a real participant_id (or an existing source_record_id) already in the data, so a genuine subject id is never duplicated or mislabeled. 
- generate(): promotes the identifier as the leading join key, preferring the synthesized source_record_id and falling back to a real participant_id. The synthetic-origin description now describes a "source record" (usually but not always one participant). - CLI: emits a one-line info log when it adds the column ("Detected JSON-Lines input; added synthetic source_record_id ..."), surfaced via a new optional out-param on preAnalyzeDirectory (no extra parse pass, return contract unchanged). data.ts/index.ts pre-analysis mirror the new id. - frontend: pre-flight + builder use source_record_id. - Tests + changeset updated. Co-Authored-By: Claude Opus 4.8 --- .changeset/jsonl-participant-id.md | 13 ---- .changeset/jsonl-source-record-id.md | 15 ++++ packages/cli/src/data.ts | 34 +++++---- packages/cli/src/index.ts | 16 ++++- packages/cli/tests/data.test.ts | 23 ++++++ packages/frontend/src/pages/DataUpload.tsx | 20 +++--- packages/metadata/src/index.ts | 54 +++++++------- packages/metadata/src/utils.ts | 44 ++++++------ .../metadata/tests/jsonl-ingestion.test.ts | 71 +++++++++++-------- 9 files changed, 176 insertions(+), 114 deletions(-) delete mode 100644 .changeset/jsonl-participant-id.md create mode 100644 .changeset/jsonl-source-record-id.md diff --git a/.changeset/jsonl-participant-id.md b/.changeset/jsonl-participant-id.md deleted file mode 100644 index c99dd40..0000000 --- a/.changeset/jsonl-participant-id.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -"@jspsych/metadata": patch -"@jspsych/metadata-cli": patch -"frontend": patch ---- - -Synthesize a `participant_id` join key for multi-participant JSON-Lines exports. 
Raw jsPsych exports carry no per-row participant identifier, so once JSONL is flattened (one participant per line) `trial_index` repeats across participants and can't uniquely key the extracted array/object sidecar CSVs — every participant's trial 0 collapsed onto the same `(trial_index, element_index)` key, making the sidecars impossible to join back to a single parent trial. - -`parseJsonData` now takes an opt-in `{ tagParticipantId }` flag: in the JSON-Lines path it stamps each line's object rows with a 0-based `participant_id` (a no-op on the single-array fast path; never overwrites an existing value), and reports via an optional `stats` out-param whether it actually synthesized the id. `generate()` enables this for JSON input and promotes `participant_id` to the leading join key (`['participant_id', 'trial_index']`) whenever rows carry one, so the sidecars join unambiguously. CSV inputs are unaffected. - -When — and only when — the id was actually synthesized (i.e. absent from the source), it is given an explicit description that makes its synthetic origin unmistakable ("Synthetic participant identifier … NOT a real subject ID from the experiment …") so a downstream user can't mistake it for a real subject ID; this also avoids serializing an empty `{}` description (an object with no `@type`, which trips the validator's `OBJECT_TYPE_MISSING`). A `participant_id` already present in the data is left untouched. The CLI's join-key pre-analysis/prompt and the frontend's pre-flight mirror this promotion so multi-participant JSONL is no longer falsely flagged as having a non-unique join key. - -Verified end to end against the raw `.jsonl` exports in `githubpsyche/homophily`: all three files generate metadata, pass the Psych-DS validator, and write sidecars whose `(participant_id, trial_index, element_index)` keys are fully unique (e.g. `view_history` at 385/385 rows, vs. 7 colliding keys without `participant_id`). 
diff --git a/.changeset/jsonl-source-record-id.md b/.changeset/jsonl-source-record-id.md new file mode 100644 index 0000000..abc77b3 --- /dev/null +++ b/.changeset/jsonl-source-record-id.md @@ -0,0 +1,15 @@ +--- +"@jspsych/metadata": patch +"@jspsych/metadata-cli": patch +"frontend": patch +--- + +Synthesize a `source_record_id` join key for multi-record JSON-Lines exports. Raw jsPsych exports carry no per-row identifier, so once JSONL is flattened (one record per line) `trial_index` repeats across records and can't uniquely key the extracted array/object sidecar CSVs — every record's trial 0 collapsed onto the same `(trial_index, element_index)` key, making the sidecars impossible to join back to a single parent trial. + +The synthesized column is named `source_record_id` rather than `participant_id` because a JSON-Lines line is only guaranteed to be one *source record* — usually, but not always, one participant. The honest name avoids overclaiming for exports where a line isn't a single subject. + +`parseJsonData` now takes an opt-in `{ tagSourceRecordId }` flag: in the JSON-Lines path it stamps each line's object rows with a 0-based `source_record_id` (a no-op on the single-array fast path), and reports via an optional `stats` out-param whether it actually synthesized the id. A line that already carries a `source_record_id` or a real `participant_id` is left untouched — the experiment's own identifier already groups those rows. `generate()` enables this for JSON input and promotes the identifier to the leading join key, preferring the synthesized `source_record_id` and falling back to a real `participant_id` already present in the export (`['source_record_id', 'trial_index']` or `['participant_id', 'trial_index']`), so the sidecars join unambiguously. CSV inputs are unaffected. + +When — and only when — the id was actually synthesized (i.e. 
absent from the source), it is given an explicit description that makes its synthetic origin unmistakable ("Synthetic source-record identifier … NOT a real subject ID from the experiment …") so a downstream user can't mistake it for a real subject ID; this also avoids serializing an empty `{}` description (an object with no `@type`, which trips the validator's `OBJECT_TYPE_MISSING`). The CLI's join-key pre-analysis/prompt and the frontend's pre-flight mirror this promotion so multi-record JSONL is no longer falsely flagged as having a non-unique join key. + +Verified end to end against the raw `.jsonl` exports in `vucml/online_experiments` (`block_cat`): the combined 30-record export generates metadata, passes the Psych-DS validator (0 errors), synthesizes `source_record_id` 0–29, and writes sidecars whose `(source_record_id, trial_index, element_index)` keys are fully unique — including the doubly-nested `recall_responses` case. Notably `subjectId` collides across the two merged datasets (two records share `601`), which `source_record_id` correctly keeps distinct. diff --git a/packages/cli/src/data.ts b/packages/cli/src/data.ts index 6546897..0b557b7 100644 --- a/packages/cli/src/data.ts +++ b/packages/cli/src/data.ts @@ -101,7 +101,11 @@ async function collectDataFiles( */ export async function preAnalyzeDirectory( directoryPath: string, - initialKeys: string[] = ['trial_index'] + initialKeys: string[] = ['trial_index'], + // Optional out-param: set to true if any JSON-Lines file gets a synthesized source_record_id. + // Surfaced this way (rather than via the return value) so the existing return contract — and + // its "no problem found → null" callers — stays unchanged. 
+ outStats?: { synthesizedSourceRecordId?: boolean } ): Promise<{ parsedData: Array>; analysis: JoinKeyAnalysis; fileName: string; keys: string[] } | null> { directoryPath = expandHomeDir(directoryPath); @@ -122,23 +126,27 @@ export async function preAnalyzeDirectory( let parsedData: Array>; if (isJsonDataExt(ext)) { - // Tag a per-line participant_id for JSON-Lines (a no-op for a single array) so the + // Tag a per-line source_record_id for JSON-Lines (a no-op for a single array) so the // analysis below sees the same rows generate() will. - const raw = parseJsonData(content, { tagParticipantId: true }); + const stats: { synthesizedSourceRecordId?: boolean } = {}; + const raw = parseJsonData(content, { tagSourceRecordId: true }, stats); if (!Array.isArray(raw)) continue; + if (stats.synthesizedSourceRecordId && outStats) outStats.synthesizedSourceRecordId = true; parsedData = raw as Array>; } else { parsedData = (await parseCSV(content)) as Array>; } // Mirror generate()'s join-key promotion so the prompt is built from the keys generate() - // will actually use: a participant_id synthesized from JSON-Lines (or present in the - // export) becomes the leading join key, and the uniqueness check accounts for it. - const keys = (isJsonDataExt(ext) && - !initialKeys.includes('participant_id') && - parsedData.some((row) => row && typeof row === 'object' && 'participant_id' in row)) - ? ['participant_id', ...initialKeys] - : initialKeys; + // will actually use: an identifier column — source_record_id synthesized from JSON-Lines, + // else a real participant_id already in the export — becomes the leading join key, and the + // uniqueness check accounts for it. + const idColumn = isJsonDataExt(ext) + ? (['source_record_id', 'participant_id'] as const).find((col) => + !initialKeys.includes(col) && + parsedData.some((row) => row && typeof row === 'object' && col in row)) + : undefined; + const keys = idColumn ? 
[idColumn, ...initialKeys] : initialKeys; const analysis = analyzeJoinKeys(parsedData, keys); if (!analysis.isUnique && (worst === null || analysis.duplicateCount > worst.analysis.duplicateCount)) { @@ -275,7 +283,7 @@ export async function analyzeOutputColumns( continue; } if (isJsonDataExt(ext)) { - if (!Array.isArray(parseJsonData(content, { tagParticipantId: true }))) continue; // non-array JSON is skipped by the writer too + if (!Array.isArray(parseJsonData(content, { tagSourceRecordId: true }))) continue; // non-array JSON is skipped by the writer too await metadata.generate(content, {}, 'json', options); } else { await metadata.generate(content, {}, 'csv', options); @@ -362,9 +370,9 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil // a later valid file that maps to the same base. let parsed: Array> | null = null; if (isJsonDataExt(fileExtension)) { - // Tag a per-line participant_id for JSON-Lines (a no-op for a single array) so the main + // Tag a per-line source_record_id for JSON-Lines (a no-op for a single array) so the main // CSV carries the same join-key column generate() promotes for the sidecars. - const json = parseJsonData(content, { tagParticipantId: true }); + const json = parseJsonData(content, { tagSourceRecordId: true }); if (!Array.isArray(json)) { console.error(`"${file}" is not a JSON array of jsPsych trials; skipping CSV conversion.`); return false; diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts index 4c7bc54..c89491c 100644 --- a/packages/cli/src/index.ts +++ b/packages/cli/src/index.ts @@ -731,12 +731,13 @@ const main = async () => { const { bases: normalizedBases, plan: renamePlan } = await resolveFilenameNormalization(dataDir, canPrompt, outputColumns); // Pre-flight: check whether the join key is unique. preAnalyzeDirectory mirrors generate()'s - // participant_id promotion, so preResult.keys is the effective key set (e.g. 
- // ['participant_id', 'trial_index'] for multi-participant JSON-Lines) — use it as the basis for + // source_record_id promotion, so preResult.keys is the effective key set (e.g. + // ['source_record_id', 'trial_index'] for multi-record JSON-Lines) — use it as the basis for // resolution. If not unique, prompt the user when we have a terminal; otherwise (headless run) // resolve deterministically so the run never blocks on an interactive prompt it can't answer. const initialKeys = ['trial_index']; - const preResult = await preAnalyzeDirectory(dataDir, initialKeys); + const preStats: { synthesizedSourceRecordId?: boolean } = {}; + const preResult = await preAnalyzeDirectory(dataDir, initialKeys, preStats); let arrayJoinKeys = preResult?.keys ?? initialKeys; if (preResult && !preResult.analysis.isUnique) { if (canPrompt) { @@ -748,6 +749,15 @@ const main = async () => { } } + // Tell the user when we add the synthetic identifier, so the extra column in their output + // isn't a surprise. Only fires for JSON-Lines input that carried no id of its own. + if (preStats.synthesizedSourceRecordId) { + console.log( + 'Detected JSON-Lines input; added synthetic source_record_id to preserve ' + + 'source-record boundaries for extracted nested data.' + ); + } + // The pre-flight prompt above already surfaced any join-key uniqueness issue to the // user, so suppress the library's per-file warning to avoid repeating it. await processDirectory(metadata, dataDir, verbose, `${project_path}/data`, { arrayJoinKeys, suppressJoinKeyWarning: true, normalizedBases, renamePlan: renamePlan ?? 
undefined }); diff --git a/packages/cli/tests/data.test.ts b/packages/cli/tests/data.test.ts index d89dcb4..580a760 100644 --- a/packages/cli/tests/data.test.ts +++ b/packages/cli/tests/data.test.ts @@ -233,6 +233,29 @@ describe("preAnalyzeDirectory", () => { expect(result!.analysis.isUnique).toBe(false); }); + test("reports a synthesized source_record_id via the out-param for JSON-Lines input", async () => { + // JSON-Lines (one array per line) with no id column → source_record_id is synthesized. + fs.writeFileSync( + path.join(tmpDir, "jsonl.jsonl"), + `[{"trial_index":0},{"trial_index":1}]\n[{"trial_index":0}]` + ); + + const stats: { synthesizedSourceRecordId?: boolean } = {}; + await preAnalyzeDirectory(tmpDir, ["trial_index"], stats); + expect(stats.synthesizedSourceRecordId).toBe(true); + }); + + test("does not report a synthesized source_record_id for a single JSON array", async () => { + fs.writeFileSync( + path.join(tmpDir, "single.json"), + JSON.stringify([{ trial_index: 0 }, { trial_index: 1 }]) + ); + + const stats: { synthesizedSourceRecordId?: boolean } = {}; + await preAnalyzeDirectory(tmpDir, ["trial_index"], stats); + expect(stats.synthesizedSourceRecordId).toBeUndefined(); + }); + test("parses CSV data files as well as JSON", async () => { fs.writeFileSync(path.join(tmpDir, "dupes.csv"), "trial_index\n0\n0"); diff --git a/packages/frontend/src/pages/DataUpload.tsx b/packages/frontend/src/pages/DataUpload.tsx index c3ab049..070c925 100644 --- a/packages/frontend/src/pages/DataUpload.tsx +++ b/packages/frontend/src/pages/DataUpload.tsx @@ -190,15 +190,15 @@ const DataUpload: React.FC = ({ if (type !== 'json') continue; if (name === 'dataset_description.json' || name.endsWith('/dataset_description.json')) continue; try { - // Tag a per-line participant_id for JSON-Lines (a no-op for a single array). - const parsed = parseJsonData(content, { tagParticipantId: true }); + // Tag a per-line source_record_id for JSON-Lines (a no-op for a single array). 
+ const parsed = parseJsonData(content, { tagSourceRecordId: true }); if (!Array.isArray(parsed) || parsed.length === 0) continue; - // Mirror generate()'s join-key promotion so a multi-participant JSON-Lines file isn't - // wrongly flagged: trial_index alone repeats across participants, but a synthesized - // participant_id makes (participant_id, trial_index) unique. - const keys = parsed.some((row: any) => row && typeof row === 'object' && 'participant_id' in row) - ? ['participant_id', 'trial_index'] - : ['trial_index']; + // Mirror generate()'s join-key promotion so a multi-record JSON-Lines file isn't wrongly + // flagged: trial_index alone repeats across records, but the identifier column (a + // synthesized source_record_id, else a real participant_id) makes (id, trial_index) unique. + const idColumn = (['source_record_id', 'participant_id'] as const).find((col) => + parsed.some((row: any) => row && typeof row === 'object' && col in row)); + const keys = idColumn ? [idColumn, 'trial_index'] : ['trial_index']; const analysis = analyzeJoinKeys(parsed, keys); if (!analysis.isUnique) { setJoinKeyProblemFile(name); @@ -278,9 +278,9 @@ const DataUpload: React.FC = ({ let mainRows: Array> = []; let mainContent: string | undefined; if (type === 'json') { - // Tag a per-line participant_id for JSON-Lines (a no-op for a single array) so the + // Tag a per-line source_record_id for JSON-Lines (a no-op for a single array) so the // main CSV carries the same join-key column generate() promotes for the sidecars. 
- const json = parseJsonData(content, { tagParticipantId: true }); + const json = parseJsonData(content, { tagSourceRecordId: true }); if (!Array.isArray(json)) { update(i, { status: 'skipped', detail: 'not a jsPsych trial array' }); continue; diff --git a/packages/metadata/src/index.ts b/packages/metadata/src/index.ts index e9236b7..a1e17ad 100644 --- a/packages/metadata/src/index.ts +++ b/packages/metadata/src/index.ts @@ -438,18 +438,18 @@ export default class JsPsychMetadata { parsed_data = await parseCSV(data); } - let synthesizedParticipantId = false; + let synthesizedSourceRecordId = false; if (ext === 'json') { // Accepts both a single JSON array (standard jsPsych export) and JSON-Lines, // where each line is its own JSON value (JATOS exports one participant array per line). - // Tag JSON-Lines rows with a per-line participant_id: raw jsPsych exports carry no - // per-row participant identifier, so in multi-participant JSONL trial_index alone - // repeats across participants and can't uniquely key the extracted sidecar CSVs. - // The stat records whether we actually invented the id (vs. it already being present), - // so we only describe it as synthetic when it truly is. - const parseStats: { synthesizedParticipantId?: boolean } = {}; - parsed_data = parseJsonData(data, { tagParticipantId: true }, parseStats); - synthesizedParticipantId = parseStats.synthesizedParticipantId === true; + // Tag JSON-Lines rows with a per-line source_record_id: raw jsPsych exports carry no + // per-row identifier, so in multi-record JSONL trial_index alone repeats across records + // and can't uniquely key the extracted sidecar CSVs. The stat records whether we actually + // invented the id (vs. the data already carrying one), so we only describe it as + // synthetic when it truly is. 
+ const parseStats: { synthesizedSourceRecordId?: boolean } = {}; + parsed_data = parseJsonData(data, { tagSourceRecordId: true }, parseStats); + synthesizedSourceRecordId = parseStats.synthesizedSourceRecordId === true; } if (!Array.isArray(parsed_data)) { @@ -469,17 +469,21 @@ export default class JsPsychMetadata { ); } - // When JSON rows carry a participant_id — synthesized per line from JSON-Lines above, or - // already present in the export — promote it to the leading join key (unless the caller - // already listed it). Raw jsPsych exports otherwise have no per-row participant identifier, - // so trial_index alone repeats across participants and can't uniquely key the extracted - // sidecar CSVs; (participant_id, trial_index, …) restores a one-trial-per-key join. CSV - // inputs are left untouched, preserving existing behaviour for tabular sources. - const hasParticipantId = ext === 'json' && - (parsed_data as Array>).some( - (row) => row && typeof row === 'object' && 'participant_id' in row); - if (hasParticipantId && !this.arrayJoinKeys.includes('participant_id')) { - this.arrayJoinKeys = ['participant_id', ...this.arrayJoinKeys]; + // When JSON rows carry an identifier column, promote it to the leading join key (unless the + // caller already listed it). Prefer source_record_id (synthesized per line from JSON-Lines + // above) and otherwise fall back to a real participant_id already present in the export. Raw + // jsPsych exports otherwise have no per-row identifier, so trial_index alone repeats across + // records and can't uniquely key the extracted sidecar CSVs; (id, trial_index, …) restores a + // one-trial-per-key join. CSV inputs are left untouched, preserving existing behaviour for + // tabular sources. + const rows = parsed_data as Array>; + const hasColumn = (col: string) => + ext === 'json' && rows.some((row) => row && typeof row === 'object' && col in row); + const idColumn = hasColumn('source_record_id') ? 
'source_record_id' + : hasColumn('participant_id') ? 'participant_id' + : undefined; + if (idColumn && !this.arrayJoinKeys.includes(idColumn)) { + this.arrayJoinKeys = [idColumn, ...this.arrayJoinKeys]; } // Callers that already surface join-key uniqueness to the user (e.g. the CLI's @@ -492,7 +496,7 @@ export default class JsPsychMetadata { await this.generateObservation(observation); } - // Only when WE synthesized participant_id (it wasn't in the source) do we own its + // Only when WE synthesized source_record_id (it wasn't in the source) do we own its // description. As an identifier/join-key column it isn't plugin-documented, so per-trial // processing leaves it with only "unknown" plugin descriptions that getList() strips to an // empty {} (an object with no @type → OBJECT_TYPE_MISSING). Give it one explicit @@ -500,11 +504,11 @@ export default class JsPsychMetadata { // mistakes it for a real subject ID. A pre-existing participant_id is left untouched — its // meaning is the experiment's, not ours. Done before updateMetadata so a caller-supplied // metadata override still wins. - if (synthesizedParticipantId && this.containsVariable('participant_id')) { - const existing = this.getVariable('participant_id') as VariableFields; + if (synthesizedSourceRecordId && this.containsVariable('source_record_id')) { + const existing = this.getVariable('source_record_id') as VariableFields; this.setVariable({ ...existing, - description: { default: 'Synthetic participant identifier (0-based), assigned one per source record (one participant per JSON-Lines line) because the raw data carried no participant column. NOT a real subject ID from the experiment — it only orders/links records as they appeared in the source file, and serves as a join key connecting each trial to its extracted array/object rows.' 
}, + description: { default: 'Synthetic source-record identifier (0-based), assigned one per source record (one JSON-Lines line, which is usually but not always one participant) because the raw data carried no identifier column. NOT a real subject ID from the experiment — it only orders/links records as they appeared in the source file, and serves as a join key connecting each trial to its extracted array/object rows.' }, }); } @@ -951,7 +955,7 @@ export default class JsPsychMetadata { // Declare the join-key columns this table carries that aren't known yet: element_index, plus // any ancestor element-index keys passed down from an enclosing array (qualified - // ".element_index"). Pre-existing keys (trial_index, participant_id, …) are already + // ".element_index"). Pre-existing keys (trial_index, source_record_id, …) are already // declared and are skipped. if (!this.containsVariable("element_index")) { this.setVariable({ diff --git a/packages/metadata/src/utils.ts b/packages/metadata/src/utils.ts index b77c8ea..5581395 100644 --- a/packages/metadata/src/utils.ts +++ b/packages/metadata/src/utils.ts @@ -89,20 +89,21 @@ export function tryParseJSON(value: string): any | null { * back to line-by-line parsing, flattening any per-line arrays into one observation * stream. Throws a descriptive error when the input is neither valid JSON nor valid JSONL. * - * When `tagParticipantId` is set, `stats.synthesizedParticipantId` is set to true iff a - * participant_id was actually stamped onto at least one row (i.e. the data did not already - * carry one). Callers use this to describe the column honestly — a synthesized id is not a - * real subject identifier and must not be presented as one. + * When `tagSourceRecordId` is set, `stats.synthesizedSourceRecordId` is set to true iff a + * source_record_id was actually stamped onto at least one row (i.e. the data did not already + * carry a source_record_id or a real participant_id). 
Callers use this to describe the column + * honestly — a synthesized id marks the source record/line, not a real subject identifier, and + * must not be presented as one. */ export function parseJsonData( content: string, - options: { tagParticipantId?: boolean } = {}, - stats?: { synthesizedParticipantId?: boolean } + options: { tagSourceRecordId?: boolean } = {}, + stats?: { synthesizedSourceRecordId?: boolean } ): any { // Fast path: a single, well-formed JSON document. Covers the standard single array // (including pretty-printed/multi-line) with no behaviour change for existing callers. - // Note: tagParticipantId never applies here — a single document has no line boundaries - // to identify participants by, so its rows are returned untouched. + // Note: tagSourceRecordId never applies here — a single document has no line boundaries + // to identify source records by, so its rows are returned untouched. const whole = tryParseJSON(content); if (whole !== null) return whole; @@ -111,7 +112,7 @@ export function parseJsonData( const lines = content.split(/\r?\n/); const out: any[] = []; let parsedAny = false; - let participantIndex = 0; + let recordIndex = 0; for (let i = 0; i < lines.length; i++) { const line = lines[i].trim(); if (!line) continue; @@ -125,22 +126,25 @@ export function parseJsonData( } parsedAny = true; const observations = Array.isArray(value) ? value : [value]; - // In JSON-Lines each line is one participant's submission (JATOS-style export). The line - // boundary is the only participant identifier these raw jsPsych exports carry, so — when - // asked — stamp every object observation from this line with a 0-based participant_id - // before that boundary is lost in the flattened stream. This lets nested array/object - // extraction form a unique (participant_id, trial_index) join key. Existing participant_id - // values are left untouched; non-object lines (bare primitives) can't carry the tag. 
- if (options.tagParticipantId) { + // In JSON-Lines each line is typically one participant's submission (JATOS-style export), + // but a line is only guaranteed to be one *source record* — the per-line boundary is the + // only identifier these raw jsPsych exports carry. So — when asked — stamp every object + // observation from this line with a 0-based source_record_id before that boundary is lost + // in the flattened stream. This lets nested array/object extraction form a unique + // (source_record_id, trial_index) join key. Rows that already carry a source_record_id or a + // real participant_id are left untouched (the experiment's own id already groups them); + // non-object lines (bare primitives) can't carry the tag. + if (options.tagSourceRecordId) { for (const obs of observations) { - if (obs !== null && typeof obs === "object" && !Array.isArray(obs) && !("participant_id" in obs)) { - obs.participant_id = participantIndex; - if (stats) stats.synthesizedParticipantId = true; + if (obs !== null && typeof obs === "object" && !Array.isArray(obs) && + !("source_record_id" in obs) && !("participant_id" in obs)) { + obs.source_record_id = recordIndex; + if (stats) stats.synthesizedSourceRecordId = true; } } } out.push(...observations); - participantIndex++; + recordIndex++; } if (!parsedAny) { diff --git a/packages/metadata/tests/jsonl-ingestion.test.ts b/packages/metadata/tests/jsonl-ingestion.test.ts index 772acd0..59262a5 100644 --- a/packages/metadata/tests/jsonl-ingestion.test.ts +++ b/packages/metadata/tests/jsonl-ingestion.test.ts @@ -44,32 +44,42 @@ describe("parseJsonData", () => { }); }); -describe("parseJsonData participant_id tagging", () => { - // Raw jsPsych exports carry no per-row participant identifier; for multi-participant - // JSON-Lines the line boundary is the only one available, so tagParticipantId stamps a - // 0-based participant_id per line. 
This lets nested array/object extraction form a unique - // (participant_id, trial_index) join key. - test("tags a per-line participant_id across JSON-Lines records", () => { +describe("parseJsonData source_record_id tagging", () => { + // Raw jsPsych exports carry no per-row identifier; for multi-record JSON-Lines the line + // boundary is the only one available, so tagSourceRecordId stamps a 0-based source_record_id + // per line (a line is usually one participant, but only guaranteed to be one source record). + // This lets nested array/object extraction form a unique (source_record_id, trial_index) join key. + test("tags a per-line source_record_id across JSON-Lines records", () => { const p1 = [{ trial_index: 0 }, { trial_index: 1 }]; const p2 = [{ trial_index: 0 }]; const jsonl = `${JSON.stringify(p1)}\n${JSON.stringify(p2)}`; - expect(parseJsonData(jsonl, { tagParticipantId: true })).toEqual([ - { trial_index: 0, participant_id: 0 }, - { trial_index: 1, participant_id: 0 }, - { trial_index: 0, participant_id: 1 }, + expect(parseJsonData(jsonl, { tagSourceRecordId: true })).toEqual([ + { trial_index: 0, source_record_id: 0 }, + { trial_index: 1, source_record_id: 0 }, + { trial_index: 0, source_record_id: 1 }, ]); }); test("leaves a single JSON array untouched (no line boundaries to tag by)", () => { const rows = [{ trial_index: 0 }, { trial_index: 1 }]; - expect(parseJsonData(JSON.stringify(rows), { tagParticipantId: true })).toEqual(rows); + expect(parseJsonData(JSON.stringify(rows), { tagSourceRecordId: true })).toEqual(rows); }); - test("does not overwrite an existing participant_id", () => { + test("does not overwrite an existing source_record_id", () => { + const jsonl = `[{"source_record_id":"R7","trial_index":0}]\n[{"trial_index":0}]`; + expect(parseJsonData(jsonl, { tagSourceRecordId: true })).toEqual([ + { source_record_id: "R7", trial_index: 0 }, + { trial_index: 0, source_record_id: 1 }, + ]); + }); + + test("defers to a real participant_id: tags 
neither the row that has one", () => { + // A line that already carries a real participant_id is left as-is (the experiment's id + // already groups it); other lines still get a synthesized source_record_id. const jsonl = `[{"participant_id":"P7","trial_index":0}]\n[{"trial_index":0}]`; - expect(parseJsonData(jsonl, { tagParticipantId: true })).toEqual([ + expect(parseJsonData(jsonl, { tagSourceRecordId: true })).toEqual([ { participant_id: "P7", trial_index: 0 }, - { trial_index: 0, participant_id: 1 }, + { trial_index: 0, source_record_id: 1 }, ]); }); @@ -107,9 +117,9 @@ describe("generate() ingests JSON-Lines end to end", () => { expect(rt.maxValue).toBe(720); }); - test("synthesizes participant_id so multi-participant JSON-Lines sidecars join uniquely", async () => { - // Both participants restart trial_index at 0, so without a participant identifier the two - // trial-0 view_history rows would collide on (trial_index, element_index). + test("synthesizes source_record_id so multi-record JSON-Lines sidecars join uniquely", async () => { + // Both records restart trial_index at 0, so without a per-line identifier the two trial-0 + // view_history rows would collide on (trial_index, element_index). const p1 = [{ trial_type: "html-keyboard-response", trial_index: 0, view_history: [{ page: 0 }, { page: 1 }] }]; const p2 = [{ trial_type: "html-keyboard-response", trial_index: 0, view_history: [{ page: 0 }] }]; const jsonl = `${JSON.stringify(p1)}\n${JSON.stringify(p2)}`; @@ -117,29 +127,29 @@ describe("generate() ingests JSON-Lines end to end", () => { const meta = new JsPsychMetadata(); await meta.generate(jsonl, {}, "json"); - // participant_id is promoted to the leading join key. - expect(meta.getArrayJoinKeys()).toEqual(["participant_id", "trial_index"]); + // source_record_id is promoted to the leading join key. 
+ expect(meta.getArrayJoinKeys()).toEqual(["source_record_id", "trial_index"]); // It serialises with a plain-text description (not an empty {} that would trip // Psych-DS's OBJECT_TYPE_MISSING) that makes its synthetic origin unmistakable, so a // downstream user can't mistake it for a real subject ID. - const pid = meta.getMetadata().variableMeasured.find((v: any) => v.name === "participant_id"); - expect(typeof pid.description).toBe("string"); - expect(pid.description.toLowerCase()).toContain("synthetic"); - expect(pid.description.toLowerCase()).toContain("not a real subject id"); + const sid = meta.getMetadata().variableMeasured.find((v: any) => v.name === "source_record_id"); + expect(typeof sid.description).toBe("string"); + expect(sid.description.toLowerCase()).toContain("synthetic"); + expect(sid.description.toLowerCase()).toContain("not a real subject id"); - // Every extracted view_history row carries participant_id, so the composite key is unique. + // Every extracted view_history row carries source_record_id, so the composite key is unique. const rows = meta.getExtractedArrays().get("view_history") as Array>; expect(rows.length).toBe(3); - const keyset = new Set(rows.map((r) => `${r.participant_id}|${r.trial_index}|${r.element_index}`)); + const keyset = new Set(rows.map((r) => `${r.source_record_id}|${r.trial_index}|${r.element_index}`)); expect(keyset.size).toBe(rows.length); - expect(rows.map((r) => r.participant_id).sort()).toEqual([0, 0, 1]); + expect(rows.map((r) => r.source_record_id).sort()).toEqual([0, 0, 1]); }); - test("does not relabel a real participant_id already present in the data", async () => { + test("uses a real participant_id as the join key and does not synthesize a source_record_id", async () => { // Each line already carries its own participant_id — a real identifier. Promotion should - // still use it as a join key, but we must not overwrite it with a "synthetic" description - // (that would misrepresent a genuine subject ID). 
+ // use it as the join key, no source_record_id should be synthesized, and nothing should be + // relabeled with a "synthetic" description (that would misrepresent a genuine subject ID). const p1 = [{ trial_type: "html-keyboard-response", trial_index: 0, participant_id: "sub-007" }]; const p2 = [{ trial_type: "html-keyboard-response", trial_index: 0, participant_id: "sub-008" }]; const jsonl = `${JSON.stringify(p1)}\n${JSON.stringify(p2)}`; @@ -148,12 +158,13 @@ describe("generate() ingests JSON-Lines end to end", () => { await meta.generate(jsonl, {}, "json"); expect(meta.getArrayJoinKeys()).toEqual(["participant_id", "trial_index"]); + expect(meta.containsVariable("source_record_id")).toBe(false); const pid = meta.getMetadata().variableMeasured.find((v: any) => v.name === "participant_id"); const desc = typeof pid.description === "string" ? pid.description : JSON.stringify(pid.description); expect(desc.toLowerCase()).not.toContain("synthetic"); }); - test("does not promote participant_id for a single-array export that lacks one", async () => { + test("does not promote an identifier for a single-array export that lacks one", async () => { const rows = [ { trial_type: "html-keyboard-response", trial_index: 0, rt: 1 }, { trial_type: "html-keyboard-response", trial_index: 1, rt: 2 },