diff --git a/.changeset/drop-unnamed-columns.md b/.changeset/drop-unnamed-columns.md new file mode 100644 index 0000000..c7a270f --- /dev/null +++ b/.changeset/drop-unnamed-columns.md @@ -0,0 +1,14 @@ +--- +"@jspsych/metadata": patch +"@jspsych/metadata-cli": patch +"frontend": patch +--- + +Drop unnamed columns so R-exported datasets validate. R's `write.csv` (with the default `row.names = TRUE`) prepends an unnamed row-index column, so the exported CSV header starts with a bare comma — an empty-string column name. Psych-DS variables require a name, so the column can never appear in `variableMeasured`; left in the on-disk CSV it fails validation with `CSV_COLUMN_MISSING_FROM_METADATA`. + +The strip now lives in the shared data-file path so the CLI and frontend behave identically: + +- `generate()` strips empty/whitespace-only columns from the parsed data up front, with a single warning instead of per-row spam (keeps `variableMeasured` clean and standalone library use safe), via a new exported `stripUnnamedColumns` helper. +- `buildPsychDSDataFiles` strips the main table before emitting it: a clean CSV keeps its exact bytes (verbatim `mainContent`), while a file with an unnamed column is re-serialised from the cleaned rows. Both the CLI (rename-plan and non-plan paths) and the frontend feed parsed `mainRows`, so the written/zipped/validated CSV always matches the metadata. + +Fixes finding #2 of #109. diff --git a/packages/cli/src/data.ts b/packages/cli/src/data.ts index 9ea0d0d..f10460f 100644 --- a/packages/cli/src/data.ts +++ b/packages/cli/src/data.ts @@ -1,6 +1,6 @@ import fs from "fs"; import path from "path"; -import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, objectsToCSV, isValidPsychDSDataFilename, buildPsychDSDataFiles, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "@jspsych/metadata"; +import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, objectsToCSV, isValidPsychDSDataFilename, buildPsychDSDataFiles, stripUnnamedColumns, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "@jspsych/metadata"; import { expandHomeDir, disambiguateFilename, fileStem } from "./utils"; import { PlannedFile } from "./rename"; @@ -350,6 +350,11 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil parsed = json; } + // Rows for the main table, fed to the shared builder / inline write. JSON is already + // parsed above; parse CSV here too so unnamed row-index columns can be stripped (a CSV's + // exact bytes are still preserved via mainContent when nothing is dropped). + const mainRows: Array> = parsed ?? ((await parseCSV(content)) as Array>); + // Resolve the Psych-DS base (keyword-value sequence before "_data.csv"). The index.ts // pre-pass supplies a base for every source file; fall back to the file's own stem when // called directly (e.g. in tests). Never invent a keyword here — if the resolved base @@ -436,9 +441,12 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil }; const mainName = reserve(planned.mainName, 'main output'); + // Drop R-style unnamed row-index columns (same as the shared builder / generate()). A clean + // CSV keeps its exact bytes; JSON or a dirty CSV is serialised from the cleaned rows. + const { rows: cleanedMain, dropped: droppedMain } = stripUnnamedColumns(mainRows); await fs.promises.writeFile( path.join(targetDirectoryPath, mainName), - parsed ? objectsToCSV(parsed, ['trial_index']) : content, + (parsed === null && droppedMain.length === 0) ? content : objectsToCSV(cleanedMain, ['trial_index']), ); if (verbose) console.log(` → wrote ${file} as ${mainName}`); @@ -458,7 +466,7 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil // disambiguation, and CSV building — the same implementation the browser flow uses. const built = buildPsychDSDataFiles({ base, - mainRows: parsed ?? [], + mainRows, mainContent: parsed ? undefined : content, extractedArrays, extractedObjects, diff --git a/packages/cli/tests/data.test.ts b/packages/cli/tests/data.test.ts index 8526d58..f0ffac1 100644 --- a/packages/cli/tests/data.test.ts +++ b/packages/cli/tests/data.test.ts @@ -491,6 +491,52 @@ describe("processDirectory JSON → CSV conversion", () => { processDirectory(new JsPsychMetadata(), srcDir, false, dataDir, { normalizedBases, renamePlan }) ).rejects.toThrow(RenamePlanError); }); + + // #109 finding 2: R's write.csv prepends an unnamed row-index column (header starts with a bare + // comma). The written CSV must drop it so it matches variableMeasured, on both write paths. + test("non-planned path drops an unnamed leading column from the written CSV", async () => { + const srcDir = path.join(tmpDir, "src"); + const dataDir = path.join(tmpDir, "data"); + fs.mkdirSync(srcDir); + fs.mkdirSync(dataDir); + fs.writeFileSync( + path.join(srcDir, "subject-1_data.csv"), + ",trial_type,rt\n1,jsPsych-html-keyboard-response,450\n2,jsPsych-html-keyboard-response,512", + ); + + const metadata = new JsPsychMetadata(); + await processDirectory(metadata, srcDir, false, dataDir); + + const csv = fs.readFileSync(path.join(dataDir, "subject-1_data.csv"), "utf8"); + const header = csv.split(/\r?\n/)[0].split(","); + expect(header).not.toContain(""); + expect(header).toEqual(expect.arrayContaining(["trial_type", "rt"])); + // On-disk header and variableMeasured agree — the whole point. + const names = (metadata.getMetadata()["variableMeasured"] as any[]).map((v) => v.name); + for (const col of header) expect(names).toContain(col); + }); + + test("rename-plan path also drops an unnamed leading column", async () => { + const srcDir = path.join(tmpDir, "src"); + const dataDir = path.join(tmpDir, "data"); + fs.mkdirSync(srcDir); + fs.mkdirSync(dataDir); + fs.writeFileSync( + path.join(srcDir, "raw.csv"), + ",trial_type,rt\n1,jsPsych-html-keyboard-response,450\n2,jsPsych-html-keyboard-response,512", + ); + + const key = path.resolve(srcDir, "raw.csv"); + const columns = await analyzeOutputColumns(srcDir); + const normalizedBases = new Map([[key, "subject-9"]]); + const renamePlan = planRenames(columns.map((c) => ({ key: c.key, base: "subject-9", arrayColumns: c.arrayColumns, objectColumns: c.objectColumns }))); + + const { failed } = await processDirectory(new JsPsychMetadata(), srcDir, false, dataDir, { normalizedBases, renamePlan }); + + expect(failed).toBe(0); + const csv = fs.readFileSync(path.join(dataDir, "subject-9_data.csv"), "utf8"); + expect(csv.split(/\r?\n/)[0].split(",")).not.toContain(""); + }); }); // #109 finding #3: a fully-flagged headless run must not block on the interactive join-key prompt. diff --git a/packages/frontend/src/pages/DataUpload.tsx b/packages/frontend/src/pages/DataUpload.tsx index aa36edf..85439ec 100644 --- a/packages/frontend/src/pages/DataUpload.tsx +++ b/packages/frontend/src/pages/DataUpload.tsx @@ -1,6 +1,6 @@ import { useState, useRef, useEffect } from 'react'; import JSZip from 'jszip'; -import JsPsychMetadata, { analyzeJoinKeys, deriveFallbackBase, buildPsychDSDataFiles, isValidPsychDSDataFilename, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from '@jspsych/metadata'; +import JsPsychMetadata, { analyzeJoinKeys, deriveFallbackBase, buildPsychDSDataFiles, isValidPsychDSDataFilename, parseCSV, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from '@jspsych/metadata'; import PageHeader from '../components/PageHeader'; import styles from './DataUpload.module.css'; @@ -276,6 +276,9 @@ const DataUpload: React.FC = ({ mainRows = json; } else { mainContent = content; + // Parse CSV rows too so the builder can drop R-style unnamed row-index columns; a clean + // CSV still keeps its exact bytes (mainContent is used verbatim when nothing is dropped). + mainRows = (await parseCSV(content)) as Array>; } const built = buildPsychDSDataFiles({ diff --git a/packages/frontend/tests/dataUploadConversion.test.ts b/packages/frontend/tests/dataUploadConversion.test.ts new file mode 100644 index 0000000..23f8771 --- /dev/null +++ b/packages/frontend/tests/dataUploadConversion.test.ts @@ -0,0 +1,30 @@ +import { parseCSV, buildPsychDSDataFiles, deriveFallbackBase } from "@jspsych/metadata"; + +// Mirrors the CSV branch of DataUpload.runGenerate: parse the uploaded CSV into mainRows and +// hand it to the shared builder. Guards the frontend wiring (the parseCSV call + builder usage) +// that lets an R-style export — with an unnamed row-index column — produce a clean converted +// data/*.csv, without rendering the stateful upload component. +describe("frontend CSV → Psych-DS conversion (the runGenerate path)", () => { + it("drops an unnamed row-index column from the converted main CSV", async () => { + const content = ",trial_type,rt\n1,jsPsych-html-keyboard-response,450\n2,jsPsych-html-keyboard-response,512"; + const mainRows = (await parseCSV(content)) as Array>; + + const built = buildPsychDSDataFiles({ + base: deriveFallbackBase("sub01"), + mainRows, + mainContent: content, + }); + + const main = built.find((f) => f.kind === "main")!; + const header = main.content.split(/\r?\n/)[0].split(","); + expect(header).not.toContain(""); + expect(header).toEqual(["trial_type", "rt"]); + }); + + it("keeps a well-formed CSV's bytes verbatim", async () => { + const content = "trial_type,rt\njsPsych-html-keyboard-response,450"; + const mainRows = (await parseCSV(content)) as Array>; + const built = buildPsychDSDataFiles({ base: deriveFallbackBase("sub01"), mainRows, mainContent: content }); + expect(built.find((f) => f.kind === "main")!.content).toBe(content); + }); +}); diff --git a/packages/metadata/src/index.ts b/packages/metadata/src/index.ts index 7a7edee..0d64edd 100644 --- a/packages/metadata/src/index.ts +++ b/packages/metadata/src/index.ts @@ -1,6 +1,6 @@ import { AuthorFields, AuthorsMap } from "./AuthorsMap"; import { PluginCache } from "./PluginCache"; -import { saveTextToFile, parseCSV, tryParseJSON, analyzeJoinKeys, JoinKeyAnalysis, SYSTEM_COLUMNS } from "./utils"; +import { saveTextToFile, parseCSV, tryParseJSON, analyzeJoinKeys, JoinKeyAnalysis, SYSTEM_COLUMNS, stripUnnamedColumns } from "./utils"; import { VariableFields, VariablesMap } from "./VariablesMap"; /** @@ -446,6 +446,19 @@ export default class JsPsychMetadata { throw new Error("Parsed data is not in correct format: Expected an array of observations"); } + // Drop unnamed columns (empty/whitespace-only headers) before processing. These can't be + // represented in variableMeasured (Psych-DS requires a name) and are typically R's row-index + // column. Stripping here removes them from variableMeasured and avoids a per-row warning in + // setVariable. buildPsychDSDataFiles mirrors this on the written CSV so file and metadata sync. + const { dropped } = stripUnnamedColumns(parsed_data as Array>); + if (dropped.length > 0) { + console.warn( + `Dropped ${dropped.length} unnamed column${dropped.length > 1 ? "s" : ""} from the data — ` + + `Psych-DS requires every column to have a name (usually a row-index column added by R's ` + + `write.csv). Excluded from variableMeasured.` + ); + } + // Callers that already surface join-key uniqueness to the user (e.g. the CLI's // interactive pre-analysis prompt) can suppress this warning to avoid repeating it // once per file. @@ -1034,5 +1047,5 @@ export { AuthorFields, VariableFields } -export { analyzeJoinKeys, parseCSV, isValidPsychDSDataFilename, toPsychDSValue, deriveArrayFilename, objectsToCSV, disambiguateArrayFilename, deriveFallbackBase, buildPsychDSDataFiles, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "./utils"; +export { analyzeJoinKeys, parseCSV, isValidPsychDSDataFilename, toPsychDSValue, deriveArrayFilename, objectsToCSV, disambiguateArrayFilename, deriveFallbackBase, buildPsychDSDataFiles, stripUnnamedColumns, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "./utils"; export type { JoinKeyAnalysis, PsychDSDataFile, BuildPsychDSDataFilesArgs } from "./utils"; diff --git a/packages/metadata/src/utils.ts b/packages/metadata/src/utils.ts index 49f36aa..279cf15 100644 --- a/packages/metadata/src/utils.ts +++ b/packages/metadata/src/utils.ts @@ -300,6 +300,33 @@ export function disambiguateArrayFilename(base: string, used: Set): stri return candidate; } +/** + * Removes columns whose name is empty or whitespace-only from every row, in place, + * and reports which names were dropped. R's `write.csv` (with the default + * `row.names = TRUE`) prepends an unnamed row-index column, which surfaces as an + * empty-string ("") header. Such a column can never be represented in a Psych-DS + * `variableMeasured` entry (a name is required), so leaving it in produces a dataset + * that fails validation with CSV_COLUMN_MISSING_FROM_METADATA. Dropping it up front — + * once, rather than warning per row — keeps the generated metadata and the written + * CSV consistent. Returns the same `rows` reference for convenient chaining. + */ +export function stripUnnamedColumns( + rows: Array> +): { rows: Array>; dropped: string[] } { + const unnamed = new Set(); + for (const row of rows) { + for (const key of Object.keys(row)) { + if (key.trim() === "") unnamed.add(key); + } + } + if (unnamed.size > 0) { + for (const row of rows) { + for (const key of unnamed) delete row[key]; + } + } + return { rows, dropped: [...unnamed] }; +} + /** A single converted Psych-DS output file produced by {@link buildPsychDSDataFiles}. */ export interface PsychDSDataFile { /** Psych-DS-compliant filename, relative to the `data/` directory. */ @@ -313,12 +340,17 @@ export interface PsychDSDataFile { export interface BuildPsychDSDataFilesArgs { /** Compliant filename base (keyword-value sequence before `_data.csv`), e.g. "id-sub01". */ base: string; - /** Parsed rows of the main data file. Serialised to CSV unless `mainContent` is given. */ + /** + * Parsed rows of the main data file. Serialised to CSV unless `mainContent` is given and + * no unnamed columns are dropped. Always supply this (parse CSV inputs too) so unnamed + * row-index columns can be detected and stripped. + */ mainRows: Array>; /** - * Pre-rendered CSV for the main file, used verbatim instead of serialising `mainRows`. - * Pass this when the source is already CSV so its exact bytes (column order, quoting) are - * preserved; `mainRows` may be empty in that case. + * Pre-rendered CSV for the main file, used verbatim instead of serialising `mainRows` — + * but only when no unnamed columns are dropped. Pass this when the source is already CSV + * so a clean file keeps its exact bytes (column order, quoting); a file with an unnamed + * column is re-serialised from the cleaned `mainRows` instead. */ mainContent?: string; /** Array-column rows keyed by column name (from `JsPsychMetadata.getExtractedArrays`). */ @@ -369,9 +401,15 @@ export function buildPsychDSDataFiles(args: BuildPsychDSDataFilesArgs): PsychDSD // Main table. Disambiguate up front so a later file sharing this base doesn't overwrite it. const mainName = reserve(disambiguateArrayFilename(`${base}_data.csv`, usedArrayFilenames)); + // Drop unnamed columns (R's row-index artifact) so the written CSV matches variableMeasured, + // which generate() also strips. A clean CSV input keeps its exact bytes (mainContent verbatim); + // a dirty one is re-serialised from the cleaned rows, as is JSON (no mainContent given). + const { rows: cleanedMainRows, dropped: droppedMain } = stripUnnamedColumns(mainRows); out.push({ filename: mainName, - content: mainContent ?? objectsToCSV(mainRows, ['trial_index']), + content: (mainContent !== undefined && droppedMain.length === 0) + ? mainContent + : objectsToCSV(cleanedMainRows, ['trial_index']), kind: 'main', }); diff --git a/packages/metadata/tests/metadata-module.test.ts b/packages/metadata/tests/metadata-module.test.ts index 6108ec7..4acea11 100644 --- a/packages/metadata/tests/metadata-module.test.ts +++ b/packages/metadata/tests/metadata-module.test.ts @@ -806,3 +806,68 @@ describe("boolean vs string true/false levels", () => { } }); }); + +describe("unnamed columns are dropped (#109 finding 2)", () => { + let jsPsychMetadata: JsPsychMetadata; + let warnSpy: jest.SpyInstance; + + beforeEach(() => { + jsPsychMetadata = new JsPsychMetadata(); + warnSpy = jest.spyOn(console, "warn").mockImplementation(() => {}); + }); + + afterEach(() => { + warnSpy.mockRestore(); + }); + + // R's write.csv(row.names=TRUE) prepends an unnamed row-index column, so the header + // starts with a bare comma -> an empty-string ("") column name. + test("an empty-header (row-index) column is excluded from variableMeasured", async () => { + const csv = [ + ",trial_type,rt,response", + "1,jsPsych-html-keyboard-response,450,f", + "2,jsPsych-html-keyboard-response,512,j", + "3,jsPsych-html-keyboard-response,389,f", + ].join("\n"); + + await jsPsychMetadata.generate(csv, {}, "csv"); + + const variableMeasured = jsPsychMetadata.getMetadata()["variableMeasured"] as any[]; + const outputNames = variableMeasured.map((v) => v.name); + + expect(outputNames).not.toContain(""); + expect(outputNames).toEqual(expect.arrayContaining(["trial_type", "rt", "response"])); + }); + + test("warns once, not once per row, when an unnamed column is dropped", async () => { + const csv = [ + ",trial_type,rt", + "1,jsPsych-html-keyboard-response,450", + "2,jsPsych-html-keyboard-response,512", + "3,jsPsych-html-keyboard-response,389", + "4,jsPsych-html-keyboard-response,401", + ].join("\n"); + + await jsPsychMetadata.generate(csv, {}, "csv"); + + const dropWarnings = warnSpy.mock.calls.filter((args) => + String(args[0]).includes("unnamed column") + ); + expect(dropWarnings).toHaveLength(1); + }); + + test("well-formed CSV with no unnamed columns does not warn", async () => { + const csv = [ + "trial_type,rt", + "jsPsych-html-keyboard-response,450", + "jsPsych-html-keyboard-response,512", + ].join("\n"); + + await jsPsychMetadata.generate(csv, {}, "csv"); + + const dropWarnings = warnSpy.mock.calls.filter((args) => + String(args[0]).includes("unnamed column") + ); + expect(dropWarnings).toHaveLength(0); + }); +}); diff --git a/packages/metadata/tests/psychds-data-files.test.ts b/packages/metadata/tests/psychds-data-files.test.ts index 6a786ee..cf8d912 100644 --- a/packages/metadata/tests/psychds-data-files.test.ts +++ b/packages/metadata/tests/psychds-data-files.test.ts @@ -2,6 +2,7 @@ import { deriveFallbackBase, buildPsychDSDataFiles, isValidPsychDSDataFilename, + stripUnnamedColumns, } from "../src/utils"; describe("deriveFallbackBase", () => { @@ -74,3 +75,65 @@ describe("buildPsychDSDataFiles", () => { expect(() => buildPsychDSDataFiles({ base: "Bad Base", mainRows: [{ trial_index: 0 }] })).toThrow(); }); }); + +describe("stripUnnamedColumns", () => { + it("removes empty and whitespace-only keys from every row and reports them", () => { + const rows = [ + { "": "1", " ": "x", trial_index: 0, rt: 200 }, + { "": "2", " ": "y", trial_index: 1, rt: 350 }, + ]; + const { rows: out, dropped } = stripUnnamedColumns(rows); + expect(dropped.sort()).toEqual(["", " "]); + expect(out).toBe(rows); // mutates in place, returns same reference + expect(Object.keys(out[0])).toEqual(["trial_index", "rt"]); + }); + + it("is a no-op (empty dropped list) when all columns are named", () => { + const rows = [{ trial_index: 0, rt: 200 }]; + const { dropped } = stripUnnamedColumns(rows); + expect(dropped).toEqual([]); + expect(Object.keys(rows[0])).toEqual(["trial_index", "rt"]); + }); +}); + +describe("buildPsychDSDataFiles drops unnamed columns (#109 finding 2)", () => { + // R's write.csv(row.names=TRUE) prepends an unnamed row-index column. The builder must drop it + // so the written main CSV matches variableMeasured (generate() drops it from the metadata). + it("re-serialises a CSV main table without the unnamed column, ignoring verbatim mainContent", () => { + const rows = [ + { "": "1", trial_type: "x", rt: 450 }, + { "": "2", trial_type: "x", rt: 512 }, + ]; + const built = buildPsychDSDataFiles({ + base: "subject-sub01", + mainRows: rows, + mainContent: ",trial_type,rt\n1,x,450\n2,x,512", // dirty verbatim — must NOT be used as-is + }); + const main = built.find((f) => f.kind === "main")!; + const header = main.content.split(/\r?\n/)[0].split(","); + expect(header).not.toContain(""); + expect(header).toEqual(["trial_type", "rt"]); + }); + + it("keeps a clean CSV's mainContent byte-for-byte (nothing dropped)", () => { + const verbatim = "trial_type,rt\nx,450\nx,512"; + const built = buildPsychDSDataFiles({ + base: "subject-sub01", + mainRows: [ + { trial_type: "x", rt: 450 }, + { trial_type: "x", rt: 512 }, + ], + mainContent: verbatim, + }); + expect(built.find((f) => f.kind === "main")!.content).toBe(verbatim); + }); + + it("drops unnamed columns from JSON-sourced main rows too", () => { + const built = buildPsychDSDataFiles({ + base: "subject-sub01", + mainRows: [{ "": "1", trial_index: 0, rt: 1 }], + }); + const header = built.find((f) => f.kind === "main")!.content.split(/\r?\n/)[0].split(","); + expect(header).not.toContain(""); + }); +});