From 57b5bcf121076d7d71d2f1d621d142830231e5d6 Mon Sep 17 00:00:00 2001 From: Mandyx22 <1915537307@qq.com> Date: Tue, 16 Jun 2026 16:49:12 -0400 Subject: [PATCH] fix(metadata,cli): drop unnamed columns so R-exported CSVs validate (#109) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R's write.csv(row.names=TRUE) prepends an unnamed row-index column, so the header starts with a bare comma (an empty-string column name). Such a column can't appear in variableMeasured (Psych-DS requires a name), so the library skipped it while warning once per row, and the CLI copied the CSV verbatim — leaving a column with no metadata and failing validation with CSV_COLUMN_MISSING_FROM_METADATA. generate() now strips empty/whitespace-only columns from parsed data up front with a single warning, via a new exported stripUnnamedColumns helper. The CLI re-serialises a .csv input that has unnamed columns (preserving column order) so the written file matches variableMeasured; well-formed CSVs are still copied byte-for-byte. Fixes finding #2 of #109. Co-Authored-By: Claude Opus 4.8 --- .changeset/drop-unnamed-columns.md | 8 +++ packages/cli/src/data.ts | 25 +++++-- packages/cli/tests/data.test.ts | 63 +++++++++++++++++- packages/metadata/src/index.ts | 17 ++++- packages/metadata/src/utils.ts | 27 ++++++++ .../metadata/tests/metadata-module.test.ts | 66 +++++++++++++++++++ 6 files changed, 199 insertions(+), 7 deletions(-) create mode 100644 .changeset/drop-unnamed-columns.md diff --git a/.changeset/drop-unnamed-columns.md b/.changeset/drop-unnamed-columns.md new file mode 100644 index 0000000..1b6117b --- /dev/null +++ b/.changeset/drop-unnamed-columns.md @@ -0,0 +1,8 @@ +--- +"@jspsych/metadata": patch +"@jspsych/metadata-cli": patch +--- + +Drop unnamed columns instead of producing a dataset that can't validate. R's `write.csv` (with the default `row.names = TRUE`) prepends an unnamed row-index column, so the exported CSV header starts with a bare comma — an empty-string column name. Psych-DS variables require a name, so the column could never appear in `variableMeasured`; the library previously skipped it (logging `Name field is missing. Variable not added.` once per row) while the CLI copied the CSV verbatim, leaving a column in the file with no metadata entry and failing validation with `CSV_COLUMN_MISSING_FROM_METADATA`. + +`generate()` now strips empty/whitespace-only columns from the parsed data up front, with a single warning instead of per-row spam, via a new exported `stripUnnamedColumns` helper. The CLI mirrors this when writing data files: a `.csv` input that contains unnamed columns is re-serialised without them (column order preserved) so the on-disk file matches `variableMeasured`; well-formed CSVs are still copied byte-for-byte. Fixes finding #2 of #109. diff --git a/packages/cli/src/data.ts b/packages/cli/src/data.ts index 46f60b8..e701ca3 100644 --- a/packages/cli/src/data.ts +++ b/packages/cli/src/data.ts @@ -1,6 +1,6 @@ import fs from "fs"; import path from "path"; -import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, deriveArrayFilename, disambiguateArrayFilename, objectsToCSV, isValidPsychDSDataFilename } from "@jspsych/metadata"; +import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, deriveArrayFilename, disambiguateArrayFilename, objectsToCSV, isValidPsychDSDataFilename, stripUnnamedColumns } from "@jspsych/metadata"; import { expandHomeDir, disambiguateFilename, fileStem } from "./utils"; import { PlannedFile } from "./rename"; @@ -331,12 +331,29 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil } await fs.promises.writeFile(path.join(rawDir, rawName), content); + // Mirror generate()'s unnamed-column drop so the converted CSV matches variableMeasured. + stripUnnamedColumns(parsed); await fs.promises.writeFile(path.join(targetDirectoryPath, mainName), objectsToCSV(parsed, ['trial_index'])); if (verbose) console.log(` → converted ${file} to ${mainName} (raw saved to raw/${rawName})`); } else { - // .csv input is already CSV; write it out under its Psych-DS-compliant name. - await fs.promises.writeFile(path.join(targetDirectoryPath, mainName), content); - if (verbose) console.log(` → wrote ${file} as ${mainName}`); + // .csv input is already CSV. R-style exports (write.csv with row.names=TRUE) prepend an + // unnamed row-index column whose empty header can't be represented in variableMeasured; + // generate() drops it from the metadata, so the written CSV must drop it too or the dataset + // fails validation (CSV_COLUMN_MISSING_FROM_METADATA). When unnamed columns are present we + // re-serialise the cleaned rows (preserving column order); otherwise we copy verbatim so + // well-formed files are written byte-for-byte unchanged. + const csvRows = (await parseCSV(content)) as Array>; + const { rows: cleanedRows, dropped } = stripUnnamedColumns(csvRows); + if (dropped.length > 0) { + console.log( + ` ! dropped ${dropped.length} unnamed column${dropped.length > 1 ? "s" : ""} from "${file}" ` + + `(row-index artifact); writing cleaned CSV as ${mainName}.` + ); + await fs.promises.writeFile(path.join(targetDirectoryPath, mainName), objectsToCSV(cleanedRows, [])); + } else { + await fs.promises.writeFile(path.join(targetDirectoryPath, mainName), content); + if (verbose) console.log(` → wrote ${file} as ${mainName}`); + } } // Sidecar CSVs: one per array-of-objects column and one per plain-object column diff --git a/packages/cli/tests/data.test.ts b/packages/cli/tests/data.test.ts index e14ff01..c0b1b05 100644 --- a/packages/cli/tests/data.test.ts +++ b/packages/cli/tests/data.test.ts @@ -1,7 +1,7 @@ import fs from "fs"; import os from "os"; import path from "path"; -import JsPsychMetadata from "@jspsych/metadata"; +import JsPsychMetadata, { stripUnnamedColumns } from "@jspsych/metadata"; import { processOptions, loadMetadata, saveTextToPath, processDirectory, preAnalyzeDirectory, enumerateDataFiles, analyzeOutputColumns, RenamePlanError } from "../src/data"; import { planRenames } from "../src/rename"; @@ -491,4 +491,65 @@ describe("processDirectory JSON → CSV conversion", () => { processDirectory(new JsPsychMetadata(), srcDir, false, dataDir, { normalizedBases, renamePlan }) ).rejects.toThrow(RenamePlanError); }); + + // #109 finding 2: R's write.csv prepends an unnamed row-index column (header starts with a + // bare comma). generate() drops it from variableMeasured, so the written CSV must drop it too + // or the dataset fails validation with CSV_COLUMN_MISSING_FROM_METADATA. + test("drops an unnamed leading column from the written CSV so it matches variableMeasured", async () => { + const srcDir = path.join(tmpDir, "src"); + const dataDir = path.join(tmpDir, "data"); + fs.mkdirSync(srcDir); + fs.mkdirSync(dataDir); + // Leading bare comma => empty-string header, exactly as R's write.csv(row.names=TRUE) emits. + fs.writeFileSync( + path.join(srcDir, "subject-1_data.csv"), + ",trial_type,rt\n1,jsPsych-html-keyboard-response,450\n2,jsPsych-html-keyboard-response,512", + ); + + const metadata = new JsPsychMetadata(); + await processDirectory(metadata, srcDir, false, dataDir); + + const csv = fs.readFileSync(path.join(dataDir, "subject-1_data.csv"), "utf8"); + const header = csv.split(/\r?\n/)[0].split(","); + // No empty column name survives, and the real columns remain. + expect(header).not.toContain(""); + expect(header).toEqual(expect.arrayContaining(["trial_type", "rt"])); + + // The on-disk header and variableMeasured agree (the whole point — a validating dataset). + const names = (metadata.getMetadata()["variableMeasured"] as any[]).map((v) => v.name); + for (const col of header) expect(names).toContain(col); + }); + + test("copies a well-formed CSV byte-for-byte (no unnamed columns to drop)", async () => { + const srcDir = path.join(tmpDir, "src"); + const dataDir = path.join(tmpDir, "data"); + fs.mkdirSync(srcDir); + fs.mkdirSync(dataDir); + const original = "trial_type,rt\njsPsych-html-keyboard-response,450\njsPsych-html-keyboard-response,512"; + fs.writeFileSync(path.join(srcDir, "subject-1_data.csv"), original); + + await processDirectory(new JsPsychMetadata(), srcDir, false, dataDir); + + expect(fs.readFileSync(path.join(dataDir, "subject-1_data.csv"), "utf8")).toBe(original); + }); +}); + +describe("stripUnnamedColumns", () => { + test("removes empty and whitespace-only keys from every row and reports them", () => { + const rows = [ + { "": "1", " ": "x", trial_index: 0, rt: 200 }, + { "": "2", " ": "y", trial_index: 1, rt: 350 }, + ]; + const { rows: out, dropped } = stripUnnamedColumns(rows); + expect(dropped.sort()).toEqual(["", " "]); + expect(out).toBe(rows); // mutates in place, returns same reference + expect(Object.keys(out[0])).toEqual(["trial_index", "rt"]); + }); + + test("is a no-op (empty dropped list) when all columns are named", () => { + const rows = [{ trial_index: 0, rt: 200 }]; + const { dropped } = stripUnnamedColumns(rows); + expect(dropped).toEqual([]); + expect(Object.keys(rows[0])).toEqual(["trial_index", "rt"]); + }); }); diff --git a/packages/metadata/src/index.ts b/packages/metadata/src/index.ts index 68d6ab8..8f505de 100644 --- a/packages/metadata/src/index.ts +++ b/packages/metadata/src/index.ts @@ -1,6 +1,6 @@ import { AuthorFields, AuthorsMap } from "./AuthorsMap"; import { PluginCache } from "./PluginCache"; -import { saveTextToFile, parseCSV, tryParseJSON, analyzeJoinKeys, JoinKeyAnalysis, SYSTEM_COLUMNS } from "./utils"; +import { saveTextToFile, parseCSV, tryParseJSON, analyzeJoinKeys, JoinKeyAnalysis, SYSTEM_COLUMNS, stripUnnamedColumns } from "./utils"; import { VariableFields, VariablesMap } from "./VariablesMap"; /** @@ -454,6 +454,19 @@ export default class JsPsychMetadata { throw new Error("Parsed data is not in correct format: Expected an array of observations"); } + // Drop unnamed columns (empty/whitespace-only headers) before processing. These can't be + // represented in variableMeasured (Psych-DS requires a name) and are typically R's row-index + // column. Stripping here removes them from variableMeasured and avoids a per-row warning in + // setVariable. The CLI mirrors this on the written CSV so the file and metadata stay in sync. + const { dropped } = stripUnnamedColumns(parsed_data as Array>); + if (dropped.length > 0) { + console.warn( + `Dropped ${dropped.length} unnamed column${dropped.length > 1 ? "s" : ""} from the data — ` + + `Psych-DS requires every column to have a name (usually a row-index column added by R's ` + + `write.csv). Excluded from variableMeasured.` + ); + } + // Callers that already surface join-key uniqueness to the user (e.g. the CLI's // interactive pre-analysis prompt) can suppress this warning to avoid repeating it // once per file. @@ -1019,5 +1032,5 @@ export { AuthorFields, VariableFields } -export { analyzeJoinKeys, parseCSV, isValidPsychDSDataFilename, toPsychDSValue, deriveArrayFilename, objectsToCSV, disambiguateArrayFilename } from "./utils"; +export { analyzeJoinKeys, parseCSV, isValidPsychDSDataFilename, toPsychDSValue, deriveArrayFilename, objectsToCSV, disambiguateArrayFilename, stripUnnamedColumns } from "./utils"; export type { JoinKeyAnalysis } from "./utils"; diff --git a/packages/metadata/src/utils.ts b/packages/metadata/src/utils.ts index 0f269fc..4ca9022 100644 --- a/packages/metadata/src/utils.ts +++ b/packages/metadata/src/utils.ts @@ -250,6 +250,33 @@ export function objectsToCSV(rows: Array>, priorityCols: str return lines.join('\r\n'); } +/** + * Removes columns whose name is empty or whitespace-only from every row, in place, + * and reports which names were dropped. R's `write.csv` (with the default + * `row.names = TRUE`) prepends an unnamed row-index column, which surfaces as an + * empty-string ("") header. Such a column can never be represented in a Psych-DS + * `variableMeasured` entry (a name is required), so leaving it in produces a dataset + * that fails validation with CSV_COLUMN_MISSING_FROM_METADATA. Dropping it up front — + * once, rather than warning per row — keeps the generated metadata and the written + * CSV consistent. Returns the same `rows` reference for convenient chaining. + */ +export function stripUnnamedColumns( + rows: Array> +): { rows: Array>; dropped: string[] } { + const unnamed = new Set(); + for (const row of rows) { + for (const key of Object.keys(row)) { + if (key.trim() === "") unnamed.add(key); + } + } + if (unnamed.size > 0) { + for (const row of rows) { + for (const key of unnamed) delete row[key]; + } + } + return { rows, dropped: [...unnamed] }; +} + /** * Returns a filename not already present in `used`. If `base` is free it is * returned as-is; otherwise a counter is appended before the `_data.csv` diff --git a/packages/metadata/tests/metadata-module.test.ts b/packages/metadata/tests/metadata-module.test.ts index 6551a23..f6dc397 100644 --- a/packages/metadata/tests/metadata-module.test.ts +++ b/packages/metadata/tests/metadata-module.test.ts @@ -207,6 +207,72 @@ describe("variableMeasured completeness for CSV input", () => { }); }); +describe("unnamed columns are dropped (#109 finding 2)", () => { + let jsPsychMetadata: JsPsychMetadata; + let warnSpy: jest.SpyInstance; + + beforeEach(() => { + jsPsychMetadata = new JsPsychMetadata(); + warnSpy = jest.spyOn(console, "warn").mockImplementation(() => {}); + }); + + afterEach(() => { + warnSpy.mockRestore(); + }); + + // R's write.csv(row.names=TRUE) prepends an unnamed row-index column, so the header + // starts with a bare comma -> an empty-string ("") column name. + test("an empty-header (row-index) column is excluded from variableMeasured", async () => { + const csv = [ + ",trial_type,rt,response", + "1,jsPsych-html-keyboard-response,450,f", + "2,jsPsych-html-keyboard-response,512,j", + "3,jsPsych-html-keyboard-response,389,f", + ].join("\n"); + + await jsPsychMetadata.generate(csv, {}, "csv"); + + const variableMeasured = jsPsychMetadata.getMetadata()["variableMeasured"] as any[]; + const outputNames = variableMeasured.map((v) => v.name); + + expect(outputNames).not.toContain(""); + // The real columns are still present. + expect(outputNames).toEqual(expect.arrayContaining(["trial_type", "rt", "response"])); + }); + + test("warns once, not once per row, when an unnamed column is dropped", async () => { + const csv = [ + ",trial_type,rt", + "1,jsPsych-html-keyboard-response,450", + "2,jsPsych-html-keyboard-response,512", + "3,jsPsych-html-keyboard-response,389", + "4,jsPsych-html-keyboard-response,401", + ].join("\n"); + + await jsPsychMetadata.generate(csv, {}, "csv"); + + const dropWarnings = warnSpy.mock.calls.filter((args) => + String(args[0]).includes("unnamed column") + ); + expect(dropWarnings).toHaveLength(1); + }); + + test("well-formed CSV with no unnamed columns does not warn", async () => { + const csv = [ + "trial_type,rt", + "jsPsych-html-keyboard-response,450", + "jsPsych-html-keyboard-response,512", + ].join("\n"); + + await jsPsychMetadata.generate(csv, {}, "csv"); + + const dropWarnings = warnSpy.mock.calls.filter((args) => + String(args[0]).includes("unnamed column") + ); + expect(dropWarnings).toHaveLength(0); + }); +}); + describe("JsPsychMetadata field operations", () => { let jsPsychMetadata: JsPsychMetadata;