Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .changeset/drop-unnamed-columns.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
"@jspsych/metadata": patch
"@jspsych/metadata-cli": patch
---

Drop unnamed columns instead of producing a dataset that can't validate. R's `write.csv` (with the default `row.names = TRUE`) prepends an unnamed row-index column, so the exported CSV header starts with a bare comma — an empty-string column name. Psych-DS variables require a name, so the column could never appear in `variableMeasured`; the library previously skipped it (logging `Name field is missing. Variable not added.` once per row) while the CLI copied the CSV verbatim, leaving a column in the file with no metadata entry and failing validation with `CSV_COLUMN_MISSING_FROM_METADATA`.

`generate()` now strips empty/whitespace-only columns from the parsed data up front, with a single warning instead of per-row spam, via a new exported `stripUnnamedColumns` helper. The CLI mirrors this when writing data files: a `.csv` input that contains unnamed columns is re-serialised without them (column order preserved) so the on-disk file matches `variableMeasured`; well-formed CSVs are still copied byte-for-byte. Fixes finding #2 of #109.
25 changes: 21 additions & 4 deletions packages/cli/src/data.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import fs from "fs";
import path from "path";
import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, deriveArrayFilename, disambiguateArrayFilename, objectsToCSV, isValidPsychDSDataFilename } from "@jspsych/metadata";
import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, deriveArrayFilename, disambiguateArrayFilename, objectsToCSV, isValidPsychDSDataFilename, stripUnnamedColumns } from "@jspsych/metadata";
import { expandHomeDir, disambiguateFilename, fileStem } from "./utils";
import { PlannedFile } from "./rename";

Expand Down Expand Up @@ -331,12 +331,29 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil
}
await fs.promises.writeFile(path.join(rawDir, rawName), content);

// Mirror generate()'s unnamed-column drop so the converted CSV matches variableMeasured.
stripUnnamedColumns(parsed);
await fs.promises.writeFile(path.join(targetDirectoryPath, mainName), objectsToCSV(parsed, ['trial_index']));
if (verbose) console.log(` → converted ${file} to ${mainName} (raw saved to raw/${rawName})`);
} else {
// .csv input is already CSV; write it out under its Psych-DS-compliant name.
await fs.promises.writeFile(path.join(targetDirectoryPath, mainName), content);
if (verbose) console.log(` → wrote ${file} as ${mainName}`);
// .csv input is already CSV. R-style exports (write.csv with row.names=TRUE) prepend an
// unnamed row-index column whose empty header can't be represented in variableMeasured;
// generate() drops it from the metadata, so the written CSV must drop it too or the dataset
// fails validation (CSV_COLUMN_MISSING_FROM_METADATA). When unnamed columns are present we
// re-serialise the cleaned rows (preserving column order); otherwise we copy verbatim so
// well-formed files are written byte-for-byte unchanged.
const csvRows = (await parseCSV(content)) as Array<Record<string, any>>;
const { rows: cleanedRows, dropped } = stripUnnamedColumns(csvRows);
if (dropped.length > 0) {
console.log(
` ! dropped ${dropped.length} unnamed column${dropped.length > 1 ? "s" : ""} from "${file}" ` +
`(row-index artifact); writing cleaned CSV as ${mainName}.`
);
await fs.promises.writeFile(path.join(targetDirectoryPath, mainName), objectsToCSV(cleanedRows, []));
} else {
await fs.promises.writeFile(path.join(targetDirectoryPath, mainName), content);
if (verbose) console.log(` → wrote ${file} as ${mainName}`);
}
}

// Sidecar CSVs: one per array-of-objects column and one per plain-object column
Expand Down
63 changes: 62 additions & 1 deletion packages/cli/tests/data.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import fs from "fs";
import os from "os";
import path from "path";
import JsPsychMetadata from "@jspsych/metadata";
import JsPsychMetadata, { stripUnnamedColumns } from "@jspsych/metadata";
import { processOptions, loadMetadata, saveTextToPath, processDirectory, preAnalyzeDirectory, enumerateDataFiles, analyzeOutputColumns, RenamePlanError } from "../src/data";
import { planRenames } from "../src/rename";

Expand Down Expand Up @@ -491,4 +491,65 @@ describe("processDirectory JSON → CSV conversion", () => {
processDirectory(new JsPsychMetadata(), srcDir, false, dataDir, { normalizedBases, renamePlan })
).rejects.toThrow(RenamePlanError);
});

// #109 finding 2: R's write.csv prepends an unnamed row-index column (header starts with a
// bare comma). generate() drops it from variableMeasured, so the written CSV must drop it too
// or the dataset fails validation with CSV_COLUMN_MISSING_FROM_METADATA.
test("drops an unnamed leading column from the written CSV so it matches variableMeasured", async () => {
const srcDir = path.join(tmpDir, "src");
const dataDir = path.join(tmpDir, "data");
fs.mkdirSync(srcDir);
fs.mkdirSync(dataDir);
// Leading bare comma => empty-string header, exactly as R's write.csv(row.names=TRUE) emits.
fs.writeFileSync(
path.join(srcDir, "subject-1_data.csv"),
",trial_type,rt\n1,jsPsych-html-keyboard-response,450\n2,jsPsych-html-keyboard-response,512",
);

const metadata = new JsPsychMetadata();
await processDirectory(metadata, srcDir, false, dataDir);

const csv = fs.readFileSync(path.join(dataDir, "subject-1_data.csv"), "utf8");
const header = csv.split(/\r?\n/)[0].split(",");
// No empty column name survives, and the real columns remain.
expect(header).not.toContain("");
expect(header).toEqual(expect.arrayContaining(["trial_type", "rt"]));

// The on-disk header and variableMeasured agree (the whole point — a validating dataset).
const names = (metadata.getMetadata()["variableMeasured"] as any[]).map((v) => v.name);
for (const col of header) expect(names).toContain(col);
});

test("copies a well-formed CSV byte-for-byte (no unnamed columns to drop)", async () => {
const srcDir = path.join(tmpDir, "src");
const dataDir = path.join(tmpDir, "data");
fs.mkdirSync(srcDir);
fs.mkdirSync(dataDir);
const original = "trial_type,rt\njsPsych-html-keyboard-response,450\njsPsych-html-keyboard-response,512";
fs.writeFileSync(path.join(srcDir, "subject-1_data.csv"), original);

await processDirectory(new JsPsychMetadata(), srcDir, false, dataDir);

expect(fs.readFileSync(path.join(dataDir, "subject-1_data.csv"), "utf8")).toBe(original);
});
});

describe("stripUnnamedColumns", () => {
test("removes empty and whitespace-only keys from every row and reports them", () => {
const rows = [
{ "": "1", " ": "x", trial_index: 0, rt: 200 },
{ "": "2", " ": "y", trial_index: 1, rt: 350 },
];
const { rows: out, dropped } = stripUnnamedColumns(rows);
expect(dropped.sort()).toEqual(["", " "]);
expect(out).toBe(rows); // mutates in place, returns same reference
expect(Object.keys(out[0])).toEqual(["trial_index", "rt"]);
});

test("is a no-op (empty dropped list) when all columns are named", () => {
const rows = [{ trial_index: 0, rt: 200 }];
const { dropped } = stripUnnamedColumns(rows);
expect(dropped).toEqual([]);
expect(Object.keys(rows[0])).toEqual(["trial_index", "rt"]);
});
});
17 changes: 15 additions & 2 deletions packages/metadata/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { AuthorFields, AuthorsMap } from "./AuthorsMap";
import { PluginCache } from "./PluginCache";
import { saveTextToFile, parseCSV, tryParseJSON, analyzeJoinKeys, JoinKeyAnalysis, SYSTEM_COLUMNS } from "./utils";
import { saveTextToFile, parseCSV, tryParseJSON, analyzeJoinKeys, JoinKeyAnalysis, SYSTEM_COLUMNS, stripUnnamedColumns } from "./utils";
import { VariableFields, VariablesMap } from "./VariablesMap";

/**
Expand Down Expand Up @@ -454,6 +454,19 @@ export default class JsPsychMetadata {
throw new Error("Parsed data is not in correct format: Expected an array of observations");
}

// Drop unnamed columns (empty/whitespace-only headers) before processing. These can't be
// represented in variableMeasured (Psych-DS requires a name) and are typically R's row-index
// column. Stripping here removes them from variableMeasured and avoids a per-row warning in
// setVariable. The CLI mirrors this on the written CSV so the file and metadata stay in sync.
const { dropped } = stripUnnamedColumns(parsed_data as Array<Record<string, any>>);
if (dropped.length > 0) {
console.warn(
`Dropped ${dropped.length} unnamed column${dropped.length > 1 ? "s" : ""} from the data — ` +
`Psych-DS requires every column to have a name (usually a row-index column added by R's ` +
`write.csv). Excluded from variableMeasured.`
);
}

// Callers that already surface join-key uniqueness to the user (e.g. the CLI's
// interactive pre-analysis prompt) can suppress this warning to avoid repeating it
// once per file.
Expand Down Expand Up @@ -1019,5 +1032,5 @@ export {
AuthorFields,
VariableFields
}
export { analyzeJoinKeys, parseCSV, isValidPsychDSDataFilename, toPsychDSValue, deriveArrayFilename, objectsToCSV, disambiguateArrayFilename } from "./utils";
export { analyzeJoinKeys, parseCSV, isValidPsychDSDataFilename, toPsychDSValue, deriveArrayFilename, objectsToCSV, disambiguateArrayFilename, stripUnnamedColumns } from "./utils";
export type { JoinKeyAnalysis } from "./utils";
27 changes: 27 additions & 0 deletions packages/metadata/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,33 @@ export function objectsToCSV(rows: Array<Record<string, any>>, priorityCols: str
return lines.join('\r\n');
}

/**
* Removes columns whose name is empty or whitespace-only from every row, in place,
* and reports which names were dropped. R's `write.csv` (with the default
* `row.names = TRUE`) prepends an unnamed row-index column, which surfaces as an
* empty-string ("") header. Such a column can never be represented in a Psych-DS
* `variableMeasured` entry (a name is required), so leaving it in produces a dataset
* that fails validation with CSV_COLUMN_MISSING_FROM_METADATA. Dropping it up front —
* once, rather than warning per row — keeps the generated metadata and the written
* CSV consistent. Returns the same `rows` reference for convenient chaining.
*/
export function stripUnnamedColumns(
rows: Array<Record<string, any>>
): { rows: Array<Record<string, any>>; dropped: string[] } {
const unnamed = new Set<string>();
for (const row of rows) {
for (const key of Object.keys(row)) {
if (key.trim() === "") unnamed.add(key);
}
}
if (unnamed.size > 0) {
for (const row of rows) {
for (const key of unnamed) delete row[key];
}
}
return { rows, dropped: [...unnamed] };
}

/**
* Returns a filename not already present in `used`. If `base` is free it is
* returned as-is; otherwise a counter is appended before the `_data.csv`
Expand Down
66 changes: 66 additions & 0 deletions packages/metadata/tests/metadata-module.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,72 @@ describe("variableMeasured completeness for CSV input", () => {
});
});

describe("unnamed columns are dropped (#109 finding 2)", () => {
let jsPsychMetadata: JsPsychMetadata;
let warnSpy: jest.SpyInstance;

beforeEach(() => {
jsPsychMetadata = new JsPsychMetadata();
warnSpy = jest.spyOn(console, "warn").mockImplementation(() => {});
});

afterEach(() => {
warnSpy.mockRestore();
});

// R's write.csv(row.names=TRUE) prepends an unnamed row-index column, so the header
// starts with a bare comma -> an empty-string ("") column name.
test("an empty-header (row-index) column is excluded from variableMeasured", async () => {
const csv = [
",trial_type,rt,response",
"1,jsPsych-html-keyboard-response,450,f",
"2,jsPsych-html-keyboard-response,512,j",
"3,jsPsych-html-keyboard-response,389,f",
].join("\n");

await jsPsychMetadata.generate(csv, {}, "csv");

const variableMeasured = jsPsychMetadata.getMetadata()["variableMeasured"] as any[];
const outputNames = variableMeasured.map((v) => v.name);

expect(outputNames).not.toContain("");
// The real columns are still present.
expect(outputNames).toEqual(expect.arrayContaining(["trial_type", "rt", "response"]));
});

test("warns once, not once per row, when an unnamed column is dropped", async () => {
const csv = [
",trial_type,rt",
"1,jsPsych-html-keyboard-response,450",
"2,jsPsych-html-keyboard-response,512",
"3,jsPsych-html-keyboard-response,389",
"4,jsPsych-html-keyboard-response,401",
].join("\n");

await jsPsychMetadata.generate(csv, {}, "csv");

const dropWarnings = warnSpy.mock.calls.filter((args) =>
String(args[0]).includes("unnamed column")
);
expect(dropWarnings).toHaveLength(1);
});

test("well-formed CSV with no unnamed columns does not warn", async () => {
const csv = [
"trial_type,rt",
"jsPsych-html-keyboard-response,450",
"jsPsych-html-keyboard-response,512",
].join("\n");

await jsPsychMetadata.generate(csv, {}, "csv");

const dropWarnings = warnSpy.mock.calls.filter((args) =>
String(args[0]).includes("unnamed column")
);
expect(dropWarnings).toHaveLength(0);
});
});

describe("JsPsychMetadata field operations", () => {
let jsPsychMetadata: JsPsychMetadata;

Expand Down
Loading