Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .changeset/drop-unnamed-columns.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
"@jspsych/metadata": patch
"@jspsych/metadata-cli": patch
"frontend": patch
---

Drop unnamed columns so R-exported datasets validate. R's `write.csv` (with the default `row.names = TRUE`) prepends an unnamed row-index column, so the exported CSV header starts with a bare comma — an empty-string column name. Psych-DS variables require a name, so the column can never appear in `variableMeasured`; left in the on-disk CSV it fails validation with `CSV_COLUMN_MISSING_FROM_METADATA`.

The strip now lives in the shared data-file path so the CLI and frontend behave identically:

- `generate()` strips empty/whitespace-only columns from the parsed data up front, with a single warning instead of per-row spam (keeps `variableMeasured` clean and standalone library use safe), via a new exported `stripUnnamedColumns` helper.
- `buildPsychDSDataFiles` strips the main table before emitting it: a clean CSV keeps its exact bytes (verbatim `mainContent`), while a file with an unnamed column is re-serialised from the cleaned rows. Both the CLI (rename-plan and non-plan paths) and the frontend feed parsed `mainRows`, so the written/zipped/validated CSV always matches the metadata.

Fixes finding #2 of #109.
14 changes: 11 additions & 3 deletions packages/cli/src/data.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import fs from "fs";
import path from "path";
import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, objectsToCSV, isValidPsychDSDataFilename, buildPsychDSDataFiles, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "@jspsych/metadata";
import JsPsychMetadata, { analyzeJoinKeys, JoinKeyAnalysis, parseCSV, objectsToCSV, isValidPsychDSDataFilename, buildPsychDSDataFiles, stripUnnamedColumns, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "@jspsych/metadata";
import { expandHomeDir, disambiguateFilename, fileStem } from "./utils";
import { PlannedFile } from "./rename";

Expand Down Expand Up @@ -350,6 +350,11 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil
parsed = json;
}

// Rows for the main table, fed to the shared builder / inline write. JSON is already
// parsed above; parse CSV here too so unnamed row-index columns can be stripped (a CSV's
// exact bytes are still preserved via mainContent when nothing is dropped).
const mainRows: Array<Record<string, any>> = parsed ?? ((await parseCSV(content)) as Array<Record<string, any>>);

// Resolve the Psych-DS base (keyword-value sequence before "_data.csv"). The index.ts
// pre-pass supplies a base for every source file; fall back to the file's own stem when
// called directly (e.g. in tests). Never invent a keyword here — if the resolved base
Expand Down Expand Up @@ -436,9 +441,12 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil
};

const mainName = reserve(planned.mainName, 'main output');
// Drop R-style unnamed row-index columns (same as the shared builder / generate()). A clean
// CSV keeps its exact bytes; JSON or a dirty CSV is serialised from the cleaned rows.
const { rows: cleanedMain, dropped: droppedMain } = stripUnnamedColumns(mainRows);
await fs.promises.writeFile(
path.join(targetDirectoryPath, mainName),
parsed ? objectsToCSV(parsed, ['trial_index']) : content,
(parsed === null && droppedMain.length === 0) ? content : objectsToCSV(cleanedMain, ['trial_index']),
);
if (verbose) console.log(` → wrote ${file} as ${mainName}`);

Expand All @@ -458,7 +466,7 @@ const processFile = async (metadata: JsPsychMetadata, directoryPath: string, fil
// disambiguation, and CSV building — the same implementation the browser flow uses.
const built = buildPsychDSDataFiles({
base,
mainRows: parsed ?? [],
mainRows,
mainContent: parsed ? undefined : content,
extractedArrays,
extractedObjects,
Expand Down
46 changes: 46 additions & 0 deletions packages/cli/tests/data.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,52 @@ describe("processDirectory JSON → CSV conversion", () => {
processDirectory(new JsPsychMetadata(), srcDir, false, dataDir, { normalizedBases, renamePlan })
).rejects.toThrow(RenamePlanError);
});

// #109 finding 2: R's write.csv prepends an unnamed row-index column (header starts with a bare
// comma). The written CSV must drop it so it matches variableMeasured, on both write paths.
test("non-planned path drops an unnamed leading column from the written CSV", async () => {
const srcDir = path.join(tmpDir, "src");
const dataDir = path.join(tmpDir, "data");
fs.mkdirSync(srcDir);
fs.mkdirSync(dataDir);
fs.writeFileSync(
path.join(srcDir, "subject-1_data.csv"),
",trial_type,rt\n1,jsPsych-html-keyboard-response,450\n2,jsPsych-html-keyboard-response,512",
);

const metadata = new JsPsychMetadata();
await processDirectory(metadata, srcDir, false, dataDir);

const csv = fs.readFileSync(path.join(dataDir, "subject-1_data.csv"), "utf8");
const header = csv.split(/\r?\n/)[0].split(",");
expect(header).not.toContain("");
expect(header).toEqual(expect.arrayContaining(["trial_type", "rt"]));
// On-disk header and variableMeasured agree — the whole point.
const names = (metadata.getMetadata()["variableMeasured"] as any[]).map((v) => v.name);
for (const col of header) expect(names).toContain(col);
});

test("rename-plan path also drops an unnamed leading column", async () => {
const srcDir = path.join(tmpDir, "src");
const dataDir = path.join(tmpDir, "data");
fs.mkdirSync(srcDir);
fs.mkdirSync(dataDir);
fs.writeFileSync(
path.join(srcDir, "raw.csv"),
",trial_type,rt\n1,jsPsych-html-keyboard-response,450\n2,jsPsych-html-keyboard-response,512",
);

const key = path.resolve(srcDir, "raw.csv");
const columns = await analyzeOutputColumns(srcDir);
const normalizedBases = new Map([[key, "subject-9"]]);
const renamePlan = planRenames(columns.map((c) => ({ key: c.key, base: "subject-9", arrayColumns: c.arrayColumns, objectColumns: c.objectColumns })));

const { failed } = await processDirectory(new JsPsychMetadata(), srcDir, false, dataDir, { normalizedBases, renamePlan });

expect(failed).toBe(0);
const csv = fs.readFileSync(path.join(dataDir, "subject-9_data.csv"), "utf8");
expect(csv.split(/\r?\n/)[0].split(",")).not.toContain("");
});
});

// #109 finding #3: a fully-flagged headless run must not block on the interactive join-key prompt.
Expand Down
5 changes: 4 additions & 1 deletion packages/frontend/src/pages/DataUpload.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { useState, useRef, useEffect } from 'react';
import JSZip from 'jszip';
import JsPsychMetadata, { analyzeJoinKeys, deriveFallbackBase, buildPsychDSDataFiles, isValidPsychDSDataFilename, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from '@jspsych/metadata';
import JsPsychMetadata, { analyzeJoinKeys, deriveFallbackBase, buildPsychDSDataFiles, isValidPsychDSDataFilename, parseCSV, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from '@jspsych/metadata';
import PageHeader from '../components/PageHeader';
import styles from './DataUpload.module.css';

Expand Down Expand Up @@ -276,6 +276,9 @@ const DataUpload: React.FC<DataUploadProps> = ({
mainRows = json;
} else {
mainContent = content;
// Parse CSV rows too so the builder can drop R-style unnamed row-index columns; a clean
// CSV still keeps its exact bytes (mainContent is used verbatim when nothing is dropped).
mainRows = (await parseCSV(content)) as Array<Record<string, unknown>>;
}

const built = buildPsychDSDataFiles({
Expand Down
30 changes: 30 additions & 0 deletions packages/frontend/tests/dataUploadConversion.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import { parseCSV, buildPsychDSDataFiles, deriveFallbackBase } from "@jspsych/metadata";

// Mirrors the CSV branch of DataUpload.runGenerate: parse the uploaded CSV into mainRows and
// hand it to the shared builder. Guards the frontend wiring (the parseCSV call + builder usage)
// that lets an R-style export — with an unnamed row-index column — produce a clean converted
// data/*.csv, without rendering the stateful upload component.
describe("frontend CSV → Psych-DS conversion (the runGenerate path)", () => {
it("drops an unnamed row-index column from the converted main CSV", async () => {
const content = ",trial_type,rt\n1,jsPsych-html-keyboard-response,450\n2,jsPsych-html-keyboard-response,512";
const mainRows = (await parseCSV(content)) as Array<Record<string, unknown>>;

const built = buildPsychDSDataFiles({
base: deriveFallbackBase("sub01"),
mainRows,
mainContent: content,
});

const main = built.find((f) => f.kind === "main")!;
const header = main.content.split(/\r?\n/)[0].split(",");
expect(header).not.toContain("");
expect(header).toEqual(["trial_type", "rt"]);
});

it("keeps a well-formed CSV's bytes verbatim", async () => {
const content = "trial_type,rt\njsPsych-html-keyboard-response,450";
const mainRows = (await parseCSV(content)) as Array<Record<string, unknown>>;
const built = buildPsychDSDataFiles({ base: deriveFallbackBase("sub01"), mainRows, mainContent: content });
expect(built.find((f) => f.kind === "main")!.content).toBe(content);
});
});
17 changes: 15 additions & 2 deletions packages/metadata/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { AuthorFields, AuthorsMap } from "./AuthorsMap";
import { PluginCache } from "./PluginCache";
import { saveTextToFile, parseCSV, tryParseJSON, analyzeJoinKeys, JoinKeyAnalysis, SYSTEM_COLUMNS } from "./utils";
import { saveTextToFile, parseCSV, tryParseJSON, analyzeJoinKeys, JoinKeyAnalysis, SYSTEM_COLUMNS, stripUnnamedColumns } from "./utils";
import { VariableFields, VariablesMap } from "./VariablesMap";

/**
Expand Down Expand Up @@ -446,6 +446,19 @@ export default class JsPsychMetadata {
throw new Error("Parsed data is not in correct format: Expected an array of observations");
}

// Drop unnamed columns (empty/whitespace-only headers) before processing. These can't be
// represented in variableMeasured (Psych-DS requires a name) and are typically R's row-index
// column. Stripping here removes them from variableMeasured and avoids a per-row warning in
// setVariable. buildPsychDSDataFiles mirrors this on the written CSV so file and metadata sync.
const { dropped } = stripUnnamedColumns(parsed_data as Array<Record<string, any>>);
if (dropped.length > 0) {
console.warn(
`Dropped ${dropped.length} unnamed column${dropped.length > 1 ? "s" : ""} from the data — ` +
`Psych-DS requires every column to have a name (usually a row-index column added by R's ` +
`write.csv). Excluded from variableMeasured.`
);
}

// Callers that already surface join-key uniqueness to the user (e.g. the CLI's
// interactive pre-analysis prompt) can suppress this warning to avoid repeating it
// once per file.
Expand Down Expand Up @@ -1034,5 +1047,5 @@ export {
AuthorFields,
VariableFields
}
export { analyzeJoinKeys, parseCSV, isValidPsychDSDataFilename, toPsychDSValue, deriveArrayFilename, objectsToCSV, disambiguateArrayFilename, deriveFallbackBase, buildPsychDSDataFiles, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "./utils";
export { analyzeJoinKeys, parseCSV, isValidPsychDSDataFilename, toPsychDSValue, deriveArrayFilename, objectsToCSV, disambiguateArrayFilename, deriveFallbackBase, buildPsychDSDataFiles, stripUnnamedColumns, PSYCHDS_IGNORE_FILENAME, PSYCHDS_IGNORE_CONTENT } from "./utils";
export type { JoinKeyAnalysis, PsychDSDataFile, BuildPsychDSDataFilesArgs } from "./utils";
48 changes: 43 additions & 5 deletions packages/metadata/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,33 @@ export function disambiguateArrayFilename(base: string, used: Set<string>): stri
return candidate;
}

/**
* Removes columns whose name is empty or whitespace-only from every row, in place,
* and reports which names were dropped. R's `write.csv` (with the default
* `row.names = TRUE`) prepends an unnamed row-index column, which surfaces as an
* empty-string ("") header. Such a column can never be represented in a Psych-DS
* `variableMeasured` entry (a name is required), so leaving it in produces a dataset
* that fails validation with CSV_COLUMN_MISSING_FROM_METADATA. Dropping it up front —
* once, rather than warning per row — keeps the generated metadata and the written
* CSV consistent. Returns the same `rows` reference for convenient chaining.
*/
export function stripUnnamedColumns(
rows: Array<Record<string, any>>
): { rows: Array<Record<string, any>>; dropped: string[] } {
const unnamed = new Set<string>();
for (const row of rows) {
for (const key of Object.keys(row)) {
if (key.trim() === "") unnamed.add(key);
}
}
if (unnamed.size > 0) {
for (const row of rows) {
for (const key of unnamed) delete row[key];
}
}
return { rows, dropped: [...unnamed] };
}

/** A single converted Psych-DS output file produced by {@link buildPsychDSDataFiles}. */
export interface PsychDSDataFile {
/** Psych-DS-compliant filename, relative to the `data/` directory. */
Expand All @@ -313,12 +340,17 @@ export interface PsychDSDataFile {
export interface BuildPsychDSDataFilesArgs {
/** Compliant filename base (keyword-value sequence before `_data.csv`), e.g. "id-sub01". */
base: string;
/** Parsed rows of the main data file. Serialised to CSV unless `mainContent` is given. */
/**
* Parsed rows of the main data file. Serialised to CSV unless `mainContent` is given and
* no unnamed columns are dropped. Always supply this (parse CSV inputs too) so unnamed
* row-index columns can be detected and stripped.
*/
mainRows: Array<Record<string, any>>;
/**
* Pre-rendered CSV for the main file, used verbatim instead of serialising `mainRows`.
* Pass this when the source is already CSV so its exact bytes (column order, quoting) are
* preserved; `mainRows` may be empty in that case.
* Pre-rendered CSV for the main file, used verbatim instead of serialising `mainRows` —
* but only when no unnamed columns are dropped. Pass this when the source is already CSV
* so a clean file keeps its exact bytes (column order, quoting); a file with an unnamed
* column is re-serialised from the cleaned `mainRows` instead.
*/
mainContent?: string;
/** Array-column rows keyed by column name (from `JsPsychMetadata.getExtractedArrays`). */
Expand Down Expand Up @@ -369,9 +401,15 @@ export function buildPsychDSDataFiles(args: BuildPsychDSDataFilesArgs): PsychDSD

// Main table. Disambiguate up front so a later file sharing this base doesn't overwrite it.
const mainName = reserve(disambiguateArrayFilename(`${base}_data.csv`, usedArrayFilenames));
// Drop unnamed columns (R's row-index artifact) so the written CSV matches variableMeasured,
// which generate() also strips. A clean CSV input keeps its exact bytes (mainContent verbatim);
// a dirty one is re-serialised from the cleaned rows, as is JSON (no mainContent given).
const { rows: cleanedMainRows, dropped: droppedMain } = stripUnnamedColumns(mainRows);
out.push({
filename: mainName,
content: mainContent ?? objectsToCSV(mainRows, ['trial_index']),
content: (mainContent !== undefined && droppedMain.length === 0)
? mainContent
: objectsToCSV(cleanedMainRows, ['trial_index']),
kind: 'main',
});

Expand Down
65 changes: 65 additions & 0 deletions packages/metadata/tests/metadata-module.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -806,3 +806,68 @@ describe("boolean vs string true/false levels", () => {
}
});
});

describe("unnamed columns are dropped (#109 finding 2)", () => {
let jsPsychMetadata: JsPsychMetadata;
let warnSpy: jest.SpyInstance;

beforeEach(() => {
jsPsychMetadata = new JsPsychMetadata();
warnSpy = jest.spyOn(console, "warn").mockImplementation(() => {});
});

afterEach(() => {
warnSpy.mockRestore();
});

// R's write.csv(row.names=TRUE) prepends an unnamed row-index column, so the header
// starts with a bare comma -> an empty-string ("") column name.
test("an empty-header (row-index) column is excluded from variableMeasured", async () => {
const csv = [
",trial_type,rt,response",
"1,jsPsych-html-keyboard-response,450,f",
"2,jsPsych-html-keyboard-response,512,j",
"3,jsPsych-html-keyboard-response,389,f",
].join("\n");

await jsPsychMetadata.generate(csv, {}, "csv");

const variableMeasured = jsPsychMetadata.getMetadata()["variableMeasured"] as any[];
const outputNames = variableMeasured.map((v) => v.name);

expect(outputNames).not.toContain("");
expect(outputNames).toEqual(expect.arrayContaining(["trial_type", "rt", "response"]));
});

test("warns once, not once per row, when an unnamed column is dropped", async () => {
const csv = [
",trial_type,rt",
"1,jsPsych-html-keyboard-response,450",
"2,jsPsych-html-keyboard-response,512",
"3,jsPsych-html-keyboard-response,389",
"4,jsPsych-html-keyboard-response,401",
].join("\n");

await jsPsychMetadata.generate(csv, {}, "csv");

const dropWarnings = warnSpy.mock.calls.filter((args) =>
String(args[0]).includes("unnamed column")
);
expect(dropWarnings).toHaveLength(1);
});

test("well-formed CSV with no unnamed columns does not warn", async () => {
const csv = [
"trial_type,rt",
"jsPsych-html-keyboard-response,450",
"jsPsych-html-keyboard-response,512",
].join("\n");

await jsPsychMetadata.generate(csv, {}, "csv");

const dropWarnings = warnSpy.mock.calls.filter((args) =>
String(args[0]).includes("unnamed column")
);
expect(dropWarnings).toHaveLength(0);
});
});
Loading
Loading