Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .changeset/more-stress-tests.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
"@jspsych/metadata": patch
"@jspsych/metadata-cli": patch
---

Extend the stress-test regression guards with three more Jest suites covering the CSV ingestion path, generation at scale, and cross-file output-name collisions.

- `@jspsych/metadata` — `csv-input.stress`: pins how `generate(data, {}, "csv")` re-infers types from string cells (numeric coercion incl. whitespace/scientific-notation/`Infinity`/`NaN` rejection, mixed-column downgrade, `"true"`/`"false"` staying categorical, RFC-4180 quoting, unicode, empty/literal-`null` cells, the 50-char level cap, JSON-in-a-cell extraction), and asserts CSV/JSON parity for unambiguously-typed columns.
- `@jspsych/metadata` — `scale.stress`: feeds a 5,000-row dataset and checks exact numeric extremes, categorical dedup, high-cardinality level accumulation, boolean handling, and a throughput ceiling that guards against accidental O(n²) regressions.
- `@jspsych/metadata-cli` — `array-collision.stress`: two same-stem files in different subdirectories sharing a nested array column, asserting `processDirectory` disambiguates every main CSV, sidecar, and preserved raw original (no overwrites, all still Psych-DS compliant) — the cross-file collision gap left by the earlier rename suite.

Test-only change; no library or CLI behavior is modified.
131 changes: 131 additions & 0 deletions packages/cli/tests/array-collision.stress.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import fs from "fs";
import os from "os";
import path from "path";
import JsPsychMetadata from "@jspsych/metadata";
import {
isValidPsychDSDataFilename,
} from "@jspsych/metadata";
import { processDirectory } from "../src/data";

/**
* Stress regression guard for cross-file output-name collisions — the coverage gap left by the
* original rename suite. Two source files in different subdirectories share the same stem
* ("subject-001") AND the same nested array column ("mouse"), so without the run-wide
* disambiguation sets every one of {main CSV, preserved raw JSON, array sidecar} would collide and
* silently overwrite its twin. This asserts that processDirectory threads `usedArrayFilenames` /
* `usedRawFilenames` across files: every output lands under a distinct, still-Psych-DS-compliant
* name, nothing is overwritten, and the union of CSV columns still round-trips against
* variableMeasured.
*/

// Splits one CSV header line into its column names, RFC-4180 style. Inside a quoted field a
// comma is literal text and a doubled quote ("") stands for a single quote character; the quotes
// that delimit a field are not part of its value. Always returns at least one entry ([""] for an
// empty line).
function parseHeader(line: string): string[] {
  const fields: string[] = [];
  let field = "";
  let quoted = false;
  let pos = 0;
  while (pos < line.length) {
    const ch = line.charAt(pos);
    pos += 1;
    if (quoted) {
      if (ch !== '"') {
        field += ch;
      } else if (line.charAt(pos) === '"') {
        // Doubled quote inside a quoted field: emit one quote and consume both characters.
        field += '"';
        pos += 1;
      } else {
        quoted = false;
      }
      continue;
    }
    switch (ch) {
      case '"':
        quoted = true;
        break;
      case ",":
        fields.push(field);
        field = "";
        break;
      default:
        field += ch;
    }
  }
  // The last field has no trailing comma, so flush it explicitly.
  fields.push(field);
  return fields;
}

// Builds the two trials written into one source file. Each trial carries a nested "mouse" array
// of {x, y} points, which this suite expects to come out as its own sidecar CSV. `seed` is added
// to every `rt` and every mouse `x`, so files built from different seeds never share content and
// an accidental overwrite shows up as a content mismatch.
function makeTrials(seed: number) {
  const trial = (index: number, mouse: { x: number; y: number }[]) => {
    const elapsed = (index + 1) * 100;
    return {
      trial_type: "html-keyboard-response",
      trial_index: index,
      time_elapsed: elapsed,
      rt: elapsed + seed,
      mouse,
    };
  };
  return [
    trial(0, [{ x: seed, y: 1 }, { x: seed + 1, y: 2 }]),
    trial(1, [{ x: seed + 2, y: 3 }]),
  ];
}

// Runs processDirectory once over two same-named inputs (input/a and input/b) and then inspects
// what landed in data/ and data/raw/. Every test reads the results captured by beforeAll.
describe("cross-file output-name collision (stress)", () => {
  let projectDir: string;
  let dataDir: string;
  let total: number;
  let failed: number;
  let csvs: string[];
  let rawFiles: string[];

  beforeAll(async () => {
    // Keep the test output quiet; both spies are undone in afterAll.
    jest.spyOn(console, "warn").mockImplementation(() => {});
    jest.spyOn(console, "log").mockImplementation(() => {});

    projectDir = fs.mkdtempSync(path.join(os.tmpdir(), "stress-collision-"));
    const inputDir = path.join(projectDir, "input");
    dataDir = path.join(projectDir, "data");
    fs.mkdirSync(path.join(inputDir, "a"), { recursive: true });
    fs.mkdirSync(path.join(inputDir, "b"), { recursive: true });
    fs.mkdirSync(dataDir, { recursive: true });

    // Same filename, same nested column, different subdirectory -> guaranteed three-way collision.
    fs.writeFileSync(path.join(inputDir, "a", "subject-001.json"), JSON.stringify(makeTrials(0)));
    fs.writeFileSync(path.join(inputDir, "b", "subject-001.json"), JSON.stringify(makeTrials(10)));

    const metadata = new JsPsychMetadata();
    metadata.setMetadataField("name", "collision-stress");
    ({ total, failed } = await processDirectory(metadata, inputDir, false, dataDir));
    // Persist the generated metadata so the variableMeasured test reads it back from disk.
    fs.writeFileSync(
      path.join(projectDir, "dataset_description.json"),
      JSON.stringify(metadata.getMetadata(), null, 2),
    );
    csvs = fs.readdirSync(dataDir).filter((f) => f.endsWith(".csv"));
    // A missing raw/ directory becomes an empty list so the raw-file tests fail on their own
    // assertions rather than on readdirSync.
    const rawDir = path.join(dataDir, "raw");
    rawFiles = fs.existsSync(rawDir) ? fs.readdirSync(rawDir) : [];
  }, 120_000);

  afterAll(() => {
    jest.restoreAllMocks();
    // projectDir is still unset when setup failed before mkdtempSync returned; calling rmSync
    // with undefined would throw a TypeError on top of the real setup failure.
    if (projectDir) fs.rmSync(projectDir, { recursive: true, force: true });
  });

  test("processes both files with no failures", () => {
    expect(total).toBe(2);
    expect(failed).toBe(0);
  });

  test("writes two distinct main CSVs instead of overwriting one", () => {
    // Main CSVs are the ones without a "measure-" segment in their name.
    const mains = csvs.filter((f) => !f.includes("measure-")).sort();
    expect(mains).toEqual(["subject-0012_data.csv", "subject-001_data.csv"]);
  });

  test("writes two distinct mouse sidecars instead of overwriting one", () => {
    const sidecars = csvs.filter((f) => f.includes("measure-mouse")).sort();
    expect(sidecars).toEqual(["subject-001_measure-mouse2_data.csv", "subject-001_measure-mouse_data.csv"]);
  });

  test("preserves both originals under data/raw/ under distinct names", () => {
    expect(rawFiles.filter((f) => f.endsWith(".json")).sort()).toEqual(["subject-001.json", "subject-0012.json"]);
  });

  test("every written CSV name is unique and Psych-DS compliant", () => {
    expect(new Set(csvs).size).toBe(csvs.length); // no two outputs share a name
    expect(csvs.length).toBe(4); // 2 mains + 2 sidecars
    for (const name of csvs) expect(isValidPsychDSDataFilename(name)).toBe(true);
  });

  test("no original's content was clobbered (each raw file matches one of the two inputs)", () => {
    const contents = rawFiles
      .filter((f) => f.endsWith(".json"))
      .map((f) => fs.readFileSync(path.join(dataDir, "raw", f), "utf8"));
    const inputs = [JSON.stringify(makeTrials(0)), JSON.stringify(makeTrials(10))];
    // Exact match after sorting: both inputs must be present AND no raw file may hold anything
    // else, which is what the test title promises (arrayContaining alone only checks the former).
    expect([...contents].sort()).toEqual([...inputs].sort());
  });

  test("every variableMeasured name is a column across the written CSVs", () => {
    // Union of the header columns of every CSV written to data/.
    const allColumns = new Set<string>();
    for (const csv of csvs) {
      const firstLine = fs.readFileSync(path.join(dataDir, csv), "utf8").split(/\r?\n/)[0];
      parseHeader(firstLine).forEach((c) => allColumns.add(c));
    }
    const meta = JSON.parse(fs.readFileSync(path.join(projectDir, "dataset_description.json"), "utf8"));
    // Entries may be bare names or objects with a `name` field; normalise to names.
    const varNames = (meta.variableMeasured ?? []).map((v: any) => (typeof v === "string" ? v : v.name));
    const missing = varNames.filter((n: string) => !allColumns.has(n));
    expect(missing).toEqual([]);
  });
});
183 changes: 183 additions & 0 deletions packages/metadata/tests/csv-input.stress.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
import JsPsychMetadata from "../src/index";

/**
* Stress regression guard for the CSV ingestion path (generate(data, {}, "csv")).
*
* Where nested-generation.stress.test.ts feeds richly-typed JSON, this suite feeds CSV — where
* every cell arrives as a *string* — and pins how generateObservation re-infers types from those
* strings: numeric coercion (incl. whitespace, scientific notation, Infinity/NaN rejection),
* mixed-column downgrade, "true"/"false" staying categorical (post-#90), RFC-4180 quoting
* (embedded commas / quotes / newlines), unicode, empty / literal-"null" cells, the 50-char level
* cap, and JSON-in-a-cell extraction. A final case asserts that CSV and the equivalent JSON agree on
* type, range and levels for numeric and plain-string columns (booleans and nulls legitimately differ).
*/

// Plugin descriptions come from unpkg; stub fetch so the suite is offline-deterministic. Nothing
// asserted here (types / levels / ranges) depends on the human-readable descriptions.
// Every call resolves to the same `{ ok: false, status: 404 }` value, i.e. a "not found" response.
// NOTE(review): presumably the library only reads `ok` / `status` on a failed response — confirm
// against the fetch call site before relying on this stub for new assertions.
const mockFetch = jest.fn().mockResolvedValue({ ok: false, status: 404 });

/**
 * Minimal RFC-4180 serializer: a field is quoted iff it contains a comma, quote, CR or LF, and
 * quotes inside a quoted field are doubled. The same rule is applied to header cells and data
 * cells, so a header name containing a delimiter still yields a parseable header line.
 *
 * @param headers Column names, in output order; they also select which keys are read from each row.
 * @param rows One record per data line; a key missing from a row is written as an empty cell.
 * @returns The CSV text with "\n" line separators and no trailing newline.
 */
function toCSV(headers: string[], rows: Record<string, string>[]): string {
  const enc = (v: string) => (/[",\r\n]/.test(v) ? `"${v.replace(/"/g, '""')}"` : v);
  const toLine = (cells: string[]) => cells.map(enc).join(",");
  const lines = [toLine(headers)];
  for (const row of rows) lines.push(toLine(headers.map((h) => row[h] ?? "")));
  return lines.join("\n");
}

const LONG = "x".repeat(80); // > MAX_LENGTH (50) so it must be truncated to first-50 + "..."

// Fixture table, one entry per column: [header, [cell for row 0, row 1, row 2]]. HEADERS and ROWS
// below are derived from it, so each column's three values sit next to each other.
// Three observations. Every row carries trial_type so no column is dropped by the trial_type-less
// behavior pinned in nested-generation.stress.test.ts (findings F1a/F1b).
const COLUMNS: ReadonlyArray<readonly [string, readonly [string, string, string]]> = [
  ["trial_type", ["html-keyboard-response", "html-keyboard-response", "html-keyboard-response"]],
  ["trial_index", ["0", "1", "2"]],
  // Numeric-looking strings.
  ["int_col", ["42", "7", "100"]],
  ["float_col", ["1.5", "2.25", "0.5"]],
  ["ws_num", [" 10 ", " 20 ", " 30 "]],
  ["sci_num", ["1e3", "2e3", "1.5e3"]],
  ["neg_num", ["-5", "-1", "-10"]],
  // Strings that look numeric / boolean but are asserted to stay categorical.
  ["inf_col", ["Infinity", "Infinity", "Infinity"]],
  ["nan_col", ["NaN", "NaN", "NaN"]],
  ["bool_str", ["TRUE", "FALSE", "true"]],
  ["mixed_col", ["10", "oops", "3"]],
  // Cells that force RFC-4180 quoting, plus non-ASCII text.
  ["quoted_comma", ["a,b", "c,d", "e,f"]],
  ["quoted_newline", ["line1\nline2", "x\ny", "p\nq"]],
  ["quoted_quote", ['say "hi"', 'a""b', "plain"]],
  ["unicode_col", ["café", "日本語", "emoji👋"]],
  // No-value cells and the over-long level.
  ["empty_col", ["", "", ""]],
  ["null_word_col", ["null", "null", "null"]],
  ["long_level_col", [LONG, "short", "short"]],
  // JSON embedded in a cell.
  ["json_obj_col", ['{"a": 1, "b": "x"}', '{"a": 9, "b": "y"}', '{"a": 50, "b": "z"}']],
  ["json_arr_col", ["[1, 2, 3]", "[4, 5]", "[6]"]],
];

const HEADERS = COLUMNS.map(([name]) => name);

// Pivot the column table into one record per observation, keys in header order.
const ROWS: Record<string, string>[] = ([0, 1, 2] as const).map((rowIndex) => {
  const row: Record<string, string> = {};
  for (const [name, cells] of COLUMNS) row[name] = cells[rowIndex];
  return row;
});

// Generates metadata once from the CSV fixture (HEADERS / ROWS) and then checks, column by
// column, the type / range / levels recorded in variableMeasured.
describe("CSV ingestion type-inference (stress)", () => {
  let vars: Map<string, any>;
  let metadata: JsPsychMetadata;
  // Whatever global.fetch was before this suite replaced it; put back in afterAll.
  let originalFetch: unknown;

  beforeAll(async () => {
    originalFetch = (global as any).fetch;
    (global as any).fetch = mockFetch;
    jest.spyOn(console, "warn").mockImplementation(() => {});
    metadata = new JsPsychMetadata();
    await metadata.generate(toCSV(HEADERS, ROWS), {}, "csv");
    // Index the generated variables by name for direct lookup in the tests below.
    vars = new Map(metadata.getMetadata().variableMeasured.map((v: any) => [v.name, v]));
  });

  afterAll(() => {
    jest.restoreAllMocks();
    // restoreAllMocks only undoes the spies; the fetch stub was a plain assignment and has to be
    // reverted by hand so it does not leak past this suite.
    (global as any).fetch = originalFetch;
  });

  test("coerces integers, floats, scientific notation and negatives to numeric ranges", () => {
    expect(vars.get("int_col")).toMatchObject({ value: "number", minValue: 7, maxValue: 100 });
    expect(vars.get("float_col")).toMatchObject({ value: "number", minValue: 0.5, maxValue: 2.25 });
    expect(vars.get("sci_num")).toMatchObject({ value: "number", minValue: 1000, maxValue: 2000 });
    expect(vars.get("neg_num")).toMatchObject({ value: "number", minValue: -10, maxValue: -1 });
    // No numeric column should carry levels.
    for (const n of ["int_col", "float_col", "sci_num", "neg_num"]) expect(vars.get(n).levels).toBeUndefined();
  });

  test("trims surrounding whitespace before the numeric test (Number(' 10 ') === 10)", () => {
    expect(vars.get("ws_num")).toMatchObject({ value: "number", minValue: 10, maxValue: 30 });
  });

  test("rejects Infinity / NaN as non-numeric and keeps them as string levels", () => {
    // Number.isFinite (not !isNaN) is the gate, so these never leak into a numeric range.
    expect(vars.get("inf_col").value).toBe("string");
    expect(vars.get("inf_col").minValue).toBeUndefined();
    expect(vars.get("inf_col").levels).toEqual(["Infinity"]);
    expect(vars.get("nan_col").value).toBe("string");
    expect(vars.get("nan_col").levels).toEqual(["NaN"]);
  });

  test('keeps "true"/"false" strings categorical (only genuine JSON booleans are boolean)', () => {
    const v = vars.get("bool_str");
    expect(v.value).toBe("string");
    expect(v.levels).toEqual(expect.arrayContaining(["TRUE", "FALSE", "true"]));
    expect(v.minValue).toBeUndefined();
  });

  test("downgrades a numeric-then-string column to categorical, preserving the numeric boundary as a level", () => {
    const v = vars.get("mixed_col");
    expect(v.value).toBe("string");
    expect(v.minValue).toBeUndefined();
    // "10" seen first (numeric boundary), then "oops", then "3".
    expect(v.levels).toEqual(["10", "oops", "3"]);
  });

  test("parses RFC-4180 quoted fields (embedded comma, quote, newline) without corruption", () => {
    expect(vars.get("quoted_comma").levels).toEqual(["a,b", "c,d", "e,f"]);
    expect(vars.get("quoted_newline").levels).toEqual(["line1\nline2", "x\ny", "p\nq"]);
    expect(vars.get("quoted_quote").levels).toEqual(['say "hi"', 'a""b', "plain"]);
  });

  test("preserves unicode in level strings", () => {
    expect(vars.get("unicode_col").levels).toEqual(["café", "日本語", "emoji👋"]);
  });

  test('treats empty cells and the literal string "null" as no-value (column stays "unknown")', () => {
    for (const n of ["empty_col", "null_word_col"]) {
      const v = vars.get(n);
      expect(v.value).toBe("unknown");
      expect(v.levels).toBeUndefined();
    }
  });

  test("caps an over-long level at 50 chars + ellipsis", () => {
    const v = vars.get("long_level_col");
    const truncated = "x".repeat(50) + "...";
    expect(v.levels).toEqual(expect.arrayContaining([truncated, "short"]));
    expect(v.levels).not.toContain(LONG); // the full 80-char string is never stored
  });

  test("parses a JSON object / array embedded in a CSV cell and extracts its sub-columns", () => {
    expect(vars.get("json_obj_col").value).toBe("object");
    // Object keys surface as dotted sub-variables.
    expect(vars.get("json_obj_col.a")).toMatchObject({ value: "number", minValue: 1, maxValue: 50 });
    expect(vars.get("json_obj_col.b").value).toBe("string");
    expect(vars.get("json_arr_col").value).toBe("array");
    const arrays = metadata.getExtractedArrays();
    expect(arrays.has("json_arr_col")).toBe(true);
  });
});

// Feeds the same three observations through the CSV path and the JSON path and compares what
// each one records for the `num` and `word` columns.
describe("CSV / JSON parity for unambiguously-typed columns (stress)", () => {
  // Booleans and nulls intentionally differ between the two formats (a CSV "true" is a string
  // level; a JSON true is a boolean), so this parity check is restricted to numeric and plain
  // string columns, where CSV coercion must reproduce exactly what native JSON typing produces.
  const headers = ["trial_type", "trial_index", "num", "word"];
  const rows = [
    { trial_type: "t", trial_index: "0", num: "5", word: "alpha" },
    { trial_type: "t", trial_index: "1", num: "9", word: "beta" },
    { trial_type: "t", trial_index: "2", num: "1", word: "alpha" },
  ];
  // Whatever global.fetch was before the stub was installed; put back after each test.
  let originalFetch: unknown;

  beforeEach(() => {
    originalFetch = (global as any).fetch;
    (global as any).fetch = mockFetch;
    jest.spyOn(console, "warn").mockImplementation(() => {});
  });

  // Teardown lives in a hook rather than at the end of the test body so the console spy and the
  // fetch stub are undone even when an assertion throws.
  afterEach(() => {
    jest.restoreAllMocks();
    (global as any).fetch = originalFetch;
  });

  test("CSV and the equivalent JSON yield identical type/range/levels for num & word", async () => {
    const fromCsv = new JsPsychMetadata();
    await fromCsv.generate(toCSV(headers, rows), {}, "csv");

    // Same observations with the numeric columns as real numbers, as native JSON would carry them.
    const json = rows.map((r) => ({ ...r, trial_index: Number(r.trial_index), num: Number(r.num) }));
    const fromJson = new JsPsychMetadata();
    await fromJson.generate(JSON.stringify(json), {}, "json");

    // Projects one variable down to the four fields under comparison.
    const pick = (m: JsPsychMetadata, name: string) => {
      const v = m.getMetadata().variableMeasured.find((x: any) => x.name === name);
      // Fail with the variable's name instead of an opaque "cannot read properties of undefined".
      if (!v) throw new Error(`variable "${name}" is missing from variableMeasured`);
      return { value: v.value, minValue: v.minValue, maxValue: v.maxValue, levels: v.levels };
    };
    expect(pick(fromCsv, "num")).toEqual(pick(fromJson, "num"));
    expect(pick(fromCsv, "word")).toEqual(pick(fromJson, "word"));
  });
});
Loading
Loading