jspsych · jodeleeuw · Jun 18, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 18, 2026
diff --git a/.changeset/more-stress-tests.md b/.changeset/more-stress-tests.md
@@ -0,0 +1,12 @@
+---
+"@jspsych/metadata": patch
+"@jspsych/metadata-cli": patch
+---
+
+Extend the stress-test regression guards with three more Jest suites covering the CSV ingestion path, generation at scale, and cross-file output-name collisions.
+
+- `@jspsych/metadata` — `csv-input.stress`: pins how `generate(data, {}, "csv")` re-infers types from string cells (numeric coercion incl. whitespace/scientific-notation/`Infinity`/`NaN` rejection, mixed-column downgrade, `"true"`/`"false"` staying categorical, RFC-4180 quoting, unicode, empty/literal-`null` cells, the 50-char level cap, JSON-in-a-cell extraction), and asserts CSV/JSON parity for unambiguously-typed columns.
+- `@jspsych/metadata` — `scale.stress`: feeds a 5,000-row dataset and checks exact numeric extremes, categorical dedup, high-cardinality level accumulation, boolean handling, and a throughput ceiling that guards against accidental O(n²) regressions.
+- `@jspsych/metadata-cli` — `array-collision.stress`: processes two same-stem files in different subdirectories that share a nested array column, and asserts that `processDirectory` disambiguates every main CSV, sidecar, and preserved raw original (no overwrites, all still Psych-DS compliant). This closes the cross-file collision gap left by the earlier rename suite.
+
+Test-only change; no library or CLI behavior is modified.
diff --git a/packages/cli/tests/array-collision.stress.test.ts b/packages/cli/tests/array-collision.stress.test.ts
@@ -0,0 +1,131 @@
+import fs from "fs";
+import os from "os";
+import path from "path";
+import JsPsychMetadata from "@jspsych/metadata";
+import {
+  isValidPsychDSDataFilename,
+} from "@jspsych/metadata";
+import { processDirectory } from "../src/data";
+
+/**
+ * Stress regression guard for cross-file output-name collisions — the coverage gap left by the
+ * original rename suite. Two source files in different subdirectories share the same stem
+ * ("subject-001") AND the same nested array column ("mouse"), so without the run-wide
+ * disambiguation sets every one of {main CSV, preserved raw JSON, array sidecar} would collide and
+ * silently overwrite its twin. This asserts that processDirectory threads `usedArrayFilenames` /
+ * `usedRawFilenames` across files: every output lands under a distinct, still-Psych-DS-compliant
+ * name, nothing is overwritten, and the union of CSV columns still round-trips against
+ * variableMeasured.
+ */
+
+/** Splits one CSV header line into column names, honouring RFC-4180 quoting ("" = literal quote). */
+function parseHeader(line: string): string[] {
+  const fields: string[] = [];
+  let field = "";
+  let quoted = false;
+  for (let pos = 0; pos < line.length; pos += 1) {
+    const ch = line.charAt(pos);
+    if (ch === '"') {
+      // Inside quotes a doubled quote is an escaped literal; any other quote toggles the mode.
+      if (quoted && line.charAt(pos + 1) === '"') { field += '"'; pos += 1; }
+      else quoted = !quoted;
+    } else if (ch === "," && !quoted) { fields.push(field); field = ""; }
+    else field += ch;
+  }
+  fields.push(field);
+  return fields;
+}
+
+// Builds the trials for one source file. Every trial carries a nested array-of-objects "mouse"
+// column (written out as its own sidecar CSV); `seed` offsets rt and x so the two files differ
+// and an accidental overwrite shows up as changed content rather than hiding behind equal data.
+function makeTrials(seed: number) {
+  const trial = (trial_index: number, mouse: { x: number; y: number }[]) =>
+    ({ trial_type: "html-keyboard-response", trial_index, time_elapsed: 100 * (trial_index + 1), rt: 100 * (trial_index + 1) + seed, mouse });
+  const first = trial(0, [{ x: seed, y: 1 }, { x: seed + 1, y: 2 }]);
+  return [first, trial(1, [{ x: seed + 2, y: 3 }])];
+}
+
+describe("cross-file output-name collision (stress)", () => {
+  let projectDir: string; // temp project root; still unset if beforeAll throws before mkdtempSync
+  let dataDir: string; // output directory handed to processDirectory
+  let total: number; // `total` / `failed` are destructured from processDirectory's result
+  let failed: number;
+  let csvs: string[]; // *.csv names found directly in dataDir after the run
+  let rawFiles: string[]; // entries of dataDir/raw ([] when that directory is absent)
+
+  beforeAll(async () => {
+    jest.spyOn(console, "warn").mockImplementation(() => {});
+    jest.spyOn(console, "log").mockImplementation(() => {});
+
+    projectDir = fs.mkdtempSync(path.join(os.tmpdir(), "stress-collision-"));
+    const inputDir = path.join(projectDir, "input");
+    dataDir = path.join(projectDir, "data");
+    fs.mkdirSync(path.join(inputDir, "a"), { recursive: true });
+    fs.mkdirSync(path.join(inputDir, "b"), { recursive: true });
+    fs.mkdirSync(dataDir, { recursive: true });
+
+    // Same filename, same nested column, different subdirectory -> guaranteed three-way collision.
+    fs.writeFileSync(path.join(inputDir, "a", "subject-001.json"), JSON.stringify(makeTrials(0)));
+    fs.writeFileSync(path.join(inputDir, "b", "subject-001.json"), JSON.stringify(makeTrials(10)));
+
+    const metadata = new JsPsychMetadata();
+    metadata.setMetadataField("name", "collision-stress");
+    ({ total, failed } = await processDirectory(metadata, inputDir, false, dataDir));
+    fs.writeFileSync(
+      path.join(projectDir, "dataset_description.json"),
+      JSON.stringify(metadata.getMetadata(), null, 2),
+    );
+    csvs = fs.readdirSync(dataDir).filter((f) => f.endsWith(".csv"));
+    rawFiles = fs.existsSync(path.join(dataDir, "raw")) ? fs.readdirSync(path.join(dataDir, "raw")) : [];
+  }, 120_000);
+
+  afterAll(() => {
+    jest.restoreAllMocks();
+    if (projectDir) fs.rmSync(projectDir, { recursive: true, force: true }); // skip when setup never created the dir
+  });
+
+  test("processes both files with no failures", () => {
+    expect(total).toBe(2);
+    expect(failed).toBe(0);
+  });
+
+  test("writes two distinct main CSVs instead of overwriting one", () => {
+    const mains = csvs.filter((f) => !f.includes("measure-")).sort();
+    expect(mains).toEqual(["subject-0012_data.csv", "subject-001_data.csv"]);
+  });
+
+  test("writes two distinct mouse sidecars instead of overwriting one", () => {
+    const sidecars = csvs.filter((f) => f.includes("measure-mouse")).sort();
+    expect(sidecars).toEqual(["subject-001_measure-mouse2_data.csv", "subject-001_measure-mouse_data.csv"]);
+  });
+
+  test("preserves both originals under data/raw/ under distinct names", () => {
+    expect(rawFiles.filter((f) => f.endsWith(".json")).sort()).toEqual(["subject-001.json", "subject-0012.json"]);
+  });
+
+  test("every written CSV name is unique and Psych-DS compliant", () => {
+    expect(new Set(csvs).size).toBe(csvs.length); // no two outputs share a name
+    expect(csvs.length).toBe(4); // 2 mains + 2 sidecars
+    for (const name of csvs) expect(isValidPsychDSDataFilename(name)).toBe(true);
+  });
+
+  test("no original's content was clobbered (each raw file matches one of the two inputs)", () => {
+    const contents = rawFiles
+      .filter((f) => f.endsWith(".json"))
+      .map((f) => fs.readFileSync(path.join(dataDir, "raw", f), "utf8"));
+    expect(contents).toEqual(expect.arrayContaining([JSON.stringify(makeTrials(0)), JSON.stringify(makeTrials(10))]));
+  });
+
+  test("every variableMeasured name is a column across the written CSVs", () => {
+    const allColumns = new Set<string>();
+    for (const csv of csvs) {
+      const firstLine = fs.readFileSync(path.join(dataDir, csv), "utf8").split(/\r?\n/)[0];
+      parseHeader(firstLine).forEach((c) => allColumns.add(c));
+    }
+    const meta = JSON.parse(fs.readFileSync(path.join(projectDir, "dataset_description.json"), "utf8"));
+    const varNames = (meta.variableMeasured ?? []).map((v: any) => (typeof v === "string" ? v : v.name));
+    const missing = varNames.filter((n: string) => !allColumns.has(n));
+    expect(missing).toEqual([]);
+  });
+});
diff --git a/packages/metadata/tests/csv-input.stress.test.ts b/packages/metadata/tests/csv-input.stress.test.ts
@@ -0,0 +1,183 @@
+import JsPsychMetadata from "../src/index";
+
+/**
+ * Stress regression guard for the CSV ingestion path (generate(data, {}, "csv")).
+ *
+ * Where nested-generation.stress.test.ts feeds richly-typed JSON, this suite feeds CSV — where
+ * every cell arrives as a *string* — and pins how generateObservation re-infers types from those
+ * strings: numeric coercion (incl. whitespace, scientific notation, Infinity/NaN rejection),
+ * mixed-column downgrade, "true"/"false" staying categorical (post-#90), RFC-4180 quoting
+ * (embedded commas / quotes / newlines), unicode, empty / literal-"null" cells, the 50-char level
+ * cap, and JSON-in-a-cell extraction. A final case asserts CSV and the equivalent JSON agree on
+ * type for the columns where they should.
+ */
+
+// Plugin descriptions come from unpkg; stub fetch (every call resolves to { ok: false, status: 404 }) so the suite is
+// offline-deterministic. Nothing asserted here (types / levels / ranges) depends on the human-readable descriptions.
+const mockFetch = jest.fn().mockResolvedValue({ ok: false, status: 404 });
+
+/** Minimal RFC-4180 serializer: quote a header or cell iff it contains a comma, quote, CR or LF; a missing cell becomes "". */
+function toCSV(headers: string[], rows: Record<string, string>[]): string {
+  const enc = (v: string) => (/[",\r\n]/.test(v) ? `"${v.replace(/"/g, '""')}"` : v);
+  // Header names go through the same escaping as cells, so a name like "a,b" stays a single column.
+  const toLine = (cells: string[]) => cells.map(enc).join(",");
+  return [toLine(headers), ...rows.map((row) => toLine(headers.map((h) => row[h] ?? "")))].join("\n");
+}
+
+const LONG = "x".repeat(80); // 80 chars, i.e. > MAX_LENGTH (50), so it must be truncated to first-50 + "..."
+
+// Three observations: HEADERS fixes the column order and ROWS supplies every cell as a raw string. Every row carries
+// trial_type so no column is dropped by the trial_type-less behavior pinned in nested-generation.stress.test.ts (findings F1a/F1b).
+const HEADERS = [
+  "trial_type", "trial_index",
+  "int_col", "float_col", "ws_num", "sci_num", "neg_num",
+  "inf_col", "nan_col", "bool_str", "mixed_col",
+  "quoted_comma", "quoted_newline", "quoted_quote", "unicode_col",
+  "empty_col", "null_word_col", "long_level_col",
+  "json_obj_col", "json_arr_col",
+];
+const ROWS: Record<string, string>[] = [
+  {
+    trial_type: "html-keyboard-response", trial_index: "0",
+    int_col: "42", float_col: "1.5", ws_num: "  10  ", sci_num: "1e3", neg_num: "-5",
+    inf_col: "Infinity", nan_col: "NaN", bool_str: "TRUE", mixed_col: "10",
+    quoted_comma: "a,b", quoted_newline: "line1\nline2", quoted_quote: 'say "hi"', unicode_col: "café",
+    empty_col: "", null_word_col: "null", long_level_col: LONG,
+    json_obj_col: '{"a": 1, "b": "x"}', json_arr_col: "[1, 2, 3]",
+  },
+  {
+    trial_type: "html-keyboard-response", trial_index: "1",
+    int_col: "7", float_col: "2.25", ws_num: "  20  ", sci_num: "2e3", neg_num: "-1",
+    inf_col: "Infinity", nan_col: "NaN", bool_str: "FALSE", mixed_col: "oops",
+    quoted_comma: "c,d", quoted_newline: "x\ny", quoted_quote: 'a""b', unicode_col: "日本語",
+    empty_col: "", null_word_col: "null", long_level_col: "short",
+    json_obj_col: '{"a": 9, "b": "y"}', json_arr_col: "[4, 5]",
+  },
+  {
+    trial_type: "html-keyboard-response", trial_index: "2",
+    int_col: "100", float_col: "0.5", ws_num: "  30  ", sci_num: "1.5e3", neg_num: "-10",
+    inf_col: "Infinity", nan_col: "NaN", bool_str: "true", mixed_col: "3",
+    quoted_comma: "e,f", quoted_newline: "p\nq", quoted_quote: 'plain', unicode_col: "emoji👋",
+    empty_col: "", null_word_col: "null", long_level_col: "short",
+    json_obj_col: '{"a": 50, "b": "z"}', json_arr_col: "[6]",
+  },
+];
+
+describe("CSV ingestion type-inference (stress)", () => {
+  let vars: Map<string, any>; // variableMeasured entries keyed by their `name`
+  let metadata: JsPsychMetadata; // kept so a test can also query getExtractedArrays()
+
+  beforeAll(async () => {
+    (global as any).fetch = mockFetch; // NOTE(review): plain assignment — presumably not undone by jest.restoreAllMocks(); confirm
+    jest.spyOn(console, "warn").mockImplementation(() => {});
+    metadata = new JsPsychMetadata();
+    await metadata.generate(toCSV(HEADERS, ROWS), {}, "csv");
+    vars = new Map(metadata.getMetadata().variableMeasured.map((v: any) => [v.name, v]));
+  });
+
+  afterAll(() => jest.restoreAllMocks());
+
+  test("coerces integers, floats, scientific notation and negatives to numeric ranges", () => {
+    expect(vars.get("int_col")).toMatchObject({ value: "number", minValue: 7, maxValue: 100 });
+    expect(vars.get("float_col")).toMatchObject({ value: "number", minValue: 0.5, maxValue: 2.25 });
+    expect(vars.get("sci_num")).toMatchObject({ value: "number", minValue: 1000, maxValue: 2000 });
+    expect(vars.get("neg_num")).toMatchObject({ value: "number", minValue: -10, maxValue: -1 });
+    // No numeric column should carry levels.
+    for (const n of ["int_col", "float_col", "sci_num", "neg_num"]) expect(vars.get(n).levels).toBeUndefined();
+  });
+
+  test("trims surrounding whitespace before the numeric test (Number(' 10 ') === 10)", () => {
+    expect(vars.get("ws_num")).toMatchObject({ value: "number", minValue: 10, maxValue: 30 });
+  });
+
+  test("rejects Infinity / NaN as non-numeric and keeps them as string levels", () => {
+    // Number.isFinite (not !isNaN) is the gate, so these never leak into a numeric range.
+    expect(vars.get("inf_col").value).toBe("string");
+    expect(vars.get("inf_col").minValue).toBeUndefined();
+    expect(vars.get("inf_col").levels).toEqual(["Infinity"]);
+    expect(vars.get("nan_col").value).toBe("string");
+    expect(vars.get("nan_col").levels).toEqual(["NaN"]);
+  });
+
+  test('keeps "true"/"false" strings categorical (only genuine JSON booleans are boolean)', () => {
+    const v = vars.get("bool_str");
+    expect(v.value).toBe("string");
+    expect(v.levels).toEqual(expect.arrayContaining(["TRUE", "FALSE", "true"]));
+    expect(v.minValue).toBeUndefined();
+  });
+
+  test("downgrades a numeric-then-string column to categorical, preserving the numeric boundary as a level", () => {
+    const v = vars.get("mixed_col");
+    expect(v.value).toBe("string");
+    expect(v.minValue).toBeUndefined();
+    // "10" seen first (numeric boundary), then "oops", then "3".
+    expect(v.levels).toEqual(["10", "oops", "3"]);
+  });
+
+  test("parses RFC-4180 quoted fields (embedded comma, quote, newline) without corruption", () => {
+    expect(vars.get("quoted_comma").levels).toEqual(["a,b", "c,d", "e,f"]);
+    expect(vars.get("quoted_newline").levels).toEqual(["line1\nline2", "x\ny", "p\nq"]);
+    expect(vars.get("quoted_quote").levels).toEqual(['say "hi"', 'a""b', "plain"]);
+  });
+
+  test("preserves unicode in level strings", () => {
+    expect(vars.get("unicode_col").levels).toEqual(["café", "日本語", "emoji👋"]);
+  });
+
+  test('treats empty cells and the literal string "null" as no-value (column stays "unknown")', () => {
+    for (const n of ["empty_col", "null_word_col"]) {
+      const v = vars.get(n);
+      expect(v.value).toBe("unknown");
+      expect(v.levels).toBeUndefined();
+    }
+  });
+
+  test("caps an over-long level at 50 chars + ellipsis", () => {
+    const v = vars.get("long_level_col");
+    const truncated = "x".repeat(50) + "...";
+    expect(v.levels).toEqual(expect.arrayContaining([truncated, "short"]));
+    expect(v.levels).not.toContain(LONG); // the full 80-char string is never stored
+  });
+
+  test("parses a JSON object / array embedded in a CSV cell and extracts its sub-columns", () => {
+    expect(vars.get("json_obj_col").value).toBe("object");
+    expect(vars.get("json_obj_col.a")).toMatchObject({ value: "number", minValue: 1, maxValue: 50 });
+    expect(vars.get("json_obj_col.b").value).toBe("string");
+    expect(vars.get("json_arr_col").value).toBe("array");
+    const arrays = metadata.getExtractedArrays();
+    expect(arrays.has("json_arr_col")).toBe(true);
+  });
+});
+
+describe("CSV / JSON parity for unambiguously-typed columns (stress)", () => {
+  // Booleans and nulls intentionally differ between the two formats (a CSV "true" is a string
+  // level; a JSON true is a boolean), so this parity check is restricted to numeric and plain
+  // string columns, where CSV coercion must reproduce exactly what native JSON typing produces.
+  const headers = ["trial_type", "trial_index", "num", "word"];
+  const rows = [
+    { trial_type: "t", trial_index: "0", num: "5", word: "alpha" },
+    { trial_type: "t", trial_index: "1", num: "9", word: "beta" },
+    { trial_type: "t", trial_index: "2", num: "1", word: "alpha" },
+  ];
+
+  afterEach(() => jest.restoreAllMocks()); // hook, not an in-test call: still runs when an assertion throws
+
+  test("CSV and the equivalent JSON yield identical type/range/levels for num & word", async () => {
+    (global as any).fetch = mockFetch;
+    jest.spyOn(console, "warn").mockImplementation(() => {});
+
+    const fromCsv = new JsPsychMetadata();
+    await fromCsv.generate(toCSV(headers, rows), {}, "csv");
+
+    const json = rows.map((r) => ({ ...r, trial_index: Number(r.trial_index), num: Number(r.num) }));
+    const fromJson = new JsPsychMetadata();
+    await fromJson.generate(JSON.stringify(json), {}, "json");
+
+    const pick = (m: JsPsychMetadata, name: string) => {
+      const v = m.getMetadata().variableMeasured.find((x: any) => x.name === name);
+      return { value: v.value, minValue: v.minValue, maxValue: v.maxValue, levels: v.levels };
+    };
+    expect(pick(fromCsv, "num")).toEqual(pick(fromJson, "num"));
+    expect(pick(fromCsv, "word")).toEqual(pick(fromJson, "word"));
+  });
+});