From 19c34de7207312c965244db7280e3dc709e5666b Mon Sep 17 00:00:00 2001
From: Alex Ray <alray@nvidia.com>
Date: Tue, 9 Jun 2026 11:14:59 -0700
Subject: [PATCH 1/8] feat(studio): add client-side dataset quality checks on
 JSONL upload (ASTD-73)

Validates JSONL files on file select when purpose=dataset, surfacing errors
(UTF-16 encoding, invalid JSON lines, empty file) and warnings (unknown schema,
null/empty fields, entries exceeding ~8192 tokens) inline before upload.
Submit is blocked until errors are resolved.

Signed-off-by: Alex Ray <alray@nvidia.com>
---
 .../common/src/utils/datasetQuality.test.ts   | 220 ++++++++++++++++++
 .../common/src/utils/datasetQuality.ts        | 192 +++++++++++++++
 .../src/routes/FilesetNewRoute/index.spec.tsx | 167 +++++++++++++
 .../src/routes/FilesetNewRoute/index.tsx      | 140 +++++++++--
 4 files changed, 706 insertions(+), 13 deletions(-)
 create mode 100644 web/packages/common/src/utils/datasetQuality.test.ts
 create mode 100644 web/packages/common/src/utils/datasetQuality.ts

diff --git a/web/packages/common/src/utils/datasetQuality.test.ts b/web/packages/common/src/utils/datasetQuality.test.ts
new file mode 100644
index 0000000000..ae0ddb8803
--- /dev/null
+++ b/web/packages/common/src/utils/datasetQuality.test.ts
@@ -0,0 +1,230 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { checkDatasetQuality } from '@nemo/common/src/utils/datasetQuality';
+
+/**
+ * Builds a File whose `text()` and `arrayBuffer()` are stubbed, so each test controls exactly what
+ * the checker reads. `bytes` overrides the raw bytes (defaults to `content` encoded as UTF-8).
+ */
+function makeFile(content: string, name = 'train.jsonl', bytes?: Uint8Array): File {
+  const file = new File([content], name, { type: 'application/x-jsonlines' });
+  file.text = vi.fn().mockResolvedValue(content);
+  file.arrayBuffer = vi
+    .fn()
+    .mockResolvedValue(bytes ? bytes.buffer : new TextEncoder().encode(content).buffer);
+  return file;
+}
+
+/**
+ * Builds a file whose raw bytes are genuine UTF-16: a BOM followed by one 16-bit code unit per
+ * character, little-endian when `le` is true and big-endian otherwise.
+ */
+function utf16File(content: string, le = true): File {
+  const units = `\uFEFF${content}`;
+  const bytes = new Uint8Array(units.length * 2);
+  const view = new DataView(bytes.buffer);
+  for (let i = 0; i < units.length; i++) {
+    view.setUint16(i * 2, units.charCodeAt(i), le);
+  }
+  return makeFile(content, 'train.jsonl', bytes);
+}
+
+/** Serializes each row as one JSON document per line (no trailing newline). */
+function jsonlLines(rows: object[]): string {
+  return rows.map((r) => JSON.stringify(r)).join('\n');
+}
+
+describe('checkDatasetQuality', () => {
+  describe('encoding', () => {
+    it('returns INVALID_ENCODING error for UTF-16 LE', async () => {
+      const report = await checkDatasetQuality(utf16File('some content', true));
+      expect(report.hasErrors).toBe(true);
+      expect(report.issues[0].code).toBe('INVALID_ENCODING');
+      expect(report.issues[0].severity).toBe('error');
+    });
+
+    it('returns INVALID_ENCODING error for UTF-16 BE', async () => {
+      const report = await checkDatasetQuality(utf16File('some content', false));
+      expect(report.hasErrors).toBe(true);
+      expect(report.issues[0].code).toBe('INVALID_ENCODING');
+    });
+
+    it('does not flag UTF-8 files', async () => {
+      const file = makeFile(
+        jsonlLines([
+          {
+            messages: [
+              { role: 'user', content: 'hi' },
+              { role: 'assistant', content: 'hello' },
+            ],
+          },
+        ])
+      );
+      const report = await checkDatasetQuality(file);
+      expect(report.issues.find((i) => i.code === 'INVALID_ENCODING')).toBeUndefined();
+    });
+  });
+
+  describe('empty file', () => {
+    it('returns EMPTY_FILE error for blank file', async () => {
+      const report = await checkDatasetQuality(makeFile(''));
+      expect(report.hasErrors).toBe(true);
+      expect(report.issues[0].code).toBe('EMPTY_FILE');
+    });
+
+    it('returns EMPTY_FILE error for whitespace-only file', async () => {
+      const report = await checkDatasetQuality(makeFile('   \n\t\n  '));
+      expect(report.hasErrors).toBe(true);
+      expect(report.issues[0].code).toBe('EMPTY_FILE');
+    });
+  });
+
+  describe('JSON parsing', () => {
+    it('returns INVALID_JSON_LINES error for malformed lines', async () => {
+      const content = jsonlLines([{ prompt: 'q', completion: 'a' }]) + '\nnot-json\nalso bad';
+      const report = await checkDatasetQuality(makeFile(content));
+      const issue = report.issues.find((i) => i.code === 'INVALID_JSON_LINES');
+      expect(issue).toBeDefined();
+      expect(issue?.severity).toBe('error');
+      expect(issue?.count).toBe(2);
+      expect(issue?.affectedLines).toContain(2);
+      expect(issue?.affectedLines).toContain(3);
+    });
+
+    it('flags JSON arrays and scalars as invalid lines (not JSON objects)', async () => {
+      const content = '["not", "an", "object"]\n42';
+      const report = await checkDatasetQuality(makeFile(content));
+      const issue = report.issues.find((i) => i.code === 'INVALID_JSON_LINES');
+      expect(issue?.count).toBe(2);
+    });
+
+    it('has no parse error for valid JSONL', async () => {
+      const content = jsonlLines([
+        { prompt: 'q', completion: 'a' },
+        { prompt: 'q2', completion: 'a2' },
+      ]);
+      const report = await checkDatasetQuality(makeFile(content));
+      expect(report.issues.find((i) => i.code === 'INVALID_JSON_LINES')).toBeUndefined();
+    });
+  });
+
+  describe('schema detection', () => {
+    it('no warning for messages schema', async () => {
+      const row = {
+        messages: [
+          { role: 'user', content: 'hi' },
+          { role: 'assistant', content: 'hello' },
+        ],
+      };
+      const report = await checkDatasetQuality(makeFile(jsonlLines([row])));
+      expect(report.issues.find((i) => i.code === 'UNKNOWN_SCHEMA')).toBeUndefined();
+    });
+
+    it('no warning for prompt/completion schema', async () => {
+      const report = await checkDatasetQuality(
+        makeFile(jsonlLines([{ prompt: 'q', completion: 'a' }]))
+      );
+      expect(report.issues.find((i) => i.code === 'UNKNOWN_SCHEMA')).toBeUndefined();
+    });
+
+    it('no warning for question/ideal_response schema', async () => {
+      const report = await checkDatasetQuality(
+        makeFile(jsonlLines([{ question: 'q', ideal_response: 'a' }]))
+      );
+      expect(report.issues.find((i) => i.code === 'UNKNOWN_SCHEMA')).toBeUndefined();
+    });
+
+    it('returns UNKNOWN_SCHEMA warning for unrecognized fields', async () => {
+      // None of these keys match known schema patterns (no messages, prompt, completion, etc.)
+      const report = await checkDatasetQuality(
+        makeFile(jsonlLines([{ topic: 'foo', category: 'bar', label: 1 }]))
+      );
+      const issue = report.issues.find((i) => i.code === 'UNKNOWN_SCHEMA');
+      expect(issue).toBeDefined();
+      expect(issue?.severity).toBe('warning');
+    });
+  });
+
+  describe('null and empty fields', () => {
+    it('returns NULL_OR_EMPTY_FIELDS warning for null values', async () => {
+      const content = jsonlLines([
+        { prompt: 'q', completion: null },
+        { prompt: 'q2', completion: 'a2' },
+      ]);
+      const report = await checkDatasetQuality(makeFile(content));
+      const issue = report.issues.find((i) => i.code === 'NULL_OR_EMPTY_FIELDS');
+      expect(issue).toBeDefined();
+      expect(issue?.count).toBe(1);
+      expect(issue?.affectedLines).toContain(1);
+    });
+
+    it('returns NULL_OR_EMPTY_FIELDS warning for empty string values', async () => {
+      const content = jsonlLines([{ prompt: '', completion: 'a' }]);
+      const report = await checkDatasetQuality(makeFile(content));
+      expect(report.issues.find((i) => i.code === 'NULL_OR_EMPTY_FIELDS')).toBeDefined();
+    });
+
+    it('returns NULL_OR_EMPTY_FIELDS warning for empty arrays', async () => {
+      const content = jsonlLines([{ messages: [] }]);
+      const report = await checkDatasetQuality(makeFile(content));
+      expect(report.issues.find((i) => i.code === 'NULL_OR_EMPTY_FIELDS')).toBeDefined();
+    });
+
+    it('no warning when all fields are populated', async () => {
+      const content = jsonlLines([{ prompt: 'q', completion: 'a' }]);
+      const report = await checkDatasetQuality(makeFile(content));
+      expect(report.issues.find((i) => i.code === 'NULL_OR_EMPTY_FIELDS')).toBeUndefined();
+    });
+  });
+
+  describe('long entries', () => {
+    it('returns LONG_ENTRIES warning when a line exceeds 32768 chars', async () => {
+      const longValue = 'x'.repeat(33_000);
+      const content = jsonlLines([{ prompt: longValue, completion: 'a' }]);
+      const report = await checkDatasetQuality(makeFile(content));
+      const issue = report.issues.find((i) => i.code === 'LONG_ENTRIES');
+      expect(issue).toBeDefined();
+      expect(issue?.severity).toBe('warning');
+      expect(issue?.affectedLines).toContain(1);
+    });
+
+    it('no warning for normal-length lines', async () => {
+      const content = jsonlLines([{ prompt: 'short question', completion: 'short answer' }]);
+      const report = await checkDatasetQuality(makeFile(content));
+      expect(report.issues.find((i) => i.code === 'LONG_ENTRIES')).toBeUndefined();
+    });
+  });
+
+  describe('line scanning limit', () => {
+    it('scans only first 1000 lines for large files', async () => {
+      const rows = Array.from({ length: 1500 }, (_, i) => ({
+        prompt: `q${i}`,
+        completion: `a${i}`,
+      }));
+      const content = jsonlLines(rows);
+      const report = await checkDatasetQuality(makeFile(content));
+      expect(report.totalLines).toBe(1500);
+      expect(report.scannedLines).toBe(1000);
+    });
+  });
+
+  describe('report structure', () => {
+    it('sets hasErrors and hasWarnings correctly', async () => {
+      const content = jsonlLines([{ prompt: 'q', completion: 'a' }]);
+      const report = await checkDatasetQuality(makeFile(content));
+      expect(report.hasErrors).toBe(false);
+      expect(report.hasWarnings).toBe(false);
+      expect(report.fileName).toBe('train.jsonl');
+    });
+
+    it('caps affectedLines at 10 entries', async () => {
+      const rows = Array.from({ length: 15 }, () => ({ prompt: '', completion: '' }));
+      const content = jsonlLines(rows);
+      const report = await checkDatasetQuality(makeFile(content));
+      const issue = report.issues.find((i) => i.code === 'NULL_OR_EMPTY_FIELDS');
+      expect(issue?.affectedLines).toHaveLength(10);
+      expect(issue?.count).toBe(15);
+    });
+  });
+});
diff --git a/web/packages/common/src/utils/datasetQuality.ts b/web/packages/common/src/utils/datasetQuality.ts
new file mode 100644
index 0000000000..7531d3440a
--- /dev/null
+++ b/web/packages/common/src/utils/datasetQuality.ts
@@ -0,0 +1,213 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { findMessagesArray } from '@nemo/common/src/utils/file';
+
+export type DatasetQualityCode =
+  | 'EMPTY_FILE'
+  | 'INVALID_ENCODING'
+  | 'INVALID_JSON_LINES'
+  | 'UNKNOWN_SCHEMA'
+  | 'NULL_OR_EMPTY_FIELDS'
+  | 'LONG_ENTRIES';
+
+export interface DatasetQualityIssue {
+  severity: 'error' | 'warning';
+  code: DatasetQualityCode;
+  message: string;
+  /** 1-based line numbers in the file that are affected (first 10 only) */
+  affectedLines?: number[];
+  /** Total count of affected lines */
+  count?: number;
+}
+
+export interface DatasetQualityReport {
+  fileName: string;
+  hasErrors: boolean;
+  hasWarnings: boolean;
+  issues: DatasetQualityIssue[];
+  /** Number of non-blank lines actually scanned (may be less than totalLines for large files) */
+  scannedLines: number;
+  /** Number of non-blank lines in the file */
+  totalLines: number;
+}
+
+const MAX_SCAN_LINES = 1000;
+
+/** Cap on the line numbers listed per issue; `count` still carries the full total. */
+const MAX_AFFECTED_LINES = 10;
+
+/** ~8192 tokens at ~4 chars/token */
+const LONG_ENTRY_CHAR_THRESHOLD = 32_768;
+
+const PROMPT_KEYS = ['prompt', 'question'];
+const COMPLETION_KEYS = ['completion', 'ideal_response', 'response', 'output', 'answer'];
+
+/** A non-blank line of the file paired with its 1-based position in that file. */
+interface NumberedLine {
+  lineNum: number;
+  content: string;
+}
+
+/** Formats a count with its noun, appending "s" unless the count is exactly 1. */
+function plural(n: number, word: string): string {
+  return `${n} ${word}${n === 1 ? '' : 's'}`;
+}
+
+/**
+ * Runs dataset quality checks on a JSONL file and returns a structured report.
+ * Errors indicate the file should not be uploaded as-is; warnings are advisory.
+ *
+ * Checks performed:
+ * - UTF-16 BOM detection (error)
+ * - Empty file (error)
+ * - Invalid JSON on any line (error)
+ * - Unknown fine-tuning schema — no messages or prompt/completion fields (warning)
+ * - Null or empty field values (warning)
+ * - Lines exceeding estimated context window (~8192 tokens) (warning)
+ *
+ * Blank lines are skipped and do not count towards `totalLines` / `scannedLines`, but every
+ * reported line number is the line's real 1-based position in the file, blank lines included,
+ * so it can be looked up directly in an editor.
+ *
+ * For files with more than 1000 non-blank lines, only the first 1000 are scanned.
+ *
+ * @param file - The selected JSONL file.
+ * @returns The report; `hasErrors` / `hasWarnings` summarise the severities found in `issues`.
+ */
+export async function checkDatasetQuality(file: File): Promise<DatasetQualityReport> {
+  const issues: DatasetQualityIssue[] = [];
+
+  // 1. Encoding — detect UTF-16 via BOM before reading as text
+  const buffer = await file.arrayBuffer();
+  const bytes = new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
+  const isUtf16Le = bytes[0] === 0xff && bytes[1] === 0xfe;
+  const isUtf16Be = bytes[0] === 0xfe && bytes[1] === 0xff;
+  if (isUtf16Le || isUtf16Be) {
+    issues.push({
+      severity: 'error',
+      code: 'INVALID_ENCODING',
+      message: 'File is UTF-16 encoded. Re-save as UTF-8 before uploading.',
+    });
+    return {
+      fileName: file.name,
+      hasErrors: true,
+      hasWarnings: false,
+      issues,
+      scannedLines: 0,
+      totalLines: 0,
+    };
+  }
+
+  // 2. Read text and keep the non-blank lines, remembering where each one sits in the file.
+  //    Numbering must happen before filtering: numbering the filtered list would shift every
+  //    reported line number down by the count of blank lines preceding it.
+  const text = await file.text();
+  const allLines: NumberedLine[] = [];
+  text.split(/\r?\n/).forEach((content, idx) => {
+    if (content.trim().length > 0) allLines.push({ lineNum: idx + 1, content });
+  });
+  const totalLines = allLines.length;
+
+  if (totalLines === 0) {
+    issues.push({
+      severity: 'error',
+      code: 'EMPTY_FILE',
+      message: 'File is empty or contains only whitespace.',
+    });
+    return {
+      fileName: file.name,
+      hasErrors: true,
+      hasWarnings: false,
+      issues,
+      scannedLines: 0,
+      totalLines: 0,
+    };
+  }
+
+  const scanLines = allLines.slice(0, MAX_SCAN_LINES);
+  const scannedLines = scanLines.length;
+
+  // 3. Parse each line — collect invalid and valid rows separately
+  const invalidLineNums: number[] = [];
+  const parsedRows: Array<{ lineNum: number; row: Record<string, unknown> }> = [];
+
+  for (const { lineNum, content } of scanLines) {
+    try {
+      const parsed: unknown = JSON.parse(content);
+      if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
+        parsedRows.push({ lineNum, row: parsed as Record<string, unknown> });
+      } else {
+        // Valid JSON but not an object (array, scalar, null) — not a valid JSONL dataset row
+        invalidLineNums.push(lineNum);
+      }
+    } catch {
+      invalidLineNums.push(lineNum);
+    }
+  }
+
+  if (invalidLineNums.length > 0) {
+    issues.push({
+      severity: 'error',
+      code: 'INVALID_JSON_LINES',
+      message: `${plural(invalidLineNums.length, 'line')} could not be parsed as JSON objects.`,
+      affectedLines: invalidLineNums.slice(0, MAX_AFFECTED_LINES),
+      count: invalidLineNums.length,
+    });
+  }
+
+  if (parsedRows.length > 0) {
+    // 4. Schema detection on the first valid row
+    const firstRow = parsedRows[0].row;
+    const hasMessagesSchema = findMessagesArray(firstRow) !== null;
+    const hasPromptCompletionSchema =
+      PROMPT_KEYS.some((k) => k in firstRow) || COMPLETION_KEYS.some((k) => k in firstRow);
+
+    if (!hasMessagesSchema && !hasPromptCompletionSchema) {
+      issues.push({
+        severity: 'warning',
+        code: 'UNKNOWN_SCHEMA',
+        message:
+          'No recognized fine-tuning schema detected. Expected a messages array or prompt/completion fields.',
+      });
+    }
+
+    // 5. Null or empty field values across all valid rows
+    const nullFieldLines: number[] = [];
+    for (const { lineNum, row } of parsedRows) {
+      const hasNullOrEmpty = Object.values(row).some(
+        (v) => v === null || v === '' || (Array.isArray(v) && v.length === 0)
+      );
+      if (hasNullOrEmpty) nullFieldLines.push(lineNum);
+    }
+    if (nullFieldLines.length > 0) {
+      const verb = nullFieldLines.length === 1 ? 'contains' : 'contain';
+      issues.push({
+        severity: 'warning',
+        code: 'NULL_OR_EMPTY_FIELDS',
+        message: `${plural(nullFieldLines.length, 'row')} ${verb} null or empty field values.`,
+        affectedLines: nullFieldLines.slice(0, MAX_AFFECTED_LINES),
+        count: nullFieldLines.length,
+      });
+    }
+
+    // 6. Long entries — rough token estimate via character count
+    const longLines = scanLines
+      .filter(({ content }) => content.length > LONG_ENTRY_CHAR_THRESHOLD)
+      .map(({ lineNum }) => lineNum);
+    if (longLines.length > 0) {
+      issues.push({
+        severity: 'warning',
+        code: 'LONG_ENTRIES',
+        message: `${plural(longLines.length, 'row')} may exceed the model's context window (~8,192 tokens).`,
+        affectedLines: longLines.slice(0, MAX_AFFECTED_LINES),
+        count: longLines.length,
+      });
+    }
+  }
+
+  const hasErrors = issues.some((i) => i.severity === 'error');
+  const hasWarnings = issues.some((i) => i.severity === 'warning');
+
+  return { fileName: file.name, hasErrors, hasWarnings, issues, scannedLines, totalLines };
+}
diff --git a/web/packages/studio/src/routes/FilesetNewRoute/index.spec.tsx b/web/packages/studio/src/routes/FilesetNewRoute/index.spec.tsx
index 961b780481..75872df048 100644
--- a/web/packages/studio/src/routes/FilesetNewRoute/index.spec.tsx
+++ b/web/packages/studio/src/routes/FilesetNewRoute/index.spec.tsx
@@ -1,7 +1,27 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
+import type { DatasetQualityReport } from '@nemo/common/src/utils/datasetQuality';
 import { ROUTE_PARAMS } from '@studio/constants/routes';
+
+vi.mock('@nemo/common/src/utils/datasetQuality', () => ({
+  checkDatasetQuality: vi.fn(),
+}));
+
+import { checkDatasetQuality } from '@nemo/common/src/utils/datasetQuality';
+const mockCheckDatasetQuality = vi.mocked(checkDatasetQuality);
+
+/**
+ * Test fixture: a report for a clean, fully scanned 10-line `train.jsonl`.
+ * Fields given in `overrides` replace the baseline values one by one.
+ */
+function makeQualityReport(overrides: Partial<DatasetQualityReport> = {}): DatasetQualityReport {
+  const verdict = { hasErrors: false, hasWarnings: false, issues: [] };
+  const lineCounts = { scannedLines: 10, totalLines: 10 };
+  const baseline: DatasetQualityReport = { fileName: 'train.jsonl', ...verdict, ...lineCounts };
+
+  return { ...baseline, ...overrides };
+}
 import { workspace1 } from '@studio/mocks/entity-store/projects';
 import { FilesetNewRoute } from '@studio/routes/FilesetNewRoute';
 import { mockUseNavigate, mockUseParams } from '@studio/tests/util/mockUseParams';
@@ -179,4 +199,123 @@ describe('FilesetNewRoute', () => {
 
     expect(screen.queryByText(/must start with a lowercase letter/i)).not.toBeInTheDocument();
   });
+
+  describe('dataset quality validation', () => {
+    beforeEach(() => {
+      mockCheckDatasetQuality.mockReset();
+    });
+
+    /** Selects a one-row JSONL file named `train.jsonl` through the page's file input. */
+    async function selectJsonlFile(user: ReturnType<typeof userEvent.setup>) {
+      const file = new File(['{"prompt":"q","completion":"a"}'], 'train.jsonl', {
+        type: 'application/x-jsonlines',
+      });
+      const input = document.querySelector('input[type="file"]') as HTMLInputElement;
+      await user.upload(input, file);
+    }
+
+    /**
+     * Common arrange step: makes the mocked checker resolve to a baseline report patched with
+     * `overrides`, renders the route and selects a JSONL file. Returns the user-event instance.
+     */
+    async function renderAndSelect(overrides: Partial<DatasetQualityReport> = {}) {
+      mockCheckDatasetQuality.mockResolvedValue(makeQualityReport(overrides));
+      const user = userEvent.setup();
+      renderRoute();
+      await selectJsonlFile(user);
+      return user;
+    }
+
+    it('shows quality report after uploading a JSONL file when purpose is Dataset', async () => {
+      await renderAndSelect({ hasErrors: false, hasWarnings: false });
+
+      expect(await screen.findByText(/all quality checks passed/i)).toBeInTheDocument();
+    });
+
+    it('shows error issues from the quality report', async () => {
+      await renderAndSelect({
+        hasErrors: true,
+        issues: [
+          {
+            severity: 'error',
+            code: 'INVALID_JSON_LINES',
+            message: '2 lines could not be parsed as JSON objects.',
+            affectedLines: [3, 7],
+            count: 2,
+          },
+        ],
+      });
+
+      expect(
+        await screen.findByText(/2 lines could not be parsed as JSON objects/i)
+      ).toBeInTheDocument();
+    });
+
+    it('shows warning issues from the quality report', async () => {
+      await renderAndSelect({
+        hasWarnings: true,
+        issues: [
+          {
+            severity: 'warning',
+            code: 'UNKNOWN_SCHEMA',
+            message: 'No recognized fine-tuning schema detected.',
+          },
+        ],
+      });
+
+      expect(
+        await screen.findByText(/No recognized fine-tuning schema detected/i)
+      ).toBeInTheDocument();
+    });
+
+    it('disables the Create Fileset button when quality report has errors', async () => {
+      await renderAndSelect({
+        hasErrors: true,
+        issues: [{ severity: 'error', code: 'EMPTY_FILE', message: 'File is empty.' }],
+      });
+
+      await screen.findByText(/File is empty/i);
+      expect(await screen.findByRole('button', { name: 'Create Fileset' })).toBeDisabled();
+    });
+
+    it('does not disable submit for warning-only reports', async () => {
+      await renderAndSelect({
+        hasWarnings: true,
+        issues: [
+          { severity: 'warning', code: 'LONG_ENTRIES', message: '1 row may exceed context window.' },
+        ],
+      });
+
+      await screen.findByText(/1 row may exceed context window/i);
+      expect(await screen.findByRole('button', { name: 'Create Fileset' })).not.toBeDisabled();
+    });
+
+    it('does not show quality report section when purpose is not Dataset', async () => {
+      mockCheckDatasetQuality.mockResolvedValue(makeQualityReport());
+      const user = userEvent.setup();
+      renderRoute();
+
+      // Switch to Generic purpose before any file is selected
+      await user.click(await screen.findByRole('radio', { name: 'Generic' }));
+      await selectJsonlFile(user);
+
+      expect(screen.queryByText(/all quality checks passed/i)).not.toBeInTheDocument();
+      expect(mockCheckDatasetQuality).not.toHaveBeenCalled();
+    });
+
+    it('clears quality reports when switching to the Sample Dataset tab', async () => {
+      const user = await renderAndSelect({ hasErrors: false, hasWarnings: false });
+      await screen.findByText(/all quality checks passed/i);
+
+      await user.click(await screen.findByText('Sample Dataset'));
+
+      expect(screen.queryByText(/all quality checks passed/i)).not.toBeInTheDocument();
+    });
+
+    it('shows scanned-lines note when file has more lines than the scan limit', async () => {
+      await renderAndSelect({ scannedLines: 1000, totalLines: 5000 });
+
+      expect(await screen.findByText(/Scanned first 1,000 of 5,000 lines/i)).toBeInTheDocument();
+    });
+  });
 });
diff --git a/web/packages/studio/src/routes/FilesetNewRoute/index.tsx b/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
index 74b252975a..4f6af8f027 100644
--- a/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
+++ b/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
@@ -8,6 +8,8 @@ import { RadioCard } from '@nemo/common/src/components/RadioCard';
 import { getEntityReference } from '@nemo/common/src/namedEntity';
 import { useToast } from '@nemo/common/src/providers/toast/useToast';
 import { FILESET_NAME_MAX_LENGTH, FILESET_NAME_REGEXP } from '@nemo/common/src/utils/filesetName';
+import { checkDatasetQuality } from '@nemo/common/src/utils/datasetQuality';
+import type { DatasetQualityReport } from '@nemo/common/src/utils/datasetQuality';
 import {
   filesUploadFile,
   getFilesListFilesetFilesQueryKey,
@@ -59,7 +61,7 @@ import {
   storageConfigFromUrl,
 } from '@studio/util/storageConfigFromUrl';
 import { QueryObserverResult, useQueryClient } from '@tanstack/react-query';
-import { FileCheck } from 'lucide-react';
+import { AlertTriangle, FileCheck, XCircle, CheckCircle2 } from 'lucide-react';
 import { FC, useCallback, useMemo, useRef, useState } from 'react';
 import { Controller, useForm } from 'react-hook-form';
 import { useNavigate } from 'react-router-dom';
@@ -168,6 +170,59 @@ function toFileList(value: unknown): File[] {
   );
 }
 
+interface DatasetQualityReportViewProps {
+  report: DatasetQualityReport;
+}
+
+/** Renders one file's quality report: a "passed" line when clean, else one row per issue. */
+const DatasetQualityReportView: FC<DatasetQualityReportViewProps> = ({ report }) => {
+  const { fileName, hasErrors, hasWarnings, issues, scannedLines, totalLines } = report;
+  const isClean = !hasErrors && !hasWarnings;
+  const partialScanNote = scannedLines < totalLines && (
+    <Text kind="body/regular/sm" color="secondary">
+      Scanned first {scannedLines.toLocaleString()} of {totalLines.toLocaleString()} lines.
+    </Text>
+  );
+
+  const issueRows = issues.map((issue, idx) => {
+    const lines = issue.affectedLines ?? [];
+    const hiddenCount = (issue.count ?? 0) - lines.length;
+    const Icon = issue.severity === 'error' ? XCircle : AlertTriangle;
+    const tone = issue.severity === 'error' ? 'text-red-500' : 'text-amber-500';
+    return (
+      <Flex key={idx} gap="density-sm" align="start">
+        <Icon size={16} className={`${tone} shrink-0 mt-0.5`} />
+        <Stack gap="density-xs">
+          <Text kind="body/regular/sm">{issue.message}</Text>
+          {lines.length > 0 && (
+            <Text kind="body/regular/sm" color="secondary">
+              {lines.length > 1 ? 'Lines: ' : 'Line: '}
+              {lines.join(', ')}
+              {hiddenCount > 0 ? ` (+${hiddenCount} more)` : ''}
+            </Text>
+          )}
+        </Stack>
+      </Flex>
+    );
+  });
+
+  return isClean ? (
+    <Stack gap="density-xs">
+      <Flex gap="density-sm" align="center">
+        <CheckCircle2 size={16} className="text-green-500 shrink-0" />
+        <Text kind="body/regular/sm">{fileName}: all quality checks passed.</Text>
+      </Flex>
+      {partialScanNote}
+    </Stack>
+  ) : (
+    <Stack gap="density-sm">
+      <Text kind="label/bold/sm">{fileName}</Text>
+      {issueRows}
+      {partialScanNote}
+    </Stack>
+  );
+};
+
 export const FilesetNewRoute: FC = () => {
   const workspace = useWorkspaceFromPath();
   const [activeTab, setActiveTab] = useState<DatasetType>(DATASET_TYPE_CUSTOM);
@@ -175,6 +230,8 @@ export const FilesetNewRoute: FC = () => {
     SAMPLE_DATASETS[0]
   );
   const [isSubmitPending, setIsSubmitPending] = useState(false);
+  const [qualityReports, setQualityReports] = useState<DatasetQualityReport[]>([]);
+  const [isValidating, setIsValidating] = useState(false);
   const navigate = useNavigate();
   const toast = useToast();
   const queryClient = useQueryClient();
@@ -205,6 +262,7 @@ export const FilesetNewRoute: FC = () => {
   });
 
   const url = watch('url');
+  const purpose = watch('purpose');
   const selectedSecretName = watch('secretKey');
   const secretKeyLabel = useMemo(() => {
     if (!url?.trim()) return 'Secret Key';
@@ -272,6 +330,33 @@ export const FilesetNewRoute: FC = () => {
     []
   );
 
+  /**
+   * Stores the selected files and quality-checks the `.jsonl` ones (any letter case) when purpose
+   * is 'dataset'. A failing check clears the reports and raises a toast instead of rejecting.
+   */
+  const handleFilesChange = useCallback(
+    async (files: File[]) => {
+      setValue('files', files, { shouldValidate: false });
+
+      const isDataset = purpose === FilesetPurpose.dataset;
+      const jsonlFiles = isDataset ? files.filter((f) => /\.jsonl$/i.test(f.name)) : [];
+      if (jsonlFiles.length === 0) {
+        setQualityReports([]);
+        return;
+      }
+      setIsValidating(true);
+      try {
+        setQualityReports(await Promise.all(jsonlFiles.map(checkDatasetQuality)));
+      } catch {
+        setQualityReports([]);
+        toast.error('Could not run dataset quality checks on the selected files.');
+      } finally {
+        setIsValidating(false);
+      }
+    },
+    [purpose, setValue, toast]
+  );
+
   // Sync hidden name/description when a sample is selected (sample tab = simulated local form)
   const handleSelectSample = useCallback(
     (dataset: SampleDataset) => {
@@ -282,10 +367,11 @@ export const FilesetNewRoute: FC = () => {
     [workspace, setValue]
   );
 
-  // When switching tabs, reset the opposite tab’s form state so we don’t leak values
+  // When switching tabs, reset the opposite tab's form state so we don't leak values
   const handleTabChange = useCallback(
     (value: DatasetType) => {
       setActiveTab(value);
+      setQualityReports([]);
       if (value === DATASET_TYPE_CUSTOM) {
         setValue('name', '', { shouldValidate: false });
         setValue('description', '', { shouldValidate: false });
@@ -311,6 +397,10 @@ export const FilesetNewRoute: FC = () => {
     ]
   );
 
+  // Quality reports are only rendered while purpose is 'dataset', so they may only block submit
+  // in that case — otherwise the button would stay disabled with no visible explanation.
+  const hasValidationErrors =
+    purpose === FilesetPurpose.dataset && qualityReports.some((r) => r.hasErrors);
+
   const onSubmit = useCallback(
     async (data: DatasetFormFields) => {
       const { success, error } = DatasetCreateFilesetFormSchema.safeParse(data);
@@ -319,6 +407,11 @@ export const FilesetNewRoute: FC = () => {
         return;
       }
 
+      if (hasValidationErrors) {
+        toast.error('Fix dataset validation errors before creating this fileset.');
+        return;
+      }
+
       setIsSubmitPending(true);
 
       // Step 1 (sample only): fetch sample files via lazy query
@@ -427,6 +520,7 @@ export const FilesetNewRoute: FC = () => {
       activeTab,
       createFilesetStep,
       getValues,
+      hasValidationErrors,
       navigate,
       storageTab,
       toast,
@@ -464,7 +558,7 @@ export const FilesetNewRoute: FC = () => {
             <Button
               type="button"
               color="brand"
-              disabled={isSubmitPending}
+              disabled={isSubmitPending || hasValidationErrors}
               onClick={handleSubmit(
                 onSubmit,
                 handleFormErrorsGeneric({ title: 'Fileset New Form Errors' })
@@ -564,6 +658,8 @@ export const FilesetNewRoute: FC = () => {
                         if (next === 'local') {
                           setValue('url', '', { shouldValidate: false });
                           setValue('secretKey', '', { shouldValidate: false });
+                        } else {
+                          setQualityReports([]);
                         }
                       }}
                     >
@@ -572,16 +668,34 @@ export const FilesetNewRoute: FC = () => {
                         <TabsTrigger value="external">External</TabsTrigger>
                       </TabsList>
                       <TabsContent className="w-full min-w-0 p-0 items-stretch" value="local">
-                        <Upload
-                          accept="text/csv,text/json,.jsonl,.parquet"
-                          multiple
-                          onValueChange={(files) => {
-                            const list = Array.isArray(files) ? files : files ? [files] : undefined;
-                            setValue('files', toFileList(list), { shouldValidate: false });
-                          }}
-                        >
-                          Supports JSONL, CSV, and Parquet files up to 50 MB.
-                        </Upload>
+                        <Stack gap="density-md" className="w-full">
+                          <Upload
+                            accept="text/csv,text/json,.jsonl,.parquet"
+                            multiple
+                            onValueChange={(files) => {
+                              const list = Array.isArray(files)
+                                ? files
+                                : files
+                                  ? [files]
+                                  : undefined;
+                              void handleFilesChange(toFileList(list));
+                            }}
+                          >
+                            Supports JSONL, CSV, and Parquet files up to 50 MB.
+                          </Upload>
+                          {purpose === FilesetPurpose.dataset && (
+                            <Stack gap="density-sm">
+                              {isValidating && (
+                                <Text kind="body/regular/sm" color="secondary">
+                                  Checking file quality…
+                                </Text>
+                              )}
+                              {qualityReports.map((report) => (
+                                <DatasetQualityReportView key={report.fileName} report={report} />
+                              ))}
+                            </Stack>
+                          )}
+                        </Stack>
                       </TabsContent>
                       <TabsContent className="w-full min-w-0 p-0 items-stretch" value="external">
                         <Stack gap="density-lg" className="w-full min-w-0">

From 777669d250ca2124b742abdbb8db7185986cae0c Mon Sep 17 00:00:00 2001
From: Alex Ray <alray@nvidia.com>
Date: Tue, 9 Jun 2026 14:20:18 -0700
Subject: [PATCH 2/8] fix(studio): resolve lint errors in dataset quality
 validation (ASTD-73)

Signed-off-by: Alex Ray <alray@nvidia.com>
---
 .../src/routes/FilesetNewRoute/index.spec.tsx | 35 +++++++++++++------
 .../src/routes/FilesetNewRoute/index.tsx      | 10 +++---
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/web/packages/studio/src/routes/FilesetNewRoute/index.spec.tsx b/web/packages/studio/src/routes/FilesetNewRoute/index.spec.tsx
index 75872df048..25139df904 100644
--- a/web/packages/studio/src/routes/FilesetNewRoute/index.spec.tsx
+++ b/web/packages/studio/src/routes/FilesetNewRoute/index.spec.tsx
@@ -1,14 +1,23 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-import type { DatasetQualityReport } from '@nemo/common/src/utils/datasetQuality';
+import {
+  checkDatasetQuality,
+  type DatasetQualityReport,
+} from '@nemo/common/src/utils/datasetQuality';
 import { ROUTE_PARAMS } from '@studio/constants/routes';
+import { workspace1 } from '@studio/mocks/entity-store/projects';
+import { FilesetNewRoute } from '@studio/routes/FilesetNewRoute';
+import { mockUseNavigate, mockUseParams } from '@studio/tests/util/mockUseParams';
+import { render, screen } from '@studio/tests/util/render';
+import { TestProviders } from '@studio/tests/util/TestProviders';
+import { within } from '@testing-library/react';
+import userEvent from '@testing-library/user-event';
 
 vi.mock('@nemo/common/src/utils/datasetQuality', () => ({
   checkDatasetQuality: vi.fn(),
 }));
 
-import { checkDatasetQuality } from '@nemo/common/src/utils/datasetQuality';
 const mockCheckDatasetQuality = vi.mocked(checkDatasetQuality);
 
 function makeQualityReport(overrides: Partial<DatasetQualityReport> = {}): DatasetQualityReport {
@@ -22,13 +31,6 @@ function makeQualityReport(overrides: Partial<DatasetQualityReport> = {}): Datas
     ...overrides,
   };
 }
-import { workspace1 } from '@studio/mocks/entity-store/projects';
-import { FilesetNewRoute } from '@studio/routes/FilesetNewRoute';
-import { mockUseNavigate, mockUseParams } from '@studio/tests/util/mockUseParams';
-import { render, screen } from '@studio/tests/util/render';
-import { TestProviders } from '@studio/tests/util/TestProviders';
-import { within } from '@testing-library/react';
-import userEvent from '@testing-library/user-event';
 
 const renderRoute = () => {
   return render(
@@ -212,6 +214,9 @@ describe('FilesetNewRoute', () => {
     }
 
     async function uploadFile(user: ReturnType<typeof userEvent.setup>, file: File) {
+      // The Upload component renders a visually hidden file input with no accessible name;
+      // querySelector is the only reliable way to reach it in jsdom.
+      // eslint-disable-next-line testing-library/no-node-access
       const input = document.querySelector('input[type="file"]') as HTMLInputElement;
       await user.upload(input, file);
     }
@@ -271,7 +276,9 @@ describe('FilesetNewRoute', () => {
 
       await uploadFile(user, makeJsonlFile());
 
-      expect(await screen.findByText(/No recognized fine-tuning schema detected/i)).toBeInTheDocument();
+      expect(
+        await screen.findByText(/No recognized fine-tuning schema detected/i)
+      ).toBeInTheDocument();
     });
 
     it('disables the Create Fileset button when quality report has errors', async () => {
@@ -294,7 +301,13 @@ describe('FilesetNewRoute', () => {
       mockCheckDatasetQuality.mockResolvedValue(
         makeQualityReport({
           hasWarnings: true,
-          issues: [{ severity: 'warning', code: 'LONG_ENTRIES', message: '1 row may exceed context window.' }],
+          issues: [
+            {
+              severity: 'warning',
+              code: 'LONG_ENTRIES',
+              message: '1 row may exceed context window.',
+            },
+          ],
         })
       );
       const user = userEvent.setup();
diff --git a/web/packages/studio/src/routes/FilesetNewRoute/index.tsx b/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
index 4f6af8f027..783ce15aca 100644
--- a/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
+++ b/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
@@ -7,9 +7,11 @@ import { ControlledTextInput } from '@nemo/common/src/components/form/Controlled
 import { RadioCard } from '@nemo/common/src/components/RadioCard';
 import { getEntityReference } from '@nemo/common/src/namedEntity';
 import { useToast } from '@nemo/common/src/providers/toast/useToast';
+import {
+  checkDatasetQuality,
+  type DatasetQualityReport,
+} from '@nemo/common/src/utils/datasetQuality';
 import { FILESET_NAME_MAX_LENGTH, FILESET_NAME_REGEXP } from '@nemo/common/src/utils/filesetName';
-import { checkDatasetQuality } from '@nemo/common/src/utils/datasetQuality';
-import type { DatasetQualityReport } from '@nemo/common/src/utils/datasetQuality';
 import {
   filesUploadFile,
   getFilesListFilesetFilesQueryKey,
@@ -177,8 +179,8 @@ interface DatasetQualityReportViewProps {
 const DatasetQualityReportView: FC<DatasetQualityReportViewProps> = ({ report }) => {
   const partialScanNote = report.scannedLines < report.totalLines && (
     <Text kind="body/regular/sm" color="secondary">
-      Scanned first {report.scannedLines.toLocaleString()} of{' '}
-      {report.totalLines.toLocaleString()} lines.
+      Scanned first {report.scannedLines.toLocaleString()} of {report.totalLines.toLocaleString()}{' '}
+      lines.
     </Text>
   );
 

From 6533e28775a727eff9996147ec266503e4b9639c Mon Sep 17 00:00:00 2001
From: Alex Ray <alray@nvidia.com>
Date: Wed, 10 Jun 2026 12:22:33 -0700
Subject: [PATCH 3/8] fix(studio): scroll quality report into view on file
 validation (ASTD-73)

Signed-off-by: Alex Ray <alray@nvidia.com>
---
 .../studio/src/routes/FilesetNewRoute/index.tsx      | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/web/packages/studio/src/routes/FilesetNewRoute/index.tsx b/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
index 783ce15aca..75fe7c87f8 100644
--- a/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
+++ b/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
@@ -64,7 +64,7 @@ import {
 } from '@studio/util/storageConfigFromUrl';
 import { QueryObserverResult, useQueryClient } from '@tanstack/react-query';
 import { AlertTriangle, FileCheck, XCircle, CheckCircle2 } from 'lucide-react';
-import { FC, useCallback, useMemo, useRef, useState } from 'react';
+import { FC, useCallback, useEffect, useMemo, useRef, useState } from 'react';
 import { Controller, useForm } from 'react-hook-form';
 import { useNavigate } from 'react-router-dom';
 import { z } from 'zod';
@@ -234,6 +234,14 @@ export const FilesetNewRoute: FC = () => {
   const [isSubmitPending, setIsSubmitPending] = useState(false);
   const [qualityReports, setQualityReports] = useState<DatasetQualityReport[]>([]);
   const [isValidating, setIsValidating] = useState(false);
+  const qualityReportRef = useRef<HTMLDivElement>(null);
+
+  useEffect(() => {
+    if (qualityReports.length > 0) {
+      qualityReportRef.current?.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
+    }
+  }, [qualityReports]);
+
   const navigate = useNavigate();
   const toast = useToast();
   const queryClient = useQueryClient();
@@ -686,7 +694,7 @@ export const FilesetNewRoute: FC = () => {
                             Supports JSONL, CSV, and Parquet files up to 50 MB.
                           </Upload>
                           {purpose === FilesetPurpose.dataset && (
-                            <Stack gap="density-sm">
+                            <Stack gap="density-sm" ref={qualityReportRef}>
                               {isValidating && (
                                 <Text kind="body/regular/sm" color="secondary">
                                   Checking file quality…

From 199f23981766ee62b9b69fb8fba5d9b5082df055 Mon Sep 17 00:00:00 2001
From: Alex Ray <alray@nvidia.com>
Date: Wed, 10 Jun 2026 12:51:56 -0700
Subject: [PATCH 4/8] refactor(studio): move getTextWithCount to common

Signed-off-by: Alex Ray <alray@nvidia.com>
---
 web/packages/common/src/utils/datasetQuality.ts | 11 ++++-------
 web/packages/common/src/utils/formatters.ts     | 17 +++++++++++++++++
 web/packages/studio/src/util/strings.ts         | 17 +----------------
 3 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/web/packages/common/src/utils/datasetQuality.ts b/web/packages/common/src/utils/datasetQuality.ts
index 7531d3440a..b6e1f8894c 100644
--- a/web/packages/common/src/utils/datasetQuality.ts
+++ b/web/packages/common/src/utils/datasetQuality.ts
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 import { findMessagesArray } from '@nemo/common/src/utils/file';
+import { getTextWithCount } from '@nemo/common/src/utils/formatters';
 
 export type DatasetQualityCode =
   | 'EMPTY_FILE'
@@ -39,10 +40,6 @@ const LONG_ENTRY_CHAR_THRESHOLD = 32_768;
 const PROMPT_KEYS = ['prompt', 'question'];
 const COMPLETION_KEYS = ['completion', 'ideal_response', 'response', 'output', 'answer'];
 
-function plural(n: number, word: string): string {
-  return `${n} ${word}${n === 1 ? '' : 's'}`;
-}
-
 /**
  * Runs dataset quality checks on a JSONL file and returns a structured report.
  * Errors indicate the file should not be uploaded as-is; warnings are advisory.
@@ -127,7 +124,7 @@ export async function checkDatasetQuality(file: File): Promise<DatasetQualityRep
     issues.push({
       severity: 'error',
       code: 'INVALID_JSON_LINES',
-      message: `${plural(invalidLineNums.length, 'line')} could not be parsed as JSON objects.`,
+      message: `${getTextWithCount('line', invalidLineNums.length)} could not be parsed as JSON objects.`,
       affectedLines: invalidLineNums.slice(0, 10),
       count: invalidLineNums.length,
     });
@@ -161,7 +158,7 @@ export async function checkDatasetQuality(file: File): Promise<DatasetQualityRep
       issues.push({
         severity: 'warning',
         code: 'NULL_OR_EMPTY_FIELDS',
-        message: `${plural(nullFieldLines.length, 'row')} contain null or empty field values.`,
+        message: `${getTextWithCount('row', nullFieldLines.length)} contain null or empty field values.`,
         affectedLines: nullFieldLines.slice(0, 10),
         count: nullFieldLines.length,
       });
@@ -178,7 +175,7 @@ export async function checkDatasetQuality(file: File): Promise<DatasetQualityRep
       issues.push({
         severity: 'warning',
         code: 'LONG_ENTRIES',
-        message: `${plural(longLines.length, 'row')} may exceed the model's context window (~8,192 tokens).`,
+        message: `${getTextWithCount('row', longLines.length)} may exceed the model's context window (~8,192 tokens).`,
         affectedLines: longLines.slice(0, 10),
         count: longLines.length,
       });
diff --git a/web/packages/common/src/utils/formatters.ts b/web/packages/common/src/utils/formatters.ts
index 13eafdfea3..e7949d9d1e 100644
--- a/web/packages/common/src/utils/formatters.ts
+++ b/web/packages/common/src/utils/formatters.ts
@@ -1,6 +1,23 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
+/**
+ * Returns a formatted string with count and properly pluralized text.
+ * @param text - The singular form of the word (e.g., "entry", "file")
+ * @param count - The number to display
+ * @param plural - Optional custom plural form for irregular words (e.g., "entries", "children").
+ *                 If not provided, defaults to appending 's' to the text.
+ * @returns Formatted string like "1 entry" or "3 entries"
+ * @example
+ * getTextWithCount('file', 1)           // "1 file"
+ * getTextWithCount('file', 3)           // "3 files"
+ * getTextWithCount('entry', 2, 'entries') // "2 entries"
+ */
+export const getTextWithCount = (text: string, count: number, plural?: string) => {
+  const pluralForm = plural ?? `${text}s`;
+  return `${count} ${count !== 1 ? pluralForm : text}`;
+};
+
 /**
  * Truncates a long string of text to the length specified by `maxCharacters` by replacing a
  * section of the text with an ellipsis.
diff --git a/web/packages/studio/src/util/strings.ts b/web/packages/studio/src/util/strings.ts
index edbb489ad3..3e851ea97c 100644
--- a/web/packages/studio/src/util/strings.ts
+++ b/web/packages/studio/src/util/strings.ts
@@ -4,22 +4,7 @@
 import { Row } from '@studio/util/files';
 import Papa from 'papaparse';
 
-/**
- * Returns a formatted string with count and properly pluralized text.
- * @param text - The singular form of the word (e.g., "entry", "file")
- * @param count - The number to display
- * @param plural - Optional custom plural form for irregular words (e.g., "entries", "children").
- *                 If not provided, defaults to appending 's' to the text.
- * @returns Formatted string like "1 entry" or "3 entries"
- * @example
- * getTextWithCount('file', 1)           // "1 file"
- * getTextWithCount('file', 3)           // "3 files"
- * getTextWithCount('entry', 2, 'entries') // "2 entries"
- */
-export const getTextWithCount = (text: string, count: number, plural?: string) => {
-  const pluralForm = plural ?? `${text}s`;
-  return `${count} ${count !== 1 ? pluralForm : text}`;
-};
+export { getTextWithCount } from '@nemo/common/src/utils/formatters';
 
 export const capitalize = (str: string) => {
   return str.charAt(0).toUpperCase() + str.slice(1);

From a1c4ab3c698d206bef9a4e2ba8cbca994a755ae6 Mon Sep 17 00:00:00 2001
From: Alex Ray <alray@nvidia.com>
Date: Thu, 11 Jun 2026 16:27:00 -0700
Subject: [PATCH 5/8] refactor(studio): remove inline DatasetQualityReportView
 from FilesetNewRoute

Signed-off-by: Alex Ray <alray@nvidia.com>
---
 .../src/routes/FilesetNewRoute/index.tsx      | 53 -------------------
 1 file changed, 53 deletions(-)

diff --git a/web/packages/studio/src/routes/FilesetNewRoute/index.tsx b/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
index e87063c619..0d9c3792a4 100644
--- a/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
+++ b/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
@@ -174,59 +174,6 @@ function toFileList(value: unknown): File[] {
   );
 }
 
-interface DatasetQualityReportViewProps {
-  report: DatasetQualityReport;
-}
-
-const DatasetQualityReportView: FC<DatasetQualityReportViewProps> = ({ report }) => {
-  const partialScanNote = report.scannedLines < report.totalLines && (
-    <Text kind="body/regular/sm" color="secondary">
-      Scanned first {report.scannedLines.toLocaleString()} of {report.totalLines.toLocaleString()}{' '}
-      lines.
-    </Text>
-  );
-
-  if (!report.hasErrors && !report.hasWarnings) {
-    return (
-      <Stack gap="density-xs">
-        <Flex gap="density-sm" align="center">
-          <CheckCircle2 size={16} className="text-green-500 shrink-0" />
-          <Text kind="body/regular/sm">{report.fileName}: all quality checks passed.</Text>
-        </Flex>
-        {partialScanNote}
-      </Stack>
-    );
-  }
-
-  return (
-    <Stack gap="density-sm">
-      <Text kind="label/bold/sm">{report.fileName}</Text>
-      {report.issues.map((issue, idx) => (
-        <Flex key={idx} gap="density-sm" align="start">
-          {issue.severity === 'error' ? (
-            <XCircle size={16} className="text-red-500 shrink-0 mt-0.5" />
-          ) : (
-            <AlertTriangle size={16} className="text-amber-500 shrink-0 mt-0.5" />
-          )}
-          <Stack gap="density-xs">
-            <Text kind="body/regular/sm">{issue.message}</Text>
-            {issue.affectedLines && issue.affectedLines.length > 0 && (
-              <Text kind="body/regular/sm" color="secondary">
-                {'Line' + (issue.affectedLines.length > 1 ? 's' : '') + ': '}
-                {issue.affectedLines.join(', ')}
-                {issue.count && issue.count > issue.affectedLines.length
-                  ? ` (+${issue.count - issue.affectedLines.length} more)`
-                  : ''}
-              </Text>
-            )}
-          </Stack>
-        </Flex>
-      ))}
-      {partialScanNote}
-    </Stack>
-  );
-};
-
 export const FilesetNewRoute: FC = () => {
   const workspace = useWorkspaceFromPath();
   const [activeTab, setActiveTab] = useState<DatasetType>(DATASET_TYPE_CUSTOM);

From ecba934e82fa40769e10812654d90b34816636e4 Mon Sep 17 00:00:00 2001
From: Alex Ray <alray@nvidia.com>
Date: Thu, 11 Jun 2026 16:28:39 -0700
Subject: [PATCH 6/8] fix(studio): drop unused lucide-react icon imports

Signed-off-by: Alex Ray <alray@nvidia.com>
---
 web/packages/studio/src/routes/FilesetNewRoute/index.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web/packages/studio/src/routes/FilesetNewRoute/index.tsx b/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
index 0d9c3792a4..7929c8826e 100644
--- a/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
+++ b/web/packages/studio/src/routes/FilesetNewRoute/index.tsx
@@ -65,7 +65,7 @@ import {
   storageConfigFromUrl,
 } from '@studio/util/storageConfigFromUrl';
 import { QueryObserverResult, useQueryClient } from '@tanstack/react-query';
-import { AlertTriangle, FileCheck, XCircle, CheckCircle2 } from 'lucide-react';
+import { FileCheck } from 'lucide-react';
 import { FC, useCallback, useEffect, useMemo, useRef, useState } from 'react';
 import { Controller, useForm } from 'react-hook-form';
 import { useNavigate } from 'react-router-dom';

From e1c5264580bf0edd71cf66011d70b0230c68eb05 Mon Sep 17 00:00:00 2001
From: Alex Ray <alray@nvidia.com>
Date: Fri, 12 Jun 2026 14:06:51 -0700
Subject: [PATCH 7/8] test(common): move getTextWithCount tests to common,
 reword null-field warning

Signed-off-by: Alex Ray <alray@nvidia.com>
---
 .../common/src/utils/datasetQuality.ts         |  2 +-
 .../common/src/utils/formatters.spec.ts        | 18 ++++++++++++++++++
 web/packages/studio/src/util/strings.spec.ts   | 16 +---------------
 3 files changed, 20 insertions(+), 16 deletions(-)
 create mode 100644 web/packages/common/src/utils/formatters.spec.ts

diff --git a/web/packages/common/src/utils/datasetQuality.ts b/web/packages/common/src/utils/datasetQuality.ts
index c98b4a27ef..fa1346227d 100644
--- a/web/packages/common/src/utils/datasetQuality.ts
+++ b/web/packages/common/src/utils/datasetQuality.ts
@@ -160,7 +160,7 @@ export async function checkDatasetQuality(file: File): Promise<DatasetQualityRep
       issues.push({
         severity: 'warning',
         code: 'NULL_OR_EMPTY_FIELDS',
-        message: `${getTextWithCount('row', nullFieldLines.length)} contain null or empty field values.`,
+        message: `${getTextWithCount('row', nullFieldLines.length)} contains null or empty field values.`,
         affectedLines: nullFieldLines.slice(0, MAX_AFFECTED_LINE_COUNT),
         count: nullFieldLines.length,
       });
diff --git a/web/packages/common/src/utils/formatters.spec.ts b/web/packages/common/src/utils/formatters.spec.ts
new file mode 100644
index 0000000000..dead8578a2
--- /dev/null
+++ b/web/packages/common/src/utils/formatters.spec.ts
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { getTextWithCount } from '@nemo/common/src/utils/formatters';
+
+describe('#getTextWithCount', () => {
+  it('should return the correct text with count using default suffix', () => {
+    expect(getTextWithCount('test', 0)).toBe('0 tests');
+    expect(getTextWithCount('test', 1)).toBe('1 test');
+    expect(getTextWithCount('test', 2)).toBe('2 tests');
+  });
+
+  it('should return the correct text with count using plural', () => {
+    expect(getTextWithCount('entry', 0, 'entries')).toBe('0 entries');
+    expect(getTextWithCount('entry', 1, 'entries')).toBe('1 entry');
+    expect(getTextWithCount('entry', 2, 'entries')).toBe('2 entries');
+  });
+});
diff --git a/web/packages/studio/src/util/strings.spec.ts b/web/packages/studio/src/util/strings.spec.ts
index b882ff85cb..2ab8b828d7 100644
--- a/web/packages/studio/src/util/strings.spec.ts
+++ b/web/packages/studio/src/util/strings.spec.ts
@@ -1,21 +1,7 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-import { capitalize, formatKeyLabel, getTextWithCount, parseCSV } from '@studio/util/strings';
-
-describe('#getTextWithCount', () => {
-  it('should return the correct text with count using default suffix', () => {
-    expect(getTextWithCount('test', 0)).toBe('0 tests');
-    expect(getTextWithCount('test', 1)).toBe('1 test');
-    expect(getTextWithCount('test', 2)).toBe('2 tests');
-  });
-
-  it('should return the correct text with count using plural', () => {
-    expect(getTextWithCount('entry', 0, 'entries')).toBe('0 entries');
-    expect(getTextWithCount('entry', 1, 'entries')).toBe('1 entry');
-    expect(getTextWithCount('entry', 2, 'entries')).toBe('2 entries');
-  });
-});
+import { capitalize, formatKeyLabel, parseCSV } from '@studio/util/strings';
 
 describe('#formatKeyLabel', () => {
   it.each([

From e4455b5f660c91ae677eb6e0b931277d42669cbe Mon Sep 17 00:00:00 2001
From: Alex Ray <alray@nvidia.com>
Date: Fri, 12 Jun 2026 15:51:27 -0700
Subject: [PATCH 8/8] fix(sdk): pin FilesetMetadata models in $shared

Signed-off-by: Alex Ray <alray@nvidia.com>
---
 sdk/stainless.yaml | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/sdk/stainless.yaml b/sdk/stainless.yaml
index 7d5a9da0e6..473c2c6a45 100644
--- a/sdk/stainless.yaml
+++ b/sdk/stainless.yaml
@@ -254,8 +254,6 @@ resources:
       filesets:
         models:
           fileset_filter: FilesetFilter
-          fileset_metadata: FilesetMetadataOutput
-          fileset_metadata_param: FilesetMetadataInput
         methods:
           create: post /apis/files/v2/workspaces/{workspace}/filesets
           list: get /apis/files/v2/workspaces/{workspace}/filesets
@@ -710,12 +708,12 @@ resources:
       auth_context: AuthContext
       tool_call_config: ToolCallConfig
       model_metadata_content: ModelMetadataContent
-      # DatasetMetadataContent is referenced by the shared FilesetMetadata schema.
-      # Keep it in $shared so the generated import is intra-package
-      # (shared/fileset_metadata -> shared/dataset_metadata_content) and avoids the
-      # circular import that occurs when it lives under the `files` resource:
-      #   shared/fileset_metadata -> files/dataset_metadata_content -> files/__init__
-      #     -> files/fileset -> shared/fileset_metadata
+      # FilesetMetadata is referenced by multiple resources (files, models) at the source
+      # level. Pin both the Output and Input variants in $shared so sync-models does not
+      # re-home them under `files`, which would break downstream imports such as
+      # `from nemo_platform.types.shared import FilesetMetadata` (models service tests).
+      fileset_metadata: FilesetMetadataOutput
+      fileset_metadata_param: FilesetMetadataInput
       dataset_metadata_content: DatasetMetadataContent
       tool_calling_metadata_content: ToolCallingMetadataContent
       backend_format: BackendFormat