From 19c34de7207312c965244db7280e3dc709e5666b Mon Sep 17 00:00:00 2001
From: Alex Ray
Date: Tue, 9 Jun 2026 11:14:59 -0700
Subject: [PATCH 1/8] feat(studio): add client-side dataset quality checks on
 JSONL upload (ASTD-73)

Validates JSONL files on file select when purpose=dataset, surfacing errors
(UTF-16 encoding, invalid JSON lines, empty file) and warnings (unknown schema,
null/empty fields, entries exceeding ~8192 tokens) inline before upload.
Submit is blocked until errors are resolved.

Signed-off-by: Alex Ray
---
 .../common/src/utils/datasetQuality.test.ts   | 191 ++++++++++++++++
 .../common/src/utils/datasetQuality.ts        | 192 +++++++++++++++
 .../src/routes/FilesetNewRoute/index.spec.tsx | 167 +++++++++++++
 .../src/routes/FilesetNewRoute/index.tsx      | 140 +++++++++--
 4 files changed, 677 insertions(+), 13 deletions(-)
 create mode 100644 web/packages/common/src/utils/datasetQuality.test.ts
 create mode 100644 web/packages/common/src/utils/datasetQuality.ts

diff --git a/web/packages/common/src/utils/datasetQuality.test.ts b/web/packages/common/src/utils/datasetQuality.test.ts
new file mode 100644
index 0000000000..ae0ddb8803
--- /dev/null
+++ b/web/packages/common/src/utils/datasetQuality.test.ts
@@ -0,0 +1,191 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { checkDatasetQuality } from '@nemo/common/src/utils/datasetQuality';
+
+type Report = Awaited<ReturnType<typeof checkDatasetQuality>>;
+type IssueCode = Report['issues'][number]['code'];
+
+const CHAT_ROW = {
+  messages: [
+    { role: 'user', content: 'hi' },
+    { role: 'assistant', content: 'hello' },
+  ],
+};
+const QA_ROW = { prompt: 'q', completion: 'a' };
+
+/** Builds a File whose text() and arrayBuffer() are stubbed; `bytes` overrides the raw buffer. */
+function makeFile(content: string, name = 'train.jsonl', bytes?: Uint8Array): File {
+  const rawBuffer = (bytes ?? new TextEncoder().encode(content)).buffer;
+  const file = new File([content], name, { type: 'application/x-jsonlines' });
+  file.text = vi.fn().mockResolvedValue(content);
+  file.arrayBuffer = vi.fn().mockResolvedValue(rawBuffer);
+  return file;
+}
+
+/** Builds a file whose stubbed buffer starts with a UTF-16 byte-order mark (LE by default). */
+function utf16File(content: string, le = true): File {
+  const bom = le ? [0xff, 0xfe] : [0xfe, 0xff];
+  const payload = Uint8Array.from([...bom, ...new TextEncoder().encode(content)]);
+  return makeFile(content, 'train.jsonl', payload);
+}
+
+/** Serializes each row onto its own JSON line. */
+function jsonlLines(rows: object[]): string {
+  return rows.map((row) => JSON.stringify(row)).join('\n');
+}
+
+/** Runs the checker against an in-memory file holding `content`. */
+function check(content: string): Promise<Report> {
+  return checkDatasetQuality(makeFile(content));
+}
+
+/** Returns the first issue carrying `code`, or undefined when the report has none. */
+function issueOf(report: Report, code: IssueCode) {
+  return report.issues.find((issue) => issue.code === code);
+}
+
+describe('checkDatasetQuality', () => {
+  describe('encoding', () => {
+    it('returns INVALID_ENCODING error for UTF-16 LE', async () => {
+      const report = await checkDatasetQuality(utf16File('some content', true));
+      expect(report.hasErrors).toBe(true);
+      expect(report.issues[0].code).toBe('INVALID_ENCODING');
+      expect(report.issues[0].severity).toBe('error');
+    });
+
+    it('returns INVALID_ENCODING error for UTF-16 BE', async () => {
+      const report = await checkDatasetQuality(utf16File('some content', false));
+      expect(report.hasErrors).toBe(true);
+      expect(report.issues[0].code).toBe('INVALID_ENCODING');
+    });
+
+    it('does not flag UTF-8 files', async () => {
+      const report = await check(jsonlLines([CHAT_ROW]));
+      expect(issueOf(report, 'INVALID_ENCODING')).toBeUndefined();
+    });
+  });
+
+  describe('empty file', () => {
+    it.each([
+      ['blank file', ''],
+      ['whitespace-only file', ' \n\t\n '],
+    ])('returns EMPTY_FILE error for %s', async (_label, content) => {
+      const report = await check(content);
+      expect(report.hasErrors).toBe(true);
+      expect(report.issues[0].code).toBe('EMPTY_FILE');
+    });
+  });
+
+  describe('JSON parsing', () => {
+    it('returns INVALID_JSON_LINES error for malformed lines', async () => {
+      const report = await check([JSON.stringify(QA_ROW), 'not-json', 'also bad'].join('\n'));
+      const issue = issueOf(report, 'INVALID_JSON_LINES');
+      expect(issue).toBeDefined();
+      expect(issue?.severity).toBe('error');
+      expect(issue?.count).toBe(2);
+      expect(issue?.affectedLines).toContain(2);
+      expect(issue?.affectedLines).toContain(3);
+    });
+
+    it('flags JSON arrays and scalars as invalid lines (not JSON objects)', async () => {
+      const report = await check('["not", "an", "object"]\n42');
+      expect(issueOf(report, 'INVALID_JSON_LINES')?.count).toBe(2);
+    });
+
+    it('has no parse error for valid JSONL', async () => {
+      const report = await check(jsonlLines([QA_ROW, { prompt: 'q2', completion: 'a2' }]));
+      expect(issueOf(report, 'INVALID_JSON_LINES')).toBeUndefined();
+    });
+  });
+
+  describe('schema detection', () => {
+    const knownSchemaRows: Array<[string, object]> = [
+      ['messages', CHAT_ROW],
+      ['prompt/completion', QA_ROW],
+      ['question/ideal_response', { question: 'q', ideal_response: 'a' }],
+    ];
+
+    it.each(knownSchemaRows)('no warning for %s schema', async (_label, row) => {
+      const report = await check(jsonlLines([row]));
+      expect(issueOf(report, 'UNKNOWN_SCHEMA')).toBeUndefined();
+    });
+
+    it('returns UNKNOWN_SCHEMA warning for unrecognized fields', async () => {
+      // None of these keys match known schema patterns (no messages, prompt, completion, etc.)
+      const report = await check(jsonlLines([{ topic: 'foo', category: 'bar', label: 1 }]));
+      const issue = issueOf(report, 'UNKNOWN_SCHEMA');
+      expect(issue).toBeDefined();
+      expect(issue?.severity).toBe('warning');
+    });
+  });
+
+  describe('null and empty fields', () => {
+    it('returns NULL_OR_EMPTY_FIELDS warning for null values', async () => {
+      const rows = [{ prompt: 'q', completion: null }, { prompt: 'q2', completion: 'a2' }];
+      const issue = issueOf(await check(jsonlLines(rows)), 'NULL_OR_EMPTY_FIELDS');
+      expect(issue).toBeDefined();
+      expect(issue?.count).toBe(1);
+      expect(issue?.affectedLines).toContain(1);
+    });
+
+    const emptyValueRows: Array<[string, object]> = [
+      ['empty string values', { prompt: '', completion: 'a' }],
+      ['empty arrays', { messages: [] }],
+    ];
+
+    it.each(emptyValueRows)('returns NULL_OR_EMPTY_FIELDS warning for %s', async (_label, row) => {
+      const report = await check(jsonlLines([row]));
+      expect(issueOf(report, 'NULL_OR_EMPTY_FIELDS')).toBeDefined();
+    });
+
+    it('no warning when all fields are populated', async () => {
+      const report = await check(jsonlLines([QA_ROW]));
+      expect(issueOf(report, 'NULL_OR_EMPTY_FIELDS')).toBeUndefined();
+    });
+  });
+
+  describe('long entries', () => {
+    it('returns LONG_ENTRIES warning when a line exceeds 32768 chars', async () => {
+      const oversizedRow = { prompt: 'x'.repeat(33_000), completion: 'a' };
+      const issue = issueOf(await check(jsonlLines([oversizedRow])), 'LONG_ENTRIES');
+      expect(issue).toBeDefined();
+      expect(issue?.severity).toBe('warning');
+      expect(issue?.affectedLines).toContain(1);
+    });
+
+    it('no warning for normal-length lines', async () => {
+      const row = { prompt: 'short question', completion: 'short answer' };
+      const report = await check(jsonlLines([row]));
+      expect(issueOf(report, 'LONG_ENTRIES')).toBeUndefined();
+    });
+  });
+
+  describe('line scanning limit', () => {
+    it('scans only first 1000 lines for large files', async () => {
+      const rows = Array.from({ length: 1500 }, (_, i) => ({
+        prompt: `q${i}`,
+        completion: `a${i}`,
+      }));
+      const report = await check(jsonlLines(rows));
+      expect(report.totalLines).toBe(1500);
+      expect(report.scannedLines).toBe(1000);
+    });
+  });
+
+  describe('report structure', () => {
+    it('sets hasErrors and hasWarnings correctly', async () => {
+      const report = await check(jsonlLines([QA_ROW]));
+      expect(report.hasErrors).toBe(false);
+      expect(report.hasWarnings).toBe(false);
+      expect(report.fileName).toBe('train.jsonl');
+    });
+
+    it('caps affectedLines at 10 entries', async () => {
+      const rows = Array.from({ length: 15 }, () => ({ prompt: '', completion: '' }));
+      const issue = issueOf(await check(jsonlLines(rows)), 'NULL_OR_EMPTY_FIELDS');
+      expect(issue?.affectedLines?.length).toBeLessThanOrEqual(10);
+      expect(issue?.count).toBe(15);
+    });
+  });
+});
diff --git a/web/packages/common/src/utils/datasetQuality.ts b/web/packages/common/src/utils/datasetQuality.ts
new file mode 100644
index 0000000000..7531d3440a
--- /dev/null
+++ 
b/web/packages/common/src/utils/datasetQuality.ts
@@ -0,0 +1,218 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { findMessagesArray } from '@nemo/common/src/utils/file';
+
+export type DatasetQualityCode =
+  | 'EMPTY_FILE'
+  | 'INVALID_ENCODING'
+  | 'INVALID_JSON_LINES'
+  | 'UNKNOWN_SCHEMA'
+  | 'NULL_OR_EMPTY_FIELDS'
+  | 'LONG_ENTRIES';
+
+export interface DatasetQualityIssue {
+  severity: 'error' | 'warning';
+  code: DatasetQualityCode;
+  message: string;
+  /** 1-based line numbers in the file, blank lines included in the numbering (first 10 only) */
+  affectedLines?: number[];
+  /** Total count of affected lines */
+  count?: number;
+}
+
+export interface DatasetQualityReport {
+  fileName: string;
+  hasErrors: boolean;
+  hasWarnings: boolean;
+  issues: DatasetQualityIssue[];
+  /** Number of lines actually scanned (may be less than totalLines for large files) */
+  scannedLines: number;
+  /** Number of non-blank lines in the file */
+  totalLines: number;
+}
+
+const MAX_SCAN_LINES = 1000;
+
+/** Maximum number of line numbers listed per issue */
+const MAX_AFFECTED_LINES = 10;
+
+/** ~8192 tokens at ~4 chars/token */
+const LONG_ENTRY_CHAR_THRESHOLD = 32_768;
+
+const PROMPT_KEYS = ['prompt', 'question'];
+const COMPLETION_KEYS = ['completion', 'ideal_response', 'response', 'output', 'answer'];
+
+/** A non-blank line together with its 1-based position in the original file. */
+interface NumberedLine {
+  lineNum: number;
+  content: string;
+}
+
+/** Formats a count with its noun, appending "s" unless the count is exactly 1. */
+function plural(n: number, word: string): string {
+  return `${n} ${word}${n === 1 ? '' : 's'}`;
+}
+
+/** True for JSON objects; false for arrays, scalars and null. */
+function isJsonObject(value: unknown): value is Record<string, unknown> {
+  return value !== null && typeof value === 'object' && !Array.isArray(value);
+}
+
+/** True when a top-level field value is null, an empty string, or an empty array. */
+function isNullOrEmpty(value: unknown): boolean {
+  return value === null || value === '' || (Array.isArray(value) && value.length === 0);
+}
+
+/** Builds the report returned when a single error ends the scan before any line is read. */
+function fatalReport(fileName: string, issue: DatasetQualityIssue): DatasetQualityReport {
+  return {
+    fileName,
+    hasErrors: true,
+    hasWarnings: false,
+    issues: [issue],
+    scannedLines: 0,
+    totalLines: 0,
+  };
+}
+
+/**
+ * Runs dataset quality checks on a JSONL file and returns a structured report.
+ * Errors indicate the file should not be uploaded as-is; warnings are advisory.
+ *
+ * Checks performed:
+ * - UTF-16 BOM detection (error)
+ * - Empty file (error)
+ * - Invalid JSON on any line (error)
+ * - Unknown fine-tuning schema — no messages or prompt/completion fields (warning)
+ * - Null or empty field values (warning)
+ * - Lines exceeding estimated context window (~8192 tokens) (warning)
+ *
+ * Blank lines are ignored, but reported line numbers are positions in the original
+ * file, so they can be looked up directly in an editor.
+ *
+ * For files with more than 1000 non-blank lines, only the first 1000 are scanned.
+ *
+ * @param file - The selected file; read through its `arrayBuffer()` and `text()` methods.
+ * @returns A report whose issues are ordered: JSON errors, schema, null/empty, long entries.
+ */
+export async function checkDatasetQuality(file: File): Promise<DatasetQualityReport> {
+  // 1. Encoding — detect UTF-16 via BOM before reading as text
+  const buffer = await file.arrayBuffer();
+  const bytes = new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
+  const isUtf16Le = bytes[0] === 0xff && bytes[1] === 0xfe;
+  const isUtf16Be = bytes[0] === 0xfe && bytes[1] === 0xff;
+  if (isUtf16Le || isUtf16Be) {
+    return fatalReport(file.name, {
+      severity: 'error',
+      code: 'INVALID_ENCODING',
+      message: 'File is UTF-16 encoded. Re-save as UTF-8 before uploading.',
+    });
+  }
+
+  // 2. Read text and keep non-blank lines, remembering each one's position in the file.
+  // A leading UTF-8 BOM is dropped defensively because JSON.parse does not accept it.
+  const text = (await file.text()).replace(/^\uFEFF/, '');
+  const nonBlankLines: NumberedLine[] = [];
+  text.split(/\r?\n/).forEach((content, index) => {
+    if (content.trim().length > 0) {
+      nonBlankLines.push({ lineNum: index + 1, content });
+    }
+  });
+  const totalLines = nonBlankLines.length;
+
+  if (totalLines === 0) {
+    return fatalReport(file.name, {
+      severity: 'error',
+      code: 'EMPTY_FILE',
+      message: 'File is empty or contains only whitespace.',
+    });
+  }
+
+  const scanLines = nonBlankLines.slice(0, MAX_SCAN_LINES);
+  const scannedLines = scanLines.length;
+  const issues: DatasetQualityIssue[] = [];
+
+  // 3. Parse each line — collect invalid and valid rows separately
+  const invalidLineNums: number[] = [];
+  const parsedRows: Array<{ lineNum: number; row: Record<string, unknown> }> = [];
+
+  for (const { lineNum, content } of scanLines) {
+    let parsed: unknown;
+    try {
+      parsed = JSON.parse(content);
+    } catch {
+      invalidLineNums.push(lineNum);
+      continue;
+    }
+    if (isJsonObject(parsed)) {
+      parsedRows.push({ lineNum, row: parsed });
+    } else {
+      // Valid JSON but not an object (array, scalar, null) — not a valid JSONL dataset row
+      invalidLineNums.push(lineNum);
+    }
+  }
+
+  if (invalidLineNums.length > 0) {
+    issues.push({
+      severity: 'error',
+      code: 'INVALID_JSON_LINES',
+      message: `${plural(invalidLineNums.length, 'line')} could not be parsed as JSON objects.`,
+      affectedLines: invalidLineNums.slice(0, MAX_AFFECTED_LINES),
+      count: invalidLineNums.length,
+    });
+  }
+
+  if (parsedRows.length > 0) {
+    // 4. Schema detection on the first valid row.
+    // NOTE(review): a prompt-like key alone or a completion-like key alone passes; confirm intent.
+    const firstRow = parsedRows[0].row;
+    const hasMessagesSchema = findMessagesArray(firstRow) !== null;
+    const hasPromptCompletionSchema =
+      PROMPT_KEYS.some((k) => k in firstRow) || COMPLETION_KEYS.some((k) => k in firstRow);
+
+    if (!hasMessagesSchema && !hasPromptCompletionSchema) {
+      issues.push({
+        severity: 'warning',
+        code: 'UNKNOWN_SCHEMA',
+        message:
+          'No recognized fine-tuning schema detected. Expected a messages array or prompt/completion fields.',
+      });
+    }
+
+    // 5. Null or empty top-level field values across all valid rows
+    const nullFieldLines = parsedRows
+      .filter(({ row }) => Object.values(row).some(isNullOrEmpty))
+      .map(({ lineNum }) => lineNum);
+    if (nullFieldLines.length > 0) {
+      const verb = nullFieldLines.length === 1 ? 'contains' : 'contain';
+      issues.push({
+        severity: 'warning',
+        code: 'NULL_OR_EMPTY_FIELDS',
+        message: `${plural(nullFieldLines.length, 'row')} ${verb} null or empty field values.`,
+        affectedLines: nullFieldLines.slice(0, MAX_AFFECTED_LINES),
+        count: nullFieldLines.length,
+      });
+    }
+  }
+
+  // 6. Long entries — rough token estimate via character count. Checked for every scanned
+  // line, parsed or not, so the warning is not lost when no row is a valid object.
+  const longLines = scanLines
+    .filter(({ content }) => content.length > LONG_ENTRY_CHAR_THRESHOLD)
+    .map(({ lineNum }) => lineNum);
+  if (longLines.length > 0) {
+    issues.push({
+      severity: 'warning',
+      code: 'LONG_ENTRIES',
+      message: `${plural(longLines.length, 'row')} may exceed the model's context window (~8,192 tokens).`,
+      affectedLines: longLines.slice(0, MAX_AFFECTED_LINES),
+      count: longLines.length,
+    });
+  }
+
+  const hasErrors = issues.some((i) => i.severity === 'error');
+  const hasWarnings = issues.some((i) => i.severity === 'warning');
+
+  return { fileName: file.name, hasErrors, hasWarnings, issues, scannedLines, totalLines };
+}
diff --git a/web/packages/studio/src/routes/FilesetNewRoute/index.spec.tsx b/web/packages/studio/src/routes/FilesetNewRoute/index.spec.tsx
index 961b780481..75872df048 100644
--- a/web/packages/studio/src/routes/FilesetNewRoute/index.spec.tsx
+++ b/web/packages/studio/src/routes/FilesetNewRoute/index.spec.tsx
@@ -1,7 +1,27 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 +import type { DatasetQualityReport } from '@nemo/common/src/utils/datasetQuality'; import { ROUTE_PARAMS } from '@studio/constants/routes'; + +vi.mock('@nemo/common/src/utils/datasetQuality', () => ({ + checkDatasetQuality: vi.fn(), +})); + +import { checkDatasetQuality } from '@nemo/common/src/utils/datasetQuality'; +const mockCheckDatasetQuality = vi.mocked(checkDatasetQuality); + +function makeQualityReport(overrides: Partial = {}): DatasetQualityReport { + return { + fileName: 'train.jsonl', + hasErrors: false, + hasWarnings: false, + issues: [], + scannedLines: 10, + totalLines: 10, + ...overrides, + }; +} import { workspace1 } from '@studio/mocks/entity-store/projects'; import { FilesetNewRoute } from '@studio/routes/FilesetNewRoute'; import { mockUseNavigate, mockUseParams } from '@studio/tests/util/mockUseParams'; @@ -179,4 +199,151 @@ describe('FilesetNewRoute', () => { expect(screen.queryByText(/must start with a lowercase letter/i)).not.toBeInTheDocument(); }); + + describe('dataset quality validation', () => { + beforeEach(() => { + mockCheckDatasetQuality.mockReset(); + }); + + function makeJsonlFile(name = 'train.jsonl'): File { + return new File(['{"prompt":"q","completion":"a"}'], name, { + type: 'application/x-jsonlines', + }); + } + + async function uploadFile(user: ReturnType, file: File) { + const input = document.querySelector('input[type="file"]') as HTMLInputElement; + await user.upload(input, file); + } + + it('shows quality report after uploading a JSONL file when purpose is Dataset', async () => { + mockCheckDatasetQuality.mockResolvedValue( + makeQualityReport({ hasErrors: false, hasWarnings: false }) + ); + const user = userEvent.setup(); + renderRoute(); + + await uploadFile(user, makeJsonlFile()); + + expect(await screen.findByText(/all quality checks passed/i)).toBeInTheDocument(); + }); + + it('shows error issues from the quality report', async () => { + 
mockCheckDatasetQuality.mockResolvedValue( + makeQualityReport({ + hasErrors: true, + issues: [ + { + severity: 'error', + code: 'INVALID_JSON_LINES', + message: '2 lines could not be parsed as JSON objects.', + affectedLines: [3, 7], + count: 2, + }, + ], + }) + ); + const user = userEvent.setup(); + renderRoute(); + + await uploadFile(user, makeJsonlFile()); + + expect( + await screen.findByText(/2 lines could not be parsed as JSON objects/i) + ).toBeInTheDocument(); + }); + + it('shows warning issues from the quality report', async () => { + mockCheckDatasetQuality.mockResolvedValue( + makeQualityReport({ + hasWarnings: true, + issues: [ + { + severity: 'warning', + code: 'UNKNOWN_SCHEMA', + message: 'No recognized fine-tuning schema detected.', + }, + ], + }) + ); + const user = userEvent.setup(); + renderRoute(); + + await uploadFile(user, makeJsonlFile()); + + expect(await screen.findByText(/No recognized fine-tuning schema detected/i)).toBeInTheDocument(); + }); + + it('disables the Create Fileset button when quality report has errors', async () => { + mockCheckDatasetQuality.mockResolvedValue( + makeQualityReport({ + hasErrors: true, + issues: [{ severity: 'error', code: 'EMPTY_FILE', message: 'File is empty.' }], + }) + ); + const user = userEvent.setup(); + renderRoute(); + + await uploadFile(user, makeJsonlFile()); + + await screen.findByText(/File is empty/i); + expect(await screen.findByRole('button', { name: 'Create Fileset' })).toBeDisabled(); + }); + + it('does not disable submit for warning-only reports', async () => { + mockCheckDatasetQuality.mockResolvedValue( + makeQualityReport({ + hasWarnings: true, + issues: [{ severity: 'warning', code: 'LONG_ENTRIES', message: '1 row may exceed context window.' 
}], + }) + ); + const user = userEvent.setup(); + renderRoute(); + + await uploadFile(user, makeJsonlFile()); + + await screen.findByText(/1 row may exceed context window/i); + expect(await screen.findByRole('button', { name: 'Create Fileset' })).not.toBeDisabled(); + }); + + it('does not show quality report section when purpose is not Dataset', async () => { + mockCheckDatasetQuality.mockResolvedValue(makeQualityReport()); + const user = userEvent.setup(); + renderRoute(); + + // Switch to Generic purpose + await user.click(await screen.findByRole('radio', { name: 'Generic' })); + await uploadFile(user, makeJsonlFile()); + + expect(screen.queryByText(/all quality checks passed/i)).not.toBeInTheDocument(); + expect(mockCheckDatasetQuality).not.toHaveBeenCalled(); + }); + + it('clears quality reports when switching to the Sample Dataset tab', async () => { + mockCheckDatasetQuality.mockResolvedValue( + makeQualityReport({ hasErrors: false, hasWarnings: false }) + ); + const user = userEvent.setup(); + renderRoute(); + + await uploadFile(user, makeJsonlFile()); + await screen.findByText(/all quality checks passed/i); + + await user.click(await screen.findByText('Sample Dataset')); + + expect(screen.queryByText(/all quality checks passed/i)).not.toBeInTheDocument(); + }); + + it('shows scanned-lines note when file has more lines than the scan limit', async () => { + mockCheckDatasetQuality.mockResolvedValue( + makeQualityReport({ scannedLines: 1000, totalLines: 5000 }) + ); + const user = userEvent.setup(); + renderRoute(); + + await uploadFile(user, makeJsonlFile()); + + expect(await screen.findByText(/Scanned first 1,000 of 5,000 lines/i)).toBeInTheDocument(); + }); + }); }); diff --git a/web/packages/studio/src/routes/FilesetNewRoute/index.tsx b/web/packages/studio/src/routes/FilesetNewRoute/index.tsx index 74b252975a..4f6af8f027 100644 --- a/web/packages/studio/src/routes/FilesetNewRoute/index.tsx +++ b/web/packages/studio/src/routes/FilesetNewRoute/index.tsx @@ 
-8,6 +8,8 @@ import { RadioCard } from '@nemo/common/src/components/RadioCard'; import { getEntityReference } from '@nemo/common/src/namedEntity'; import { useToast } from '@nemo/common/src/providers/toast/useToast'; import { FILESET_NAME_MAX_LENGTH, FILESET_NAME_REGEXP } from '@nemo/common/src/utils/filesetName'; +import { checkDatasetQuality } from '@nemo/common/src/utils/datasetQuality'; +import type { DatasetQualityReport } from '@nemo/common/src/utils/datasetQuality'; import { filesUploadFile, getFilesListFilesetFilesQueryKey, @@ -59,7 +61,7 @@ import { storageConfigFromUrl, } from '@studio/util/storageConfigFromUrl'; import { QueryObserverResult, useQueryClient } from '@tanstack/react-query'; -import { FileCheck } from 'lucide-react'; +import { AlertTriangle, FileCheck, XCircle, CheckCircle2 } from 'lucide-react'; import { FC, useCallback, useMemo, useRef, useState } from 'react'; import { Controller, useForm } from 'react-hook-form'; import { useNavigate } from 'react-router-dom'; @@ -168,6 +170,59 @@ function toFileList(value: unknown): File[] { ); } +interface DatasetQualityReportViewProps { + report: DatasetQualityReport; +} + +const DatasetQualityReportView: FC = ({ report }) => { + const partialScanNote = report.scannedLines < report.totalLines && ( + + Scanned first {report.scannedLines.toLocaleString()} of{' '} + {report.totalLines.toLocaleString()} lines. + + ); + + if (!report.hasErrors && !report.hasWarnings) { + return ( + + + + {report.fileName}: all quality checks passed. + + {partialScanNote} + + ); + } + + return ( + + {report.fileName} + {report.issues.map((issue, idx) => ( + + {issue.severity === 'error' ? ( + + ) : ( + + )} + + {issue.message} + {issue.affectedLines && issue.affectedLines.length > 0 && ( + + {'Line' + (issue.affectedLines.length > 1 ? 's' : '') + ': '} + {issue.affectedLines.join(', ')} + {issue.count && issue.count > issue.affectedLines.length + ? 
` (+${issue.count - issue.affectedLines.length} more)` + : ''} + + )} + + + ))} + {partialScanNote} + + ); +}; + export const FilesetNewRoute: FC = () => { const workspace = useWorkspaceFromPath(); const [activeTab, setActiveTab] = useState(DATASET_TYPE_CUSTOM); @@ -175,6 +230,8 @@ export const FilesetNewRoute: FC = () => { SAMPLE_DATASETS[0] ); const [isSubmitPending, setIsSubmitPending] = useState(false); + const [qualityReports, setQualityReports] = useState([]); + const [isValidating, setIsValidating] = useState(false); const navigate = useNavigate(); const toast = useToast(); const queryClient = useQueryClient(); @@ -205,6 +262,7 @@ export const FilesetNewRoute: FC = () => { }); const url = watch('url'); + const purpose = watch('purpose'); const selectedSecretName = watch('secretKey'); const secretKeyLabel = useMemo(() => { if (!url?.trim()) return 'Secret Key'; @@ -272,6 +330,33 @@ export const FilesetNewRoute: FC = () => { [] ); + /** + * Runs dataset quality checks on newly selected JSONL files and updates the report state. + * Only runs when purpose is 'dataset'; clears reports for other purposes or non-JSONL files. 
+ */ + const handleFilesChange = useCallback( + async (files: File[]) => { + setValue('files', files, { shouldValidate: false }); + + if (purpose !== FilesetPurpose.dataset) { + setQualityReports([]); + return; + } + + const jsonlFiles = files.filter((f) => f.name.endsWith('.jsonl')); + if (jsonlFiles.length === 0) { + setQualityReports([]); + return; + } + + setIsValidating(true); + const reports = await Promise.all(jsonlFiles.map(checkDatasetQuality)); + setQualityReports(reports); + setIsValidating(false); + }, + [purpose, setValue] + ); + // Sync hidden name/description when a sample is selected (sample tab = simulated local form) const handleSelectSample = useCallback( (dataset: SampleDataset) => { @@ -282,10 +367,11 @@ export const FilesetNewRoute: FC = () => { [workspace, setValue] ); - // When switching tabs, reset the opposite tab’s form state so we don’t leak values + // When switching tabs, reset the opposite tab's form state so we don't leak values const handleTabChange = useCallback( (value: DatasetType) => { setActiveTab(value); + setQualityReports([]); if (value === DATASET_TYPE_CUSTOM) { setValue('name', '', { shouldValidate: false }); setValue('description', '', { shouldValidate: false }); @@ -311,6 +397,8 @@ export const FilesetNewRoute: FC = () => { ] ); + const hasValidationErrors = qualityReports.some((r) => r.hasErrors); + const onSubmit = useCallback( async (data: DatasetFormFields) => { const { success, error } = DatasetCreateFilesetFormSchema.safeParse(data); @@ -319,6 +407,11 @@ export const FilesetNewRoute: FC = () => { return; } + if (hasValidationErrors) { + toast.error('Fix dataset validation errors before creating this fileset.'); + return; + } + setIsSubmitPending(true); // Step 1 (sample only): fetch sample files via lazy query @@ -427,6 +520,7 @@ export const FilesetNewRoute: FC = () => { activeTab, createFilesetStep, getValues, + hasValidationErrors, navigate, storageTab, toast, @@ -464,7 +558,7 @@ export const FilesetNewRoute: FC 
= () => {