From 1048bc5261e38a4e896b5fd0f96eb769c8b030ca Mon Sep 17 00:00:00 2001 From: Alex Ray Date: Thu, 11 Jun 2026 13:46:23 -0700 Subject: [PATCH] fix(customizer): lone-root fallback now accepts .json in addition to .jsonl The discover_dataset_files fallback glob was .jsonl-only, while every other discovery path (TRAIN_PATTERNS, VAL_PATTERNS, subdir walk) already treated .json and .jsonl as equivalent. A user uploading data.json at the fileset root with no train/val pattern in the name would hit a DatasetFormatError even though the file is valid. Fixes the fallback to use iterdir() with suffix check instead of glob("*.jsonl"), and mirrors the change in Studio's partitionDatasetFiles so the UI stays in sync with customizer's discovery rules. Closes ASTD-109 Signed-off-by: Alex Ray --- .../tasks/training/datasets/preparation.py | 20 ++++++----- .../tests/tasks/training/test_datasets.py | 34 +++++++++++++++++++ .../useDatasetFileDiscovery/index.spec.ts | 12 +++++-- .../hooks/useDatasetFileDiscovery/index.ts | 4 +-- 4 files changed, 56 insertions(+), 14 deletions(-) diff --git a/services/customizer/src/nmp/customizer/tasks/training/datasets/preparation.py b/services/customizer/src/nmp/customizer/tasks/training/datasets/preparation.py index ab68b4eff6..695fefb656 100644 --- a/services/customizer/src/nmp/customizer/tasks/training/datasets/preparation.py +++ b/services/customizer/src/nmp/customizer/tasks/training/datasets/preparation.py @@ -321,19 +321,21 @@ def discover_dataset_files(dataset_path: Path) -> tuple[list[Path], list[Path]]: # Discover validation files val_files = _discover_files_by_patterns(dataset_path, VAL_PATTERNS, VAL_DIRS) - # Fallback: if no files found with patterns, check for any .jsonl files + # Fallback: if no files found with patterns, check for any .jsonl/.json files if not train_files and not val_files: - all_jsonl = sorted(f for f in dataset_path.glob("*.jsonl") if f.is_file()) - if len(all_jsonl) == 1: - logger.info(f"Found single JSONL file, treating as training data: {all_jsonl[0]}") - train_files = all_jsonl - elif len(all_jsonl) > 1: + all_data = sorted( + f for f in dataset_path.iterdir() if f.is_file() and f.suffix.lower() in (".jsonl", ".json") + ) + if len(all_data) == 1: + logger.info(f"Found single JSON/JSONL file, treating as training data: {all_data[0]}") + train_files = all_data + elif len(all_data) > 1: # Ambiguous - could be train/val or multiple training files logger.warning( - f"Found {len(all_jsonl)} JSONL files without clear train/val naming. " - f"Treating all as training data: {[f.name for f in all_jsonl]}" + f"Found {len(all_data)} JSON/JSONL files without clear train/val naming. " + f"Treating all as training data: {[f.name for f in all_data]}" ) - train_files = all_jsonl + train_files = all_data if not train_files: raise DatasetFormatError( diff --git a/services/customizer/tests/tasks/training/test_datasets.py b/services/customizer/tests/tasks/training/test_datasets.py index f3021c0761..d8d59ccda2 100644 --- a/services/customizer/tests/tasks/training/test_datasets.py +++ b/services/customizer/tests/tasks/training/test_datasets.py @@ -315,6 +315,40 @@ def test_fallback_single_jsonl_as_training(self, dataset_dir: Path): assert len(train_files) == 1 assert len(val_files) == 0 + def test_fallback_lone_json_as_training(self, dataset_dir: Path): + """Lone root .json file (no train/val pattern) is claimed as training.""" + file_path = dataset_dir / "data.json" + _write_jsonl(file_path, [{"a": 1}]) + + train_files, val_files = discover_dataset_files(dataset_dir) + + assert len(train_files) == 1 + assert train_files[0].name == "data.json" + assert len(val_files) == 0 + + def test_fallback_mixed_json_and_jsonl_both_claimed(self, dataset_dir: Path): + """Lone root .jsonl + .json (mixed, no pattern) → both claimed as training.""" + jsonl_file = dataset_dir / "my_data.jsonl" + json_file = dataset_dir / "extra.json" + _write_jsonl(jsonl_file, [{"a": 1}]) + _write_jsonl(json_file, [{"a": 2}]) + + train_files, val_files = discover_dataset_files(dataset_dir) + + assert len(train_files) == 2 + assert {f.name for f in train_files} == {"my_data.jsonl", "extra.json"} + assert len(val_files) == 0 + + def test_fallback_multiple_json_all_claimed_as_training(self, dataset_dir: Path): + """Multiple root .json files (no pattern) → all claimed as training with warning.""" + for name in ("a.json", "b.json", "c.json"): + _write_jsonl(dataset_dir / name, [{"a": 1}]) + + train_files, val_files = discover_dataset_files(dataset_dir) + + assert len(train_files) == 3 + assert len(val_files) == 0 + def test_raises_when_no_files_found(self, dataset_dir: Path): """Test error when no training files found.""" with pytest.raises(DatasetFormatError, match="No training files found"): diff --git a/web/packages/studio/src/hooks/useDatasetFileDiscovery/index.spec.ts b/web/packages/studio/src/hooks/useDatasetFileDiscovery/index.spec.ts index d62c1fb924..233fd7b911 100644 --- a/web/packages/studio/src/hooks/useDatasetFileDiscovery/index.spec.ts +++ b/web/packages/studio/src/hooks/useDatasetFileDiscovery/index.spec.ts @@ -78,10 +78,16 @@ describe('partitionDatasetFiles', () => { expect(paths(result.unmatchedRootJsonl).sort()).toEqual(['a.jsonl', 'b.jsonl']); }); - it('does NOT park .json (non-jsonl) root files in the unmatched bucket', () => { - // The lone-root fallback is .jsonl-only; .json without train/val pattern is ignored. + it('parks unmatched root .json files in unmatchedRootJsonl (matches customizer fallback)', () => { const result = partitionDatasetFiles([file('thing.json')]); - expect(result.unmatchedRootJsonl).toEqual([]); + expect(paths(result.unmatchedRootJsonl)).toEqual(['thing.json']); + }); + + it('parks mixed unmatched root .json and .jsonl together in unmatchedRootJsonl', () => { + const result = partitionDatasetFiles([file('a.jsonl'), file('b.json')]); + expect(result.training).toEqual([]); + expect(result.validation).toEqual([]); + expect(paths(result.unmatchedRootJsonl).sort()).toEqual(['a.jsonl', 'b.json']); }); it('ignores files with non-dataset extensions', () => { diff --git a/web/packages/studio/src/hooks/useDatasetFileDiscovery/index.ts b/web/packages/studio/src/hooks/useDatasetFileDiscovery/index.ts index 79483c0523..00b2efb4d3 100644 --- a/web/packages/studio/src/hooks/useDatasetFileDiscovery/index.ts +++ b/web/packages/studio/src/hooks/useDatasetFileDiscovery/index.ts @@ -64,8 +64,8 @@ export const partitionDatasetFiles = (files: FilesetFileOutput[]): PartitionedFi training.push(f); } else if (matchesAnyPattern(filename, validationRule.filePatterns)) { validation.push(f); - } else if (filename.endsWith('.jsonl')) { - // Only .jsonl is eligible for the lone-root fallback (matches customizer). + } else { + // Both .jsonl and .json are eligible for the lone-root fallback (matches customizer). unmatchedRootJsonl.push(f); } }