NVIDIA-NeMo · aray12 · Jun 11, 2026
@@ -321,19 +321,21 @@ def discover_dataset_files(dataset_path: Path) -> tuple[list[Path], list[Path]]:
     # Discover validation files
     val_files = _discover_files_by_patterns(dataset_path, VAL_PATTERNS, VAL_DIRS)
 
-    # Fallback: if no files found with patterns, check for any .jsonl files
+    # Fallback: nothing matched the patterns, so take any top-level .jsonl/.json file (case-insensitive suffix)
     if not train_files and not val_files:
-        all_jsonl = sorted(f for f in dataset_path.glob("*.jsonl") if f.is_file())
-        if len(all_jsonl) == 1:
-            logger.info(f"Found single JSONL file, treating as training data: {all_jsonl[0]}")
-            train_files = all_jsonl
-        elif len(all_jsonl) > 1:
+        all_data = sorted(
+            f for f in dataset_path.iterdir() if f.is_file() and f.suffix.lower() in (".jsonl", ".json")
+        )
+        if len(all_data) == 1:
+            logger.info(f"Found single JSON/JSONL file, treating as training data: {all_data[0]}")
+            train_files = all_data
+        elif len(all_data) > 1:
             # Ambiguous - could be train/val or multiple training files
             logger.warning(
-                f"Found {len(all_jsonl)} JSONL files without clear train/val naming. "
-                f"Treating all as training data: {[f.name for f in all_jsonl]}"
+                f"Found {len(all_data)} JSON/JSONL files without clear train/val naming. "
+                f"Treating all as training data: {[f.name for f in all_data]}"
             )
-            train_files = all_jsonl
+            train_files = all_data
 
     if not train_files:
         raise DatasetFormatError(

@@ -315,6 +315,40 @@ def test_fallback_single_jsonl_as_training(self, dataset_dir: Path):
         assert len(train_files) == 1
         assert len(val_files) == 0
 
+    def test_fallback_lone_json_as_training(self, dataset_dir: Path):
+        """A single root .json file with no train/val naming is returned as the only training file."""
+        file_path = dataset_dir / "data.json"
+        _write_jsonl(file_path, [{"a": 1}])
+
+        train_files, val_files = discover_dataset_files(dataset_dir)
+
+        assert len(train_files) == 1
+        assert train_files[0].name == "data.json"
+        assert len(val_files) == 0
+
+    def test_fallback_mixed_json_and_jsonl_both_claimed(self, dataset_dir: Path):
+        """Unpatterned root .jsonl and .json files side by side are both returned as training files."""
+        jsonl_file = dataset_dir / "my_data.jsonl"
+        json_file = dataset_dir / "extra.json"
+        _write_jsonl(jsonl_file, [{"a": 1}])
+        _write_jsonl(json_file, [{"a": 2}])
+
+        train_files, val_files = discover_dataset_files(dataset_dir)
+
+        assert len(train_files) == 2
+        assert {f.name for f in train_files} == {"my_data.jsonl", "extra.json"}
+        assert len(val_files) == 0
+
+    def test_fallback_multiple_json_all_claimed_as_training(self, dataset_dir: Path):
+        """Root .json files without train/val naming are all returned as training (warning not asserted)."""
+        for name in ("a.json", "b.json", "c.json"):
+            _write_jsonl(dataset_dir / name, [{"a": 1}])
+
+        train_files, val_files = discover_dataset_files(dataset_dir)
+
+        assert len(train_files) == 3
+        assert len(val_files) == 0
+
     def test_raises_when_no_files_found(self, dataset_dir: Path):
         """Test error when no training files found."""
         with pytest.raises(DatasetFormatError, match="No training files found"):

@@ -78,10 +78,16 @@ describe('partitionDatasetFiles', () => {
     expect(paths(result.unmatchedRootJsonl).sort()).toEqual(['a.jsonl', 'b.jsonl']);
   });
 
-  it('does NOT park .json (non-jsonl) root files in the unmatched bucket', () => {
-    // The lone-root fallback is .jsonl-only; .json without train/val pattern is ignored.
+  it('parks unmatched root .json files in unmatchedRootJsonl (matches customizer fallback)', () => {
     const result = partitionDatasetFiles([file('thing.json')]);
-    expect(result.unmatchedRootJsonl).toEqual([]);
+    expect(paths(result.unmatchedRootJsonl)).toEqual(['thing.json']);
+  });
+
+  it('parks mixed unmatched root .json and .jsonl together in unmatchedRootJsonl', () => {
+    const result = partitionDatasetFiles([file('a.jsonl'), file('b.json')]);
+    expect(result.training).toEqual([]);
+    expect(result.validation).toEqual([]);
+    expect(paths(result.unmatchedRootJsonl).sort()).toEqual(['a.jsonl', 'b.json']);
   });
 
   it('ignores files with non-dataset extensions', () => {

@@ -64,8 +64,8 @@ export const partitionDatasetFiles = (files: FilesetFileOutput[]): PartitionedFi
       training.push(f);
     } else if (matchesAnyPattern(filename, validationRule.filePatterns)) {
       validation.push(f);
-    } else if (filename.endsWith('.jsonl')) {
-      // Only .jsonl is eligible for the lone-root fallback (matches customizer).
+    } else {
+      // Anything reaching here is parked for the lone-root fallback: .json as well as .jsonl (matches customizer).
       unmatchedRootJsonl.push(f);
     }
   }