Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -321,19 +321,21 @@ def discover_dataset_files(dataset_path: Path) -> tuple[list[Path], list[Path]]:
# Discover validation files
val_files = _discover_files_by_patterns(dataset_path, VAL_PATTERNS, VAL_DIRS)

# Fallback: if no files found with patterns, check for any .jsonl files
# Fallback: if no files found with patterns, check for any .jsonl/.json files
if not train_files and not val_files:
all_jsonl = sorted(f for f in dataset_path.glob("*.jsonl") if f.is_file())
if len(all_jsonl) == 1:
logger.info(f"Found single JSONL file, treating as training data: {all_jsonl[0]}")
train_files = all_jsonl
elif len(all_jsonl) > 1:
all_data = sorted(
f for f in dataset_path.iterdir() if f.is_file() and f.suffix.lower() in (".jsonl", ".json")
)
if len(all_data) == 1:
logger.info(f"Found single JSON/JSONL file, treating as training data: {all_data[0]}")
train_files = all_data
elif len(all_data) > 1:
# Ambiguous - could be train/val or multiple training files
logger.warning(
f"Found {len(all_jsonl)} JSONL files without clear train/val naming. "
f"Treating all as training data: {[f.name for f in all_jsonl]}"
f"Found {len(all_data)} JSON/JSONL files without clear train/val naming. "
f"Treating all as training data: {[f.name for f in all_data]}"
)
train_files = all_jsonl
train_files = all_data

if not train_files:
raise DatasetFormatError(
Expand Down
34 changes: 34 additions & 0 deletions services/customizer/tests/tasks/training/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,40 @@ def test_fallback_single_jsonl_as_training(self, dataset_dir: Path):
assert len(train_files) == 1
assert len(val_files) == 0

def test_fallback_lone_json_as_training(self, dataset_dir: Path):
    """A single root ``.json`` file with no train/val naming is used as training data."""
    _write_jsonl(dataset_dir / "data.json", [{"a": 1}])

    train_files, val_files = discover_dataset_files(dataset_dir)

    # Exactly one training file, and it is the file written above.
    assert [found.name for found in train_files] == ["data.json"]
    # Nothing may be classified as validation data.
    assert not val_files

def test_fallback_mixed_json_and_jsonl_both_claimed(self, dataset_dir: Path):
    """Root ``.jsonl`` and ``.json`` files without a train/val pattern are both claimed as training."""
    # Insertion order matches the write order: the .jsonl file first, then the .json file.
    fixtures = {
        "my_data.jsonl": [{"a": 1}],
        "extra.json": [{"a": 2}],
    }
    for filename, rows in fixtures.items():
        _write_jsonl(dataset_dir / filename, rows)

    train_files, val_files = discover_dataset_files(dataset_dir)

    # Both files, and only those two, end up in the training list (order-insensitive).
    assert sorted(found.name for found in train_files) == ["extra.json", "my_data.jsonl"]
    assert not val_files

def test_fallback_multiple_json_all_claimed_as_training(self, dataset_dir: Path):
    """Multiple root .json files (no pattern) → all claimed as training.

    Checks both the number of training files and their names, so a result
    of the right length but with the wrong files cannot pass. The warning
    expected in this ambiguous case is not asserted by this test.
    """
    names = ("a.json", "b.json", "c.json")
    for name in names:
        _write_jsonl(dataset_dir / name, [{"a": 1}])

    train_files, val_files = discover_dataset_files(dataset_dir)

    assert len(train_files) == 3
    # Same name check as the sibling fallback tests: every written file is claimed.
    assert {f.name for f in train_files} == set(names)
    assert len(val_files) == 0

def test_raises_when_no_files_found(self, dataset_dir: Path):
"""Test error when no training files found."""
with pytest.raises(DatasetFormatError, match="No training files found"):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,16 @@ describe('partitionDatasetFiles', () => {
expect(paths(result.unmatchedRootJsonl).sort()).toEqual(['a.jsonl', 'b.jsonl']);
});

it('does NOT park .json (non-jsonl) root files in the unmatched bucket', () => {
// The lone-root fallback is .jsonl-only; .json without train/val pattern is ignored.
it('parks unmatched root .json files in unmatchedRootJsonl (matches customizer fallback)', () => {
const result = partitionDatasetFiles([file('thing.json')]);
expect(result.unmatchedRootJsonl).toEqual([]);
expect(paths(result.unmatchedRootJsonl)).toEqual(['thing.json']);
});

it('parks mixed unmatched root .json and .jsonl together in unmatchedRootJsonl', () => {
const result = partitionDatasetFiles([file('a.jsonl'), file('b.json')]);
expect(result.training).toEqual([]);
expect(result.validation).toEqual([]);
expect(paths(result.unmatchedRootJsonl).sort()).toEqual(['a.jsonl', 'b.json']);
});

it('ignores files with non-dataset extensions', () => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ export const partitionDatasetFiles = (files: FilesetFileOutput[]): PartitionedFi
training.push(f);
} else if (matchesAnyPattern(filename, validationRule.filePatterns)) {
validation.push(f);
} else if (filename.endsWith('.jsonl')) {
// Only .jsonl is eligible for the lone-root fallback (matches customizer).
} else {
// Both .jsonl and .json are eligible for the lone-root fallback (matches customizer).
unmatchedRootJsonl.push(f);
}
}
Expand Down
Loading