From 2fe0ab3093c8df38100fd1f979396f918d6c4f5c Mon Sep 17 00:00:00 2001
From: Arturo Fredes
Date: Thu, 19 Mar 2026 09:14:29 +0000
Subject: [PATCH] fix(synthetic): align _SyntheticTextExamplesIterable with HuggingFace datasets API

- Add n_shards property (alias of num_shards). HuggingFace IterableDataset expects n_shards; without it, loading synthetic data raises NotImplementedError.
- Change shard_data_sources(worker_id, num_workers) to match the base _BaseExamplesIterable API; the previous signature (num_shards, index, contiguous) caused 'unexpected keyword argument worker_id' when using multiple DataLoader workers.

Fixes synthetic data loader when used with datasets.IterableDataset.

Signed-off-by: Arturo Fredes
---
 src/guidellm/data/deserializers/synthetic.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/guidellm/data/deserializers/synthetic.py b/src/guidellm/data/deserializers/synthetic.py
index a97b010af..797a18527 100644
--- a/src/guidellm/data/deserializers/synthetic.py
+++ b/src/guidellm/data/deserializers/synthetic.py
@@ -105,6 +105,11 @@ def features(self) -> Features:
     def num_shards(self) -> int:
         return 1
 
+    @property
+    def n_shards(self) -> int:
+        """Alias for num_shards; required by HuggingFace datasets IterableDataset API."""
+        return self.num_shards
+
     def shuffle_data_sources(
         self,
         generator: np.random.Generator,  # noqa: ARG002
@@ -114,11 +119,10 @@ def shuffle_data_sources(
 
     def shard_data_sources(
         self,
-        num_shards: int,  # noqa: ARG002
-        index: int,  # noqa: ARG002
-        contiguous: bool = True,  # noqa: ARG002
+        worker_id: int,  # noqa: ARG002
+        num_workers: int,  # noqa: ARG002
     ) -> _SyntheticTextExamplesIterable:
-        """Return self since synthetic data generation is infinite and stateless."""
+        """HuggingFace API: return self since synthetic data is infinite and stateless."""
         return self
 
     def load_state_dict(self, state_dict: dict) -> None: