diff --git a/docs/source_en/Components/Dataset/Dataset.md b/docs/source_en/Components/Dataset/Dataset.md index aea56968..1e896a11 100644 --- a/docs/source_en/Components/Dataset/Dataset.md +++ b/docs/source_en/Components/Dataset/Dataset.md @@ -55,6 +55,12 @@ from twinkle.dataset import Dataset, DatasetMeta dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=range(1500))) ``` +If using a local path or a local file, please follow these instructions: + +1. If you are using a local dataset file, pass a single file path (preferably an absolute path, to avoid relative-path errors); a list of paths is not supported. +2. If you are using a local directory, please make sure all files in it share the same data structure and file extension. +3. We use the `datasets` library for data loading; check the supported extensions [here](https://huggingface.co/docs/hub/datasets-libraries). + 2. Setting template The Template component is responsible for converting string/image multimodal raw data into model input tokens. The dataset can set a Template to complete the `encode` process. diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" index 780ddf17..322c0e34 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" "+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" @@ -55,6 +55,12 @@ from twinkle.dataset import Dataset, DatasetMeta dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=range(1500))) ``` +如果使用本地路径或本地文件,请遵循以下说明: + +1. 如果使用的是本地数据集文件,请传入单个文件路径(最好使用绝对路径以避免相对路径错误),不支持传入列表。 +2. 如果使用的是本地目录,请确保目录中的文件具有相同的数据结构和文件扩展名。 +3. 我们使用 `datasets` 库进行数据加载,支持的扩展名请查看[此处](https://huggingface.co/docs/hub/datasets-libraries)。 + 2. 
设置 template Template 组件是负责将字符串/图片多模态原始数据转换为模型输入 token 的组件。数据集可以设置一个 Template 来完成 `encode` 过程。 diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py index 10b3bb92..ae38f630 100644 --- a/src/twinkle/dataset/base.py +++ b/src/twinkle/dataset/base.py @@ -120,15 +120,25 @@ def _load_dataset(dataset_meta: DatasetMeta, **kwargs): if os.path.exists(dataset_id): streaming = kwargs.get('streaming', False) num_proc = kwargs.get('num_proc', 1) - ext = os.path.splitext(dataset_id)[1].lstrip('.') - file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext if streaming: kwargs = {'split': 'train', 'streaming': True} else: kwargs = {'split': 'train', 'num_proc': num_proc} - if file_type == 'csv': - kwargs['na_filter'] = False - dataset = load_dataset(file_type, data_files=dataset_id, **kwargs) + if os.path.isdir(dataset_id): + folder_path = dataset_id + files = os.listdir(folder_path) + first_file = files[0] if files else None + ext = os.path.splitext(first_file)[1].lstrip('.') + file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext + if file_type == 'csv': + kwargs['na_filter'] = False + dataset = load_dataset(file_type, data_dir=dataset_id, **kwargs) + else: + ext = os.path.splitext(dataset_id)[1].lstrip('.') + file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext + if file_type == 'csv': + kwargs['na_filter'] = False + dataset = load_dataset(file_type, data_files=dataset_id, **kwargs) else: dataset = HubOperation.load_dataset(dataset_id, subset_name, split, **kwargs) diff --git a/tests/dataset/test_data/1.lance b/tests/dataset/test_data/1.lance new file mode 100644 index 00000000..0203c29a Binary files /dev/null and b/tests/dataset/test_data/1.lance differ diff --git a/tests/dataset/test_data/lance/1.lance b/tests/dataset/test_data/lance/1.lance new file mode 100644 index 00000000..0203c29a Binary files /dev/null and b/tests/dataset/test_data/lance/1.lance differ diff --git a/tests/dataset/test_loading.py b/tests/dataset/test_loading.py index 
1ce1bf09..34bdaf54 100644 --- a/tests/dataset/test_loading.py +++ b/tests/dataset/test_loading.py @@ -41,6 +41,18 @@ def test_load_local_json(self): assert dataset[0]['text'] == 'Hello world' assert dataset[0]['label'] == 0 + def test_load_local_lance(self): + """Test loading local Lance file""" + lance_path = str(TEST_DATA_DIR / '1.lance') + dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=lance_path)) + assert len(dataset) == 2 + + def test_load_local_lance_dir(self): + """Test loading local Lance dir""" + lance_path = str(TEST_DATA_DIR / 'lance') + dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=lance_path)) + assert len(dataset) == 2 + def test_load_local_jsonl(self): jsonl_path = str(TEST_DATA_DIR / 'test.jsonl') dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=jsonl_path))