From 1ef32b6b29669bad4604e1cbd63772184118ee37 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 6 Mar 2026 14:55:26 +0800 Subject: [PATCH 1/2] fix --- docs/source_en/Components/Dataset/Dataset.md | 6 +++++ .../Dataset.md" | 6 +++++ src/twinkle/dataset/base.py | 22 ++++++++++++++---- tests/dataset/test_data/1.lance | Bin 0 -> 575 bytes tests/dataset/test_data/lance/1.lance | Bin 0 -> 575 bytes tests/dataset/test_loading.py | 12 ++++++++++ 6 files changed, 41 insertions(+), 5 deletions(-) create mode 100644 tests/dataset/test_data/1.lance create mode 100644 tests/dataset/test_data/lance/1.lance diff --git a/docs/source_en/Components/Dataset/Dataset.md b/docs/source_en/Components/Dataset/Dataset.md index aea56968..1e896a11 100644 --- a/docs/source_en/Components/Dataset/Dataset.md +++ b/docs/source_en/Components/Dataset/Dataset.md @@ -55,6 +55,12 @@ from twinkle.dataset import Dataset, DatasetMeta dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=range(1500))) ``` +If using a local path or a local file, please follow these instructions: + +1. If you are using a local dataset file, pass a single file path (preferably an absolute path, to avoid relative-path errors); passing a list of files is not supported. +2. If you are using a local directory, please make sure that all files in it share the same data structure and the same file extension. +3. We use the `datasets` library for data loading; see the supported file extensions [here](https://huggingface.co/docs/hub/datasets-libraries). + 2. Setting template The Template component is responsible for converting string/image multimodal raw data into model input tokens. The dataset can set a Template to complete the `encode` process. 
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" index 780ddf17..322c0e34 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" @@ -55,6 +55,12 @@ from twinkle.dataset import Dataset, DatasetMeta dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=range(1500))) ``` +如果使用本地路径或本地文件,请遵循以下说明: + +1. 如果使用的是本地数据集文件,请传入单个文件路径(最好使用绝对路径以避免相对路径错误),不支持传入列表。 +2. 如果使用的是本地目录,请确保目录中的文件具有相同的数据结构和文件扩展名。 +3. 我们使用 `datasets` 库进行数据加载,支持的扩展名请查看[此处](https://huggingface.co/docs/hub/datasets-libraries)。 + 2. 设置 template Template 组件是负责将字符串/图片多模态原始数据转换为模型输入 token 的组件。数据集可以设置一个 Template 来完成 `encode` 过程。 diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py index 10b3bb92..8563edd0 100644 --- a/src/twinkle/dataset/base.py +++ b/src/twinkle/dataset/base.py @@ -120,15 +120,27 @@ def _load_dataset(dataset_meta: DatasetMeta, **kwargs): if os.path.exists(dataset_id): streaming = kwargs.get('streaming', False) num_proc = kwargs.get('num_proc', 1) - ext = os.path.splitext(dataset_id)[1].lstrip('.') - file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext if streaming: kwargs = {'split': 'train', 'streaming': True} else: kwargs = {'split': 'train', 'num_proc': num_proc} - if file_type == 'csv': - kwargs['na_filter'] = False - dataset = load_dataset(file_type, data_files=dataset_id, **kwargs) + if os.path.isdir(dataset_id): + folder_path = dataset_id + files = os.listdir(folder_path) + first_file = files[0] if files else None + ext = os.path.splitext(first_file)[1].lstrip('.') + file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext + if file_type == 'csv': + kwargs['na_filter'] = False + kwargs['path'] = ext + dataset = load_dataset(data_dir=dataset_id, 
**kwargs) + else: + ext = os.path.splitext(dataset_id)[1].lstrip('.') + file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext + if file_type == 'csv': + kwargs['na_filter'] = False + kwargs['path'] = ext + dataset = load_dataset(data_files=dataset_id, **kwargs) else: dataset = HubOperation.load_dataset(dataset_id, subset_name, split, **kwargs) diff --git a/tests/dataset/test_data/1.lance b/tests/dataset/test_data/1.lance new file mode 100644 index 0000000000000000000000000000000000000000..0203c29aed5dd29557c44905f443d9da1e6d9187 GIT binary patch literal 575 zcmZQ&fB+6C?Li`N%*jknb;?g7Q5#&j2-IFVGVSNGeA;iSND8WgJ)wWzr4U9sJ0zjLMgbcX!2-=KdD-$Cme5;|sTnpuALHP%u av=fwusecYt$Ib+i0C6EqW(FTeKW6~dc!bUX literal 0 HcmV?d00001 diff --git a/tests/dataset/test_data/lance/1.lance b/tests/dataset/test_data/lance/1.lance new file mode 100644 index 0000000000000000000000000000000000000000..0203c29aed5dd29557c44905f443d9da1e6d9187 GIT binary patch literal 575 zcmZQ&fB+6C?Li`N%*jknb;?g7Q5#&j2-IFVGVSNGeA;iSND8WgJ)wWzr4U9sJ0zjLMgbcX!2-=KdD-$Cme5;|sTnpuALHP%u av=fwusecYt$Ib+i0C6EqW(FTeKW6~dc!bUX literal 0 HcmV?d00001 diff --git a/tests/dataset/test_loading.py b/tests/dataset/test_loading.py index 1ce1bf09..34bdaf54 100644 --- a/tests/dataset/test_loading.py +++ b/tests/dataset/test_loading.py @@ -41,6 +41,18 @@ def test_load_local_json(self): assert dataset[0]['text'] == 'Hello world' assert dataset[0]['label'] == 0 + def test_load_local_lance(self): + """Test loading local Lance file""" + lance_path = str(TEST_DATA_DIR / '1.lance') + dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=lance_path)) + assert len(dataset) == 2 + + def test_load_local_lance_dir(self): + """Test loading local Lance dir""" + lance_path = str(TEST_DATA_DIR / 'lance') + dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=lance_path)) + assert len(dataset) == 2 + def test_load_local_jsonl(self): jsonl_path = str(TEST_DATA_DIR / 'test.jsonl') dataset = 
Dataset(dataset_meta=DatasetMeta(dataset_id=jsonl_path)) From a4398186081d1238a5442b6240db489ac1d64986 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 6 Mar 2026 14:57:40 +0800 Subject: [PATCH 2/2] fix --- src/twinkle/dataset/base.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py index 8563edd0..ae38f630 100644 --- a/src/twinkle/dataset/base.py +++ b/src/twinkle/dataset/base.py @@ -132,15 +132,13 @@ def _load_dataset(dataset_meta: DatasetMeta, **kwargs): file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext if file_type == 'csv': kwargs['na_filter'] = False - kwargs['path'] = ext - dataset = load_dataset(data_dir=dataset_id, **kwargs) + dataset = load_dataset(file_type, data_dir=dataset_id, **kwargs) else: ext = os.path.splitext(dataset_id)[1].lstrip('.') file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext if file_type == 'csv': kwargs['na_filter'] = False - kwargs['path'] = ext - dataset = load_dataset(data_files=dataset_id, **kwargs) + dataset = load_dataset(file_type, data_files=dataset_id, **kwargs) else: dataset = HubOperation.load_dataset(dataset_id, subset_name, split, **kwargs)