From 1ef32b6b29669bad4604e1cbd63772184118ee37 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 6 Mar 2026 14:55:26 +0800
Subject: [PATCH 1/2] fix

---
 docs/source_en/Components/Dataset/Dataset.md  |   6 +++++
 .../Dataset.md"                               |   6 +++++
 src/twinkle/dataset/base.py                   |  22 ++++++++++++++----
 tests/dataset/test_data/1.lance               | Bin 0 -> 575 bytes
 tests/dataset/test_data/lance/1.lance         | Bin 0 -> 575 bytes
 tests/dataset/test_loading.py                 |  12 ++++++++++
 6 files changed, 41 insertions(+), 5 deletions(-)
 create mode 100644 tests/dataset/test_data/1.lance
 create mode 100644 tests/dataset/test_data/lance/1.lance

diff --git a/docs/source_en/Components/Dataset/Dataset.md b/docs/source_en/Components/Dataset/Dataset.md
index aea56968..1e896a11 100644
--- a/docs/source_en/Components/Dataset/Dataset.md
+++ b/docs/source_en/Components/Dataset/Dataset.md
@@ -55,6 +55,12 @@ from twinkle.dataset import Dataset, DatasetMeta
 dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=range(1500)))
 ```
 
+If using a local path or a local file, please follow these instructions:
+
+1. If you are using a local dataset file, pass a single file path (preferably an absolute path, to avoid relative-path errors); passing a list of paths is not supported.
+2. If you are using a local directory, make sure all files in it share the same data structure and the same file extension.
+3. We use the `datasets` library for data loading; see the supported file extensions [here](https://huggingface.co/docs/hub/datasets-libraries).
+
 2. Setting template
 
 The Template component is responsible for converting string/image multimodal raw data into model input tokens. The dataset can set a Template to complete the `encode` process.
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md"
index 780ddf17..322c0e34 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md"
@@ -55,6 +55,12 @@ from twinkle.dataset import Dataset, DatasetMeta
 dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=range(1500)))
 ```
 
+如果使用本地路径或本地文件，请遵循以下说明：
+
+1. 如果使用的是本地数据集文件，请传入单个文件路径（最好使用绝对路径以避免相对路径错误），不支持传入列表。
+2. 如果使用的是本地目录，请确保目录中的文件具有相同的数据结构和文件扩展名。
+3. 我们使用 `datasets` 库进行数据加载，支持的扩展名请查看[此处](https://huggingface.co/docs/hub/datasets-libraries)。
+
 2. 设置 template
 
 Template 组件是负责将字符串/图片多模态原始数据转换为模型输入 token 的组件。数据集可以设置一个 Template 来完成 `encode` 过程。
diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index 10b3bb92..8563edd0 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -120,15 +120,27 @@ def _load_dataset(dataset_meta: DatasetMeta, **kwargs):
             if os.path.exists(dataset_id):
                 streaming = kwargs.get('streaming', False)
                 num_proc = kwargs.get('num_proc', 1)
-                ext = os.path.splitext(dataset_id)[1].lstrip('.')
-                file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
                 if streaming:
                     kwargs = {'split': 'train', 'streaming': True}
                 else:
                     kwargs = {'split': 'train', 'num_proc': num_proc}
-                if file_type == 'csv':
-                    kwargs['na_filter'] = False
-                dataset = load_dataset(file_type, data_files=dataset_id, **kwargs)
+                if os.path.isdir(dataset_id):
+                    folder_path = dataset_id
+                    files = os.listdir(folder_path)
+                    first_file = files[0] if files else None
+                    ext = os.path.splitext(first_file)[1].lstrip('.')
+                    file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
+                    if file_type == 'csv':
+                        kwargs['na_filter'] = False
+                    kwargs['path'] = ext
+                    dataset = load_dataset(data_dir=dataset_id, **kwargs)
+                else:
+                    ext = os.path.splitext(dataset_id)[1].lstrip('.')
+                    file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
+                    if file_type == 'csv':
+                        kwargs['na_filter'] = False
+                    kwargs['path'] = ext
+                    dataset = load_dataset(data_files=dataset_id, **kwargs)
             else:
                 dataset = HubOperation.load_dataset(dataset_id, subset_name, split, **kwargs)
 
diff --git a/tests/dataset/test_data/1.lance b/tests/dataset/test_data/1.lance
new file mode 100644
index 0000000000000000000000000000000000000000..0203c29aed5dd29557c44905f443d9da1e6d9187
GIT binary patch
literal 575
zcmZQ&fB+6C?Li`N%*jknb;?g7Q5#&j2-IFVGVSNG<B}C($xF;lRrn7Bj9P5PB}JKe
z=?07zOh9So#Pn1NMr3)`%)Ama6QDGs027y{kUEztm%M&XVqS8pUTR))ehN@;v7U2&
zPHAqQE0iz9#Kj=w$Hm0pAjBlVA;F~NB;)|JTn<B<V^L9JC0v(`krWq?5EmB*7aJD~
zhl3D<5Rk>eA;iSND8WgJ)wWzr4U9sJ0zjLMgbcX!2-=KdD-$Cme5;|sTnpuALHP%u
av=fwusecYt$Ib+i0C6EqW(FTeKW6~dc!bUX

literal 0
HcmV?d00001

diff --git a/tests/dataset/test_data/lance/1.lance b/tests/dataset/test_data/lance/1.lance
new file mode 100644
index 0000000000000000000000000000000000000000..0203c29aed5dd29557c44905f443d9da1e6d9187
GIT binary patch
literal 575
zcmZQ&fB+6C?Li`N%*jknb;?g7Q5#&j2-IFVGVSNG<B}C($xF;lRrn7Bj9P5PB}JKe
z=?07zOh9So#Pn1NMr3)`%)Ama6QDGs027y{kUEztm%M&XVqS8pUTR))ehN@;v7U2&
zPHAqQE0iz9#Kj=w$Hm0pAjBlVA;F~NB;)|JTn<B<V^L9JC0v(`krWq?5EmB*7aJD~
zhl3D<5Rk>eA;iSND8WgJ)wWzr4U9sJ0zjLMgbcX!2-=KdD-$Cme5;|sTnpuALHP%u
av=fwusecYt$Ib+i0C6EqW(FTeKW6~dc!bUX

literal 0
HcmV?d00001

diff --git a/tests/dataset/test_loading.py b/tests/dataset/test_loading.py
index 1ce1bf09..34bdaf54 100644
--- a/tests/dataset/test_loading.py
+++ b/tests/dataset/test_loading.py
@@ -41,6 +41,18 @@ def test_load_local_json(self):
         assert dataset[0]['text'] == 'Hello world'
         assert dataset[0]['label'] == 0
 
+    def test_load_local_lance(self):
+        """Test loading local Lance file"""
+        lance_path = str(TEST_DATA_DIR / '1.lance')
+        dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=lance_path))
+        assert len(dataset) == 2
+
+    def test_load_local_lance_dir(self):
+        """Test loading local Lance dir"""
+        lance_path = str(TEST_DATA_DIR / 'lance')
+        dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=lance_path))
+        assert len(dataset) == 2
+
     def test_load_local_jsonl(self):
         jsonl_path = str(TEST_DATA_DIR / 'test.jsonl')
         dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=jsonl_path))

From a4398186081d1238a5442b6240db489ac1d64986 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 6 Mar 2026 14:57:40 +0800
Subject: [PATCH 2/2] fix

---
 src/twinkle/dataset/base.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index 8563edd0..ae38f630 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -132,15 +132,13 @@ def _load_dataset(dataset_meta: DatasetMeta, **kwargs):
                     file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
                     if file_type == 'csv':
                         kwargs['na_filter'] = False
-                    kwargs['path'] = ext
-                    dataset = load_dataset(data_dir=dataset_id, **kwargs)
+                    dataset = load_dataset(file_type, data_dir=dataset_id, **kwargs)
                 else:
                     ext = os.path.splitext(dataset_id)[1].lstrip('.')
                     file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
                     if file_type == 'csv':
                         kwargs['na_filter'] = False
-                    kwargs['path'] = ext
-                    dataset = load_dataset(data_files=dataset_id, **kwargs)
+                    dataset = load_dataset(file_type, data_files=dataset_id, **kwargs)
             else:
                 dataset = HubOperation.load_dataset(dataset_id, subset_name, split, **kwargs)