Merged
6 changes: 6 additions & 0 deletions docs/source_en/Components/Dataset/Dataset.md
@@ -55,6 +55,12 @@ from twinkle.dataset import Dataset, DatasetMeta
dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=range(1500)))
```

If you use a local path or a local file, follow these rules:

1. For a local dataset file, pass a single file path (preferably an absolute path, to avoid relative-path errors); passing a list of files is not supported.
2. For a local directory, make sure all files in it share the same data structure and the same file extension.
3. Data loading is delegated to the `datasets` library; see the supported extensions [here](https://huggingface.co/docs/hub/datasets-libraries).
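
The rules above can be sketched as a small pre-flight check (the helper `validate_local_dataset` is hypothetical, not part of the twinkle API; it only illustrates the constraints):

```python
import os

def validate_local_dataset(path: str) -> str:
    """Hypothetical helper sketching the local-path rules:
    a single file is passed as-is; a directory must contain files that
    all share one extension; returning an absolute path avoids
    relative-path errors."""
    if os.path.isdir(path):
        exts = {os.path.splitext(f)[1] for f in os.listdir(path)}
        if len(exts) > 1:
            raise ValueError(f'mixed file extensions in {path!r}: {sorted(exts)}')
    elif not os.path.isfile(path):
        raise FileNotFoundError(path)
    return os.path.abspath(path)
```

Running such a check before constructing `DatasetMeta` surfaces a mixed-extension directory early, instead of failing inside the `datasets` loader.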

2. Setting template

The Template component is responsible for converting string/image multimodal raw data into model input tokens. The dataset can set a Template to complete the `encode` process.
6 changes: 6 additions & 0 deletions docs/source_zh/组件/数据集/Dataset.md
@@ -55,6 +55,12 @@ from twinkle.dataset import Dataset, DatasetMeta
dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=range(1500)))
```

If you use a local path or a local file, follow these rules:

1. For a local dataset file, pass a single file path (preferably an absolute path, to avoid relative-path errors); passing a list is not supported.
2. For a local directory, make sure all files in it share the same data structure and the same file extension.
3. Data loading is delegated to the `datasets` library; see the supported extensions [here](https://huggingface.co/docs/hub/datasets-libraries).

2. Setting template

The Template component converts string/image multimodal raw data into model input tokens. A dataset can set a Template to complete the `encode` process.
20 changes: 15 additions & 5 deletions src/twinkle/dataset/base.py
@@ -120,15 +120,25 @@ def _load_dataset(dataset_meta: DatasetMeta, **kwargs):
     if os.path.exists(dataset_id):
         streaming = kwargs.get('streaming', False)
         num_proc = kwargs.get('num_proc', 1)
-        ext = os.path.splitext(dataset_id)[1].lstrip('.')
-        file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
         if streaming:
             kwargs = {'split': 'train', 'streaming': True}
         else:
             kwargs = {'split': 'train', 'num_proc': num_proc}
-        if file_type == 'csv':
-            kwargs['na_filter'] = False
-        dataset = load_dataset(file_type, data_files=dataset_id, **kwargs)
+        if os.path.isdir(dataset_id):
+            folder_path = dataset_id
+            files = os.listdir(folder_path)
+            first_file = files[0] if files else None
+            ext = os.path.splitext(first_file)[1].lstrip('.')
+            file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
+            if file_type == 'csv':
+                kwargs['na_filter'] = False
+            dataset = load_dataset(file_type, data_dir=dataset_id, **kwargs)
+        else:
+            ext = os.path.splitext(dataset_id)[1].lstrip('.')
+            file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
+            if file_type == 'csv':
+                kwargs['na_filter'] = False
+            dataset = load_dataset(file_type, data_files=dataset_id, **kwargs)
     else:
         dataset = HubOperation.load_dataset(dataset_id, subset_name, split, **kwargs)
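
The extension-to-loader mapping used in both branches of the hunk above can be read in isolation; a minimal sketch (the helper name `infer_file_type` is hypothetical, not part of twinkle) of what it does:

```python
import os

def infer_file_type(path: str) -> str:
    """Sketch of the mapping in _load_dataset: the `datasets` library
    names its loaders 'json' and 'text', so the 'jsonl' and 'txt'
    extensions are remapped; any other extension (csv, parquet,
    lance, ...) passes through unchanged as the loader name."""
    ext = os.path.splitext(path)[1].lstrip('.')
    return {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
```

For a directory, the PR applies this mapping to the first listed file, which is why all files in a local directory must share one extension.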

Binary file added tests/dataset/test_data/1.lance
Binary file added tests/dataset/test_data/lance/1.lance
12 changes: 12 additions & 0 deletions tests/dataset/test_loading.py
@@ -41,6 +41,18 @@ def test_load_local_json(self):
         assert dataset[0]['text'] == 'Hello world'
         assert dataset[0]['label'] == 0
 
+    def test_load_local_lance(self):
+        """Test loading local Lance file"""
+        lance_path = str(TEST_DATA_DIR / '1.lance')
+        dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=lance_path))
+        assert len(dataset) == 2
+
+    def test_load_local_lance_dir(self):
+        """Test loading local Lance dir"""
+        lance_path = str(TEST_DATA_DIR / 'lance')
+        dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=lance_path))
+        assert len(dataset) == 2
+
     def test_load_local_jsonl(self):
         jsonl_path = str(TEST_DATA_DIR / 'test.jsonl')
         dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=jsonl_path))