-
Notifications
You must be signed in to change notification settings - Fork 277
Expand file tree
/
Copy pathdataset_downloader.py
More file actions
89 lines (75 loc) · 3.11 KB
/
dataset_downloader.py
File metadata and controls
89 lines (75 loc) · 3.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
"""
GTZAN Dataset Setup Utility
----------------------------------
This script automates the acquisition and organization of the GTZAN dataset
for the TSOAI (The Sound of AI) courses.
Functionality:
1. Downloads the latest dataset version directly from Hugging Face.
2. Extracts the compressed archive into the local project structure.
3. Sanitizes the dataset by removing hidden MacOS metadata artifacts
(._ files) to prevent processing errors in librosa.
4. Performs automatic cleanup of temporary download files.
"""
import os
import requests
import tarfile
from pathlib import Path
URL = "https://huggingface.co/datasets/marsyas/gtzan/resolve/main/data/genres.tar.gz"
# Anchors everything to the directory of the current file
BASE_DIR = Path(__file__).resolve().parent
DATASET_PATH = BASE_DIR / "GTZAN_dataset" # Destination folder the archive is extracted into
DATASET_FILE = BASE_DIR / "genres.tar.gz" # Temporary gzip-compressed tar archive (deleted after extraction)
def clean_macos_artifacts(folder_path):
    """Delete every hidden MacOS metadata file (name starting with '._')
    found anywhere under *folder_path*, reporting how many were removed."""
    print(f"\n3️⃣ Cleaning MacOS metadata artifacts in {folder_path} ...")
    removed = 0
    for directory, _, filenames in os.walk(folder_path):
        for name in filenames:
            # Skip everything that is not an AppleDouble '._' ghost file.
            if not name.startswith("._"):
                continue
            target = os.path.join(directory, name)
            try:
                os.remove(target)
                removed += 1
            except OSError as err:
                print(f" Failed to remove {name}: {err}")
    if removed:
        print(f" 🧹 Removed {removed} hidden '._' ghost files.")
    else:
        print(" ✨ No artifacts found. Directory is clean.")
def download_direct():
    """Download, extract and clean the GTZAN dataset.

    Steps:
        1. Stream the tar.gz archive from Hugging Face into DATASET_FILE.
        2. Extract it into DATASET_PATH.
        3. Remove hidden MacOS '._' metadata artifacts.
        4. Delete the downloaded archive.

    On a download or extraction failure it prints the error and returns
    early; nothing is raised to the caller.
    """
    # Resolved once so the user sees the real on-disk destination.
    absolute_path = os.path.abspath(DATASET_PATH)
    print()
    print("="*80)
    print("Starting automated dataset downloader script for GTZAN for TSOAI courses!")
    print("="*80)

    # 1. Download — streamed so the large archive never sits fully in RAM.
    print(f"\n1️⃣ Downloading dataset from Hugging Face.…\n⏳ Please wait until it's finished, it could take a while :)")
    try:
        # timeout prevents hanging forever on a stalled connection; the
        # context manager guarantees the connection is released.
        with requests.get(URL, stream=True, timeout=30) as response:
            response.raise_for_status()  # Check for download errors
            with open(DATASET_FILE, 'wb') as f:
                # 64 KiB chunks: far fewer write calls than 1 KiB chunks.
                for chunk in response.iter_content(chunk_size=65536):
                    if chunk:
                        f.write(chunk)
        print("☑️ Completed dataset download as: ", DATASET_FILE)
    except Exception as e:
        print(f"❌ Error downloading file: {e}")
        return

    # 2. Extract
    print(f"\n2️⃣ Extracting files into destination folder …\n {absolute_path}")
    try:
        with tarfile.open(DATASET_FILE, "r:gz") as tar:
            try:
                # The 'data' filter (Python 3.12+, backported to security
                # releases) rejects path-traversal members in the archive.
                tar.extractall(path=DATASET_PATH, filter="data")
            except TypeError:
                # Older interpreter without the `filter` keyword.
                tar.extractall(path=DATASET_PATH)
    except Exception as e:
        print(f"❌ Error extracting file: {e}")
        return

    # NOTE(review): clean_macos_artifacts prints its own "3️⃣" banner, so the
    # step numbering below overlaps — kept as-is to preserve known output flow.
    print(f"\n3️⃣ Finishing cleaning process …")
    # 3. Cleanup MacOS artifacts
    clean_macos_artifacts(DATASET_PATH)

    # 4. Remove the archive now that its contents are safely on disk.
    if os.path.exists(DATASET_FILE):
        os.remove(DATASET_FILE)
    print("\n","="*80)
    print("✅ Dataset was downloaded successfully.\n\n")
# Run the full download → extract → clean pipeline when executed as a script.
if __name__ == "__main__":
    download_direct()