-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataset_Build.py
More file actions
57 lines (43 loc) · 1.81 KB
/
Dataset_Build.py
File metadata and controls
57 lines (43 loc) · 1.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Imports
import hdf5
import os,hashlib
import pandas as pd
# Root directory of the dataset; the directory walk further down scans it
# recursively for .h5 files.
# NOTE(review): `dir` shadows the builtin dir(); the name is kept because the
# walk below reads it.
dir: str = "MillionSongSubset"
# Accumulator filled by the load loop: running row number -> dict of one
# song's fields. It is later turned into the SongData DataFrame.
CSV: dict = {}
# Convert an opened HDF5 (.h5) song file into a plain Python dict
def get_song_info(file):
    """Collect one song's fields from an open HDF5 handle into a flat dict.

    Column names are taken from the ``coldescrs`` of the two tables returned
    by ``hdf5.get_songs`` and ``hdf5.get_song_analysis``. The value for each
    column comes from the matching ``hdf5.get_<column>`` accessor, called
    with ``file``.

    Args:
        file: Handle accepted by the ``hdf5`` accessor functions (the script
            passes the result of ``hdf5.open_h5_file_read``).

    Returns:
        dict mapping column name to the value its accessor returned. Columns
        of the first table come first, followed by the analysis columns.
    """
    # Columns of the first table that are never read.
    # NOTE(review): presumably hdf5 has no usable getter for these - confirm.
    skipped = ("analyzer_version", "genre")

    def read(column):
        # Resolve the accessor by name, e.g. "tempo" -> hdf5.get_tempo(file).
        return getattr(hdf5, "get_" + column)(file)

    songs_table = hdf5.get_songs(file)
    analysis_table = hdf5.get_song_analysis(file)

    metadata_columns = [
        column for column in songs_table.coldescrs.keys() if column not in skipped
    ]
    analysis_columns = list(analysis_table.coldescrs.keys())

    info = {column: read(column) for column in metadata_columns}
    # Analysis values are written second, so they win on any shared name.
    info.update((column, read(column)) for column in analysis_columns)
    return info
# Gather the path of every .h5 file found anywhere under the data root.
# NOTE(review): os.walk yields entries in filesystem order, so the fixed slice
# taken from this list later may select different songs on another machine.
# Consider sorting if the output must be reproducible.
path_list = []
for subdir, _dirs, names in os.walk(dir):
    path_list.extend(
        subdir + os.sep + name for name in names if name.endswith(".h5")
    )
# Load a fixed window of the discovered files (entries 1000..2999 of
# path_list) into CSV, keyed by a running row number starting at 0.
count = 0
for path in path_list[1000:3000]:
    file = hdf5.open_h5_file_read(path)
    try:
        song = get_song_info(file)
    finally:
        # Release the handle even when extraction fails, so a bad file does
        # not leave it open.
        file.close()
    CSV[count] = song
    count += 1
# CSV maps row number -> field dict, so the raw frame has one column per song.
# Transposing gives one row per song and one column per field.
SongData = pd.DataFrame(CSV).transpose()
print(SongData.head())

# Columns removed up front so later processing has less data to carry around.
dropped_columns = [
    'danceability',
    'artist_latitude',
    'artist_longitude',
    'energy',
    'artist_location',
    'artist_id',
    'artist_mbid',
    'idx_artist_terms',
    'idx_similar_artists',
    'artist_7digitalid',
    'title',
    'track_7digitalid',
    'artist_playmeid',
    'release',
    'release_7digitalid',
    'analysis_sample_rate',
    'audio_md5',
    'track_id',
    'artist_name',
    'song_id',
    'idx_sections_confidence',
    'idx_sections_start',
    'idx_bars_confidence',
    'idx_bars_start',
    'idx_beats_confidence',
    'idx_beats_start',
    'idx_segments_pitches',
    'idx_segments_timbre',
    'idx_segments_loudness_max_time',
    'idx_tatums_confidence',
    'idx_tatums_start',
]
SongData = SongData.drop(columns=dropped_columns)

# Persist the trimmed table for later steps.
SongData.to_pickle('SongSmall.pkl')