# preprocessing_tracks_metadata.py -- from the mohammad-malik/project-spotify repository (main branch)
import ast

import pandas as pd


# Columns of the raw tracks CSV that are carried into the cleaned output.
_TRACK_COLUMNS = [
    "track_id",
    "track_title",
    "track_url",
    "artist_name",
    "album_title",
    "track_duration",
    "track_date_created",
    "track_genres",
]


def _load_track_ids(files_in_dataset_path):
    """Return the set of integer track ids listed one per line as ``<id>.mp3``."""
    with open(files_in_dataset_path, "r") as f:
        names = (line.strip() for line in f)
        # Skip blank lines (e.g. a stray empty line at the end of the file)
        # instead of crashing in int('').
        return {int(name.removesuffix(".mp3")) for name in names if name}


def _duration_to_seconds(value):
    """Convert a colon-separated duration string to whole seconds.

    Each field is worth 60x the one to its right, so "SS", "MM:SS" and
    "HH:MM:SS" are all handled (the three-field form is assumed to be
    hours:minutes:seconds). Non-string values (e.g. NaN for a missing cell)
    map to 0.
    """
    if not isinstance(value, str):
        return 0
    seconds = 0
    for part in value.split(":"):
        seconds = seconds * 60 + int(part)
    return seconds


def _genre_titles(value):
    """Join the ``genre_title`` of every genre in a serialized list of dicts.

    Returns None for non-string values (e.g. NaN for a missing cell).
    """
    if not isinstance(value, str):
        return None
    # SECURITY: the cell comes straight from the CSV, so it is parsed with
    # ast.literal_eval (Python literals only) rather than eval(), which would
    # execute any expression embedded in the data.
    return ", ".join(g["genre_title"] for g in ast.literal_eval(value))


def preprocess_data(file_path, files_in_dataset_path,
                    output_path="cleaned_tracks.csv"):
    """Clean the raw track metadata and write it out as CSV.

    Steps: keep only the rows whose ``track_id`` appears in the file list,
    strip punctuation from titles, normalise the creation date to
    ``YYYY-MM-DD HH:MM:SS``, convert durations to seconds and flatten the
    genre list to a comma-separated string of genre titles.

    Args:
        file_path: Path of the raw tracks CSV; it must contain the columns
            listed in ``_TRACK_COLUMNS``.
        files_in_dataset_path: Path of a text file with one ``<track_id>.mp3``
            entry per line.
        output_path: Where the cleaned CSV is written. Defaults to
            ``"cleaned_tracks.csv"`` in the current working directory.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        ValueError: If a file-list entry or a duration field is not an
            integer, or a genre cell is not a valid Python literal.
        SyntaxError: If a genre cell cannot be parsed at all.
    """
    files_in_dataset = _load_track_ids(files_in_dataset_path)

    df = pd.read_csv(file_path, usecols=_TRACK_COLUMNS)

    # Only keep records where the track_id is in the "files_in_dataset" list.
    # .copy() makes the result an independent frame so the column assignments
    # below do not trigger SettingWithCopyWarning.
    df = df[df["track_id"].isin(files_in_dataset)].copy()

    # Remove the characters ' , ( ) " \ and - from the track_title.
    df["track_title"] = df["track_title"].str.replace(
        r"[',()\"\\-]",
        "",
        regex=True
    )

    df["track_date_created"] = pd.to_datetime(
        df["track_date_created"]
    ).dt.strftime("%Y-%m-%d %H:%M:%S")

    df["track_duration"] = df["track_duration"].apply(_duration_to_seconds)
    df["track_genres"] = df["track_genres"].apply(_genre_titles)

    df.to_csv(output_path, index=False)
    return df


# Script body (runs on import as well): clean the raw metadata export and
# report how many tracks remain after filtering.
tracks_df = preprocess_data(
    file_path="raw_tracks.csv",
    files_in_dataset_path="files_in_dataset.txt",
)
print(f"Tracks found: {len(tracks_df)}")