These are my notes from following Mosh's tutorial, updated for the 2025 versions of the libraries.
Save the model once it has been trained. Scikit-learn models rely heavily on NumPy arrays, and joblib is optimized to handle NumPy arrays. This save routine is commonly used for scikit-learn models such as DecisionTreeClassifier (used here), RandomForestClassifier, SVM, etc.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib
# Load the dataset: 'genre' is the label to predict, every other column is an input feature.
music_data = pd.read_csv('music.csv')
y = music_data['genre']
X = music_data.drop(columns=['genre'])

# Create and train the model in one step (fit() returns the fitted estimator).
# The features are passed as a plain array rather than a DataFrame, so the
# model is trained on the same kind of input as the plain-list samples used
# for prediction elsewhere in these notes.
model = DecisionTreeClassifier().fit(X.to_numpy(), y)

# Persist the trained model; kept as the last expression so the notebook
# echoes the list of written file names.
joblib.dump(model, 'music.joblib')
# OUTPUTS: the file `music.joblib` is then persisted in the same directory as the Jupyter notebook
# ['music.joblib']
# Load up the model and make a prediction
import joblib
# Restore the classifier previously saved to 'music.joblib'.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
model = joblib.load('music.joblib')

# One inner list per sample: [21, 1] is a 21 year old male.
predictions = model.predict([[21, 1]])

# Bare expression so the notebook displays the predicted genre.
predictions
# OUTPUTS: this is expected based on the music.csv file
# array(['HipHop'], dtype=object)
# We should allocate 70-80% of the data for training and use the remainder for testing
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the dataset: 'genre' is the label, the remaining columns are the features.
music_data = pd.read_csv('music.csv')
y = music_data['genre']
X = music_data.drop(columns=['genre'])

# Hold back 20% of the rows for testing and train on the other 80%.
#   X_train / X_test: input sets for training and testing
#   y_train / y_test: output sets for training and testing
#                     (y_test holds the expected values for the test inputs)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

# Create and train the model on the training portion only.
model = DecisionTreeClassifier().fit(X_train, y_train)

# Predict the held-back inputs and compare against the expected genres.
predictions = model.predict(X_test)
score = accuracy_score(y_test, predictions)

# Bare expression so the notebook displays the accuracy.
score
# OUTPUTS: this will change between runs because `train_test_split` selects random rows of the CSV and the sample size is only 18
# 1.0 -> means 100%
# 0.75 -> means 75%
# 0.5 -> means 50%
# Pass the entire dataset
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
# Load the dataset: 'genre' is the label, the remaining columns are the features.
music_data = pd.read_csv('music.csv')
y = music_data['genre']
X = music_data.drop(columns=['genre'])

# Create and train the model on the entire dataset (no train/test split).
# The features are passed as a plain array, matching the plain-list samples
# given to predict() below.
model = DecisionTreeClassifier().fit(X.to_numpy(), y)

# Samples to predict, one inner list per person:
#   [21, 1] is a 21 year old male
#   [22, 0] is a 22 year old female
predictions = model.predict([
    [21, 1],
    [22, 0],
])

# Bare expression so the notebook displays the predicted genres.
predictions
# OUTPUTS: this is expected based on the music.csv file
# array(['HipHop', 'Dance'], dtype=object)